diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db1fc30111a2d..149acef72db26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,9 @@ on: push: branches: master pull_request: - branches: master + branches: + - master + - 1.1.x env: ENV_FILE: environment.yml diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index dc6f45f810f3d..e0a2257b0ca1f 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -6,7 +6,7 @@ from .pandas_vb_common import tm try: - from pandas.tseries.offsets import Nano, Hour + from pandas.tseries.offsets import Hour, Nano except ImportError: # For compatibility with older versions from pandas.core.datetools import * # noqa diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index e266d871f5bc6..5d9070de92ec7 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -7,14 +7,14 @@ try: from pandas import ( - rolling_median, + rolling_kurt, + rolling_max, rolling_mean, + rolling_median, rolling_min, - rolling_max, - rolling_var, rolling_skew, - rolling_kurt, rolling_std, + rolling_var, ) have_rolling_methods = True diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index ec3eddfff7184..5390056ba36f2 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,8 +2,8 @@ try: from pandas._libs.tslibs.parsing import ( - concat_date_cols, _does_string_look_like_datetime, + concat_date_cols, ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index 7d4e0556f4d96..9a206410d8775 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -1,5 +1,5 @@ try: - from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized + from pandas._libs.tslibs import is_date_array_normalized, normalize_i8_timestamps except ImportError: from pandas._libs.tslibs.conversion import ( normalize_i8_timestamps, diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e45cafc02cb61..113ad3e338952 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,9 +1,11 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - master +- 1.1.x pr: - master +- 1.1.x variables: PYTEST_WORKERS: auto diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7b12de387d648..816bb23865c04 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" + ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else @@ -230,6 +230,11 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + # https://github.com/python/mypy/issues/7384 + # MSG='Check for missing error codes with # type: ignore' ; echo $MSG + # invgrep -R --include="*.py" -P '# type: ignore(?!\[)' pandas + # RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of foo.__class__ instead of type(foo)' ; echo 
$MSG invgrep -R --include=*.{py,pyx} '\.__class__' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index a9b9a5a47ccf5..3034ed3dc43af 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b85e9403038ab..4ffd1d586a99a 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -153,14 +153,38 @@ to build the documentation locally before pushing your changes. Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ -Instead of manually setting up a development environment, you can use Docker to -automatically create the environment with just several commands. Pandas provides a `DockerFile` -in the root directory to build a Docker image with a full pandas development environment. +Instead of manually setting up a development environment, you can use `Docker +`_ to automatically create the environment with just several +commands. Pandas provides a `DockerFile` in the root directory to build a Docker image +with a full pandas development environment. -Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code, +**Docker Commands** + +Pass your GitHub username in the `DockerFile` to use your own fork:: + + # Build the image pandas-yourname-env + docker build --tag pandas-yourname-env . + # Run a container and bind your local forked repo, pandas-yourname, to the container + docker run -it --rm -v path-to-pandas-yourname:/home/pandas-yourname pandas-yourname-env + +Even easier, you can integrate Docker with the following IDEs: + +**Visual Studio Code** + +You can use the DockerFile to launch a remote session with Visual Studio Code, a popular free IDE, using the `.devcontainer.json` file. See https://code.visualstudio.com/docs/remote/containers for details. +**PyCharm (Professional)** + +Enable Docker support and use the Services tool window to build and manage images as well as +run and interact with containers. +See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Note that you might need to rebuild the C extensions if/when you merge with upstream/master using:: + + python setup.py build_ext --inplace -j 4 + .. _contributing.dev_c: Installing a C compiler @@ -751,7 +775,7 @@ Imports are alphabetically sorted within these sections. As part of :ref:`Continuous Integration ` checks we run:: - isort --recursive --check-only pandas + isort --check-only pandas to check that imports are correctly formatted as per the `setup.cfg`. @@ -770,8 +794,6 @@ You should run:: to automatically format imports correctly. This will modify your local copy of the files. -The `--recursive` flag can be passed to sort all files in a directory. - Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: git diff upstream/master --name-only -- "*.py" | xargs -r isort diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index b02d4abd3ddf8..de231e43918f8 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -80,6 +80,11 @@ ML pipeline. Featuretools is a Python library for automated feature engineering built on top of pandas. 
It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. +`Compose `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. + .. _ecosystem.visualization: Visualization @@ -445,6 +450,7 @@ Library Accessor Classes Description `pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. `pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. `pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. +`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. =============== ========== ========================= =============================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest @@ -453,3 +459,4 @@ Library Accessor Classes Description .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas +.. _composeml: https://github.com/FeatureLabs/compose diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 360a14998b227..23bd44c1969a5 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -397,6 +397,32 @@ The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided). + +.. _basics.dataframe.from_list_namedtuples: + +From a list of namedtuples +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The field names of the first ``namedtuple`` in the list determine the columns +of the ``DataFrame``. The remaining namedtuples (or tuples) are simply unpacked +and their values are fed into the rows of the ``DataFrame``. If any of those +tuples is shorter than the first ``namedtuple`` then the later columns in the +corresponding row are marked as missing values. If any are longer than the +first ``namedtuple``, a ``ValueError`` is raised. + +.. ipython:: python + + from collections import namedtuple + + Point = namedtuple('Point', 'x y') + + pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]) + + Point3D = namedtuple('Point3D', 'x y z') + + pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]) + + .. _basics.dataframe.from_list_dataclasses: From a list of dataclasses diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 6843dd1eadc81..cac18f5bf39cd 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1532,12 +1532,8 @@ Setting metadata ~~~~~~~~~~~~~~~~ Indexes are "mostly immutable", but it is possible to set and change their -metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and -``codes``). - -You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes`` -to set these attributes directly. They default to returning a copy; however, -you can specify ``inplace=True`` to have the data change in place. 
+``name`` attribute. You can use ``rename`` and ``set_names`` to set these attributes +directly; they default to returning a copy. See :ref:`Advanced Indexing ` for usage of MultiIndexes. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d4be9d802d697..ab233f653061a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1064,6 +1064,23 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: pd.read_csv('tmp.csv', parse_dates=[0]) pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) +Writing CSVs to binary file objects ++++++++++++++++++++++++++++++++++++ + +.. versionadded:: 1.2.0 + +``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object +opened in binary mode. For this to work, it is necessary that ``mode`` +contain a "b": + +.. ipython:: python + + import io + + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + .. _io.float_precision: Specifying method for floating-point conversion @@ -3441,10 +3458,11 @@ for some advanced strategies .. warning:: - pandas requires ``PyTables`` >= 3.0.0. - There is a indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index. - If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. - Stores created previously will need to be rewritten using the updated version. + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle. Loading pickled data received from + untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 75949a90d09a6..66d3ab3305565 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0220: -v0.22.0 (December 29, 2017) ---------------------------- +Version 0.22.0 (December 29, 2017) +---------------------------------- {{ header }} @@ -96,7 +96,7 @@ returning ``1`` instead. These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well. Finally, a few less obvious places in pandas are affected by this change. -Grouping by a categorical +Grouping by a Categorical ^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a ``Categorical`` and summing now returns ``0`` instead of diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index b9e1b5060d1da..f91d89679dad1 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -86,8 +86,8 @@ Please note that the string `index` is not supported with the round trip format, .. _whatsnew_0230.enhancements.assign_dependent: -``.assign()`` accepts dependent arguments -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Method ``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 `_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the @@ -244,7 +244,7 @@ documentation. If you build an extension array, publicize it on our ..
_whatsnew_0230.enhancements.categorical_grouping: -New ``observed`` keyword for excluding unobserved categories in ``groupby`` +New ``observed`` keyword for excluding unobserved categories in ``GroupBy`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a categorical includes the unobserved categories in the output. @@ -360,8 +360,8 @@ Fill all consecutive outside values in both directions .. _whatsnew_0210.enhancements.get_dummies_dtype: -``get_dummies`` now supports ``dtype`` argument -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Function ``get_dummies`` now supports ``dtype`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) @@ -388,8 +388,8 @@ See the :ref:`documentation here `. (:issue:`19365`) .. _whatsnew_0230.enhancements.ran_inf: -``.rank()`` handles ``inf`` values when ``NaN`` are present -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Method ``.rank()`` handles ``inf`` values when ``NaN`` are present +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their ranks. Now ranks are calculated properly. (:issue:`6945`) @@ -587,7 +587,7 @@ If installed, we now require: .. _whatsnew_0230.api_breaking.dict_insertion_order: -Instantiation from dicts preserves dict insertion order for python 3.6+ +Instantiation from dicts preserves dict insertion order for Python 3.6+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Until Python 3.6, dicts in Python had no formally defined ordering. For Python @@ -1365,8 +1365,8 @@ MultiIndex - Bug in indexing where nested indexers having only numpy arrays are handled incorrectly (:issue:`19686`) -I/O -^^^ +IO +^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) - :meth:`DataFrame.to_html` now has an option to add an id to the leading `` tag (:issue:`8496`) @@ -1403,7 +1403,7 @@ Plotting - :func:`DataFrame.plot` now supports multiple columns to the ``y`` argument (:issue:`19699`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 45399792baecf..5bfaa7a5a3e6b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -277,8 +277,8 @@ For earlier versions this can be done using the following. .. _whatsnew_0240.enhancements.read_html: -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Function ``read_html`` enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. Now it understands them, treating them as sequences of cells with the same @@ -1371,7 +1371,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). .. 
_whatsnew_0240.deprecations.integer_tz: -Passing integer data and a timezone to datetimeindex +Passing integer data and a timezone to DatetimeIndex ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The behavior of :class:`DatetimeIndex` when passed integer data and @@ -1769,8 +1769,8 @@ MultiIndex - :class:`MultiIndex` has gained the :meth:`MultiIndex.from_frame`, it allows constructing a :class:`MultiIndex` object from a :class:`DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) -I/O -^^^ +IO +^^ - Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) - Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`) @@ -1827,7 +1827,7 @@ Plotting - Bug in :func:`DataFrame.plot.bar` caused bars to use multiple colors instead of a single one (:issue:`20585`) - Bug in validating color parameter caused extra color to be appended to the given color array. This happened to multiple plotting functions using matplotlib. (:issue:`20726`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index d1a893f99cff4..27e84bf0a7cd7 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -51,7 +51,6 @@ Bug fixes - Bug where calling :meth:`Series.replace` on categorical data could return a ``Series`` with incorrect dimensions (:issue:`24971`) - -- **Reshaping** diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 443589308ad4c..e5860644fa371 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -15,9 +15,12 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- -- -- +- Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). +- Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) +- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) +- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) .. --------------------------------------------------------------------------- @@ -26,22 +29,43 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`). 
+ +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) + + **Datetimelike** - - +**Timedelta** + +- Bug in :meth:`to_timedelta` failing when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) + + **Numeric** - - +**Groupby/resample/rolling** + +- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) + **Plotting** - **Indexing** +- Bug in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) + +**DataFrame** +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9a1748d72eec1..24887ee26c934 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,11 +13,30 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_120.binary_handle_to_csv: + +Support for binary file handles in ``to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) +with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`). +``mode`` has to contain a ``b`` for binary handles to be supported. + +For example: + +.. ipython:: python + + import io + + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + ..
_whatsnew_120.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ - +- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - - @@ -28,7 +47,7 @@ Other enhancements Deprecations ~~~~~~~~~~~~ - +- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - - @@ -59,19 +78,20 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- +- Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) +- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) - Timedelta ^^^^^^^^^ - +- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) - - Timezones ^^^^^^^^^ -- +- Bug in :func:`date_range` raising AmbiguousTimeError for valid input with `ambiguous=False` (:issue:`35297`) - @@ -108,19 +128,19 @@ Indexing Missing ^^^^^^^ -- +- Bug in :meth:`SeriesGroupBy.transform` not correctly handling missing values for `dropna=False` (:issue:`35014`) - MultiIndex ^^^^^^^^^^ -- +- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raising ``TypeError`` with message `Expected label or tuple of labels` (:issue:`35301`) - I/O ^^^ -- +- Bug in :meth:`to_csv` causing a ``ValueError`` when called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - Plotting @@ -132,14 +152,17 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) +- Bug in :meth:`DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - - - +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping ^^^^^^^^^ -- +- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) +- Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Sparse @@ -165,4 +188,4 @@ Other ..
_whatsnew_120.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ diff --git a/environment.yml b/environment.yml index 3b088ca511be9..ed9762e5b8893 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - flake8<3.8.0 # temporary pin, GH#34150 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort=4.3.21 # check that imports are in the right order + - isort>=5.2.1 # check that imports are in the right order - mypy=0.730 - pycodestyle # used by flake8 @@ -109,3 +109,4 @@ dependencies: - pip: - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master - git+https://github.com/numpy/numpydoc + - pyflakes>=2.2.0 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f5e16cddeb04c..fb41b37980b2e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -442,8 +442,8 @@ def register_option( ValueError if `validator` is specified and `defval` is not a valid value. """ - import tokenize import keyword + import tokenize key = key.lower() @@ -462,7 +462,7 @@ def register_option( for k in path: # NOTE: tokenize.Name is not a public constant # error: Module has no attribute "Name" [attr-defined] - if not re.match("^" + tokenize.Name + "$", k): # type: ignore + if not re.match("^" + tokenize.Name + "$", k): # type: ignore[attr-defined] raise ValueError(f"{k} is not a valid identifier") if keyword.iskeyword(k): raise ValueError(f"{k} is a python keyword") @@ -660,8 +660,8 @@ def _build_option_description(k: str) -> str: def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): """ Builds a concise listing of available options, grouped by prefix """ - from textwrap import wrap from itertools import groupby + from textwrap import wrap def pp(name: str, ks: Iterable[str]) -> List[str]: pfx = "- " + name + ".[" if name else "" diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6b6ead795584f..7e90a8cc681ef 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1,11 +1,12 @@ import cython from cython import Py_ssize_t -from libc.stdlib cimport malloc, free -from libc.string cimport memmove from libc.math cimport fabs, sqrt +from libc.stdlib cimport free, malloc +from libc.string cimport memmove import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_FLOAT32, @@ -31,12 +32,11 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.util cimport numeric, get_nat - from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -46,7 +46,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) - +from pandas._libs.util cimport get_nat, numeric import pandas._libs.missing as missing diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7c57e6ee9dbfd..38cb973d6dde9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,27 +1,51 @@ import cython from cython import Py_ssize_t -from cython cimport floating -from libc.stdlib cimport malloc, free +from cython cimport floating +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) +from numpy cimport ( + complex64_t, + complex128_t, + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + 
uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from numpy.math cimport NAN -cnp.import_array() -from pandas._libs.util cimport numeric, get_nat +cnp.import_array() -from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, - TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, - TIEBREAK_DENSE) -from pandas._libs.algos import (take_2d_axis1_float64_float64, - groupsort_indexer, tiebreakers) +from pandas._libs.algos cimport ( + TIEBREAK_AVERAGE, + TIEBREAK_DENSE, + TIEBREAK_FIRST, + TIEBREAK_MAX, + TIEBREAK_MIN, + TiebreakEnumType, + swap, +) +from pandas._libs.util cimport get_nat, numeric + +from pandas._libs.algos import ( + groupsort_indexer, + take_2d_axis1_float64_float64, + tiebreakers, +) from pandas._libs.missing cimport checknull + cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a98820ca57895..f2af04d91a3e3 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -2,10 +2,13 @@ # at https://github.com/veorq/SipHash import cython -from libc.stdlib cimport malloc, free + +from libc.stdlib cimport free, malloc import numpy as np -from numpy cimport ndarray, uint8_t, uint32_t, uint64_t, import_array + +from numpy cimport import_array, ndarray, uint8_t, uint32_t, uint64_t + import_array() from pandas._libs.util cimport is_nan diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c3dcbb942d7fe..ffaf6d6505955 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,60 +1,57 @@ cimport cython - -from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Free - -from libc.stdlib cimport malloc, free +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.ref cimport Py_INCREF, PyObject +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint32_t, float64_t +from numpy cimport float64_t, ndarray, uint8_t, uint32_t from numpy.math cimport NAN + cnp.import_array() +from pandas._libs cimport util from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, - kh_init_str, - kh_put_str, - kh_exist_str, - kh_get_str, - kh_destroy_str, - kh_resize_str, - kh_put_strbox, - kh_get_strbox, - kh_init_strbox, - kh_int64_t, - kh_init_int64, - kh_resize_int64, + kh_destroy_float64, kh_destroy_int64, - kh_get_int64, + kh_destroy_pymap, + kh_destroy_str, + kh_destroy_uint64, + kh_exist_float64, kh_exist_int64, - kh_put_int64, + kh_exist_pymap, + kh_exist_str, + kh_exist_uint64, kh_float64_t, - kh_exist_float64, - kh_put_float64, - kh_init_float64, kh_get_float64, - kh_destroy_float64, - kh_resize_float64, - kh_resize_uint64, - kh_exist_uint64, - kh_destroy_uint64, - kh_put_uint64, + kh_get_int64, + kh_get_pymap, + kh_get_str, + kh_get_strbox, kh_get_uint64, - kh_init_uint64, - kh_destroy_pymap, - kh_exist_pymap, + kh_init_float64, + kh_init_int64, kh_init_pymap, - kh_get_pymap, + kh_init_str, + kh_init_strbox, + kh_init_uint64, + kh_int64_t, + kh_put_float64, + kh_put_int64, kh_put_pymap, + kh_put_str, + kh_put_strbox, + kh_put_uint64, + kh_resize_float64, + kh_resize_int64, kh_resize_pymap, + kh_resize_str, + kh_resize_uint64, + kh_str_t, + khiter_t, ) - - -from pandas._libs cimport util - from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0b11860848ac0..82a55137fb37c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,6 +1,7 @@ import warnings import numpy as np + 
cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,17 +17,16 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - +from pandas._libs.hashtable cimport HashTable from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timedeltas cimport _Timedelta - -from pandas._libs.hashtable cimport HashTable +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs import algos, hashtable as _hash from pandas._libs.missing import checknull, isnaobj diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b4b490f49b12..4f27fde52414a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -5,12 +5,15 @@ from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx + cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np + cimport numpy as cnp from numpy cimport NPY_INT64, int64_t + cnp.import_array() from pandas._libs.algos import ensure_int64 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 95881ebf1385c..6867e8aba7411 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,7 +1,8 @@ import numbers from operator import le, lt -from cpython.datetime cimport PyDelta_Check, PyDateTime_IMPORT +from cpython.datetime cimport PyDateTime_IMPORT, PyDelta_Check + PyDateTime_IMPORT from cpython.object cimport ( @@ -16,8 +17,8 @@ from cpython.object cimport ( import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_QUICKSORT, @@ -30,22 +31,21 @@ from numpy cimport ( ndarray, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - from pandas._libs.hashtable cimport Int64Vector +from pandas._libs.tslibs.timedeltas cimport _Timedelta +from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.util cimport ( - is_integer_object, is_float_object, + is_integer_object, is_timedelta64_object, ) -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timedeltas cimport _Timedelta - _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 54892a7e4bc77..13c7187923473 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,6 +16,7 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs.algos import ( @@ -640,7 +641,11 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # ---------------------------------------------------------------------- from pandas._libs.hashtable cimport ( - HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + HashTable, + Int64HashTable, + PyObjectHashTable, + UInt64HashTable, +) ctypedef fused asof_t: uint8_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5ecbb2c3ffd35..5fa91ffee8ea8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -5,23 +5,24 @@ import warnings import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_RichCompareBool, Py_EQ -from cpython.ref cimport Py_INCREF -from cpython.tuple cimport PyTuple_SET_ITEM, 
PyTuple_New -from cpython.iterator cimport PyIter_Check -from cpython.sequence cimport PySequence_Check -from cpython.number cimport PyNumber_Check - from cpython.datetime cimport ( - PyDateTime_Check, PyDate_Check, - PyTime_Check, - PyDelta_Check, + PyDateTime_Check, PyDateTime_IMPORT, + PyDelta_Check, + PyTime_Check, ) +from cpython.iterator cimport PyIter_Check +from cpython.number cimport PyNumber_Check +from cpython.object cimport Py_EQ, PyObject_RichCompareBool +from cpython.ref cimport Py_INCREF +from cpython.sequence cimport PySequence_Check +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM + PyDateTime_IMPORT import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_OBJECT, @@ -39,6 +40,7 @@ from numpy cimport ( uint8_t, uint64_t, ) + cnp.import_array() cdef extern from "numpy/arrayobject.h": @@ -63,28 +65,23 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 from pandas._libs cimport util -from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan from pandas._libs.tslib import array_to_datetime -from pandas._libs.tslibs.nattype cimport ( - NPY_NAT, - c_NaT as NaT, - checknull_with_nat, -) -from pandas._libs.tslibs.conversion cimport convert_to_tsobject -from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.missing cimport ( + C_NA, checknull, - isnaobj, is_null_datetime64, is_null_timedelta64, - C_NA, + isnaobj, ) - +from pandas._libs.tslibs.conversion cimport convert_to_tsobject +from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT, checknull_with_nat +from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.period cimport is_period_object +from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 +from pandas._libs.tslibs.timezones cimport tz_compare # constants that will be compared to potentially arbitrarily large # python int @@ -1317,8 +1314,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if not isinstance(value, list): value = list(value) - from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike) + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike values = construct_1d_object_array_from_listlike(value) # make contiguous diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index fdd06fe631b97..771e8053ac9be 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,27 +1,25 @@ -import cython -from cython import Py_ssize_t - import numbers +import cython +from cython import Py_ssize_t import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t + cnp.import_array() from pandas._libs cimport util - - -from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, checknull_with_nat, is_null_datetimelike, ) -from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit - cdef: float64_t INF = 
np.inf float64_t NEGINF = -INF @@ -157,7 +155,10 @@ def isnaobj_old(arr: ndarray) -> ndarray: result = np.zeros(n, dtype=np.uint8) for i in range(n): val = arr[i] - result[i] = checknull(val) or val == INF or val == NEGINF + result[i] = ( + checknull(val) + or util.is_float_object(val) and (val == INF or val == NEGINF) + ) return result.view(np.bool_) diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 658600cdfbe6c..d1f897d237c1b 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -10,18 +10,17 @@ from cpython.object cimport ( PyObject_RichCompareBool, ) - import cython from cython import Py_ssize_t - import numpy as np -from numpy cimport ndarray, uint8_t, import_array -import_array() +from numpy cimport import_array, ndarray, uint8_t + +import_array() -from pandas._libs.util cimport UINT8_MAX, is_nan from pandas._libs.missing cimport checknull +from pandas._libs.util cimport UINT8_MAX, is_nan @cython.wraparound(False) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6ffb036e01595..fa77af6bd5a25 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,6 +1,8 @@ # Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license import bz2 +from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC +from errno import ENOENT import gzip import io import os @@ -9,17 +11,14 @@ import time import warnings import zipfile -from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE -from errno import ENOENT - from libc.stdlib cimport free -from libc.string cimport strncpy, strlen, strcasecmp +from libc.string cimport strcasecmp, strlen, strncpy import cython from cython import Py_ssize_t from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString -from cpython.exc cimport PyErr_Occurred, PyErr_Fetch +from cpython.exc cimport PyErr_Fetch, PyErr_Occurred from cpython.object cimport PyObject from cpython.ref cimport Py_XDECREF from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode @@ -30,37 +29,59 @@ cdef extern from "Python.h": import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t + cnp.import_array() from pandas._libs cimport util -from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX + import pandas._libs.lib as lib from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, kh_init_str, kh_put_str, kh_exist_str, - kh_get_str, kh_destroy_str, - kh_float64_t, kh_get_float64, kh_destroy_float64, - kh_put_float64, kh_init_float64, kh_resize_float64, - kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, + kh_destroy_float64, + kh_destroy_str, + kh_destroy_str_starts, kh_destroy_strbox, - kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, - kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) + kh_exist_str, + kh_float64_t, + kh_get_float64, + kh_get_str, + kh_get_str_starts_item, + kh_get_strbox, + kh_init_float64, + kh_init_str, + kh_init_str_starts, + kh_init_strbox, + kh_put_float64, + kh_put_str, + kh_put_str_starts_item, + kh_put_strbox, + kh_resize_float64, + kh_resize_str_starts, + kh_str_starts_t, + kh_str_t, + kh_strbox_t, + khiter_t, +) + +from pandas.compat import _get_lzma_file, _import_lzma +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( + is_bool_dtype, is_categorical_dtype, - is_integer_dtype, 
is_float_dtype, - is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype, is_extension_array_dtype) + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + pandas_dtype, +) from pandas.core.dtypes.concat import union_categoricals -from pandas.compat import _import_lzma, _get_lzma_file -from pandas.errors import (ParserError, DtypeWarning, - EmptyDataError, ParserWarning) - lzma = _import_lzma() cdef: diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index a01e0c5705dcf..7b36bc8baf891 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -2,15 +2,18 @@ from copy import copy from cython import Py_ssize_t -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport int64_t, ndarray + cnp.import_array() from pandas._libs cimport util -from pandas._libs.lib import maybe_convert_objects, is_scalar + +from pandas._libs.lib import is_scalar, maybe_convert_objects cdef _check_result_array(object obj, Py_ssize_t cnt): diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index da4dd00027395..5c6c15fb50fed 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -16,7 +16,9 @@ from numpy cimport ( ) import numpy as np + cimport numpy as cnp + cnp.import_array() from pandas._libs.lib cimport c_is_list_like diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 7c9575d921dc9..321d7c374d8ec 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -1,9 +1,18 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, - float64_t, float32_t) +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, +) + cnp.import_array() diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 785a4d1f8b923..64fc8d615ea9c 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,13 +1,16 @@ import math import numpy as np + from numpy cimport import_array + import_array() from pandas._libs.util cimport is_array -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.missing import array_equivalent, isna + cdef NUMERIC_TYPES = ( bool, @@ -129,6 +132,7 @@ cpdef assert_almost_equal(a, b, if not isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -181,6 +185,7 @@ cpdef assert_almost_equal(a, b, elif isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 35d5cd8f1e275..e4128af62d06d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,23 +7,20 @@ from cpython.datetime cimport ( datetime, tzinfo, ) + # import datetime C API PyDateTime_IMPORT cimport numpy as cnp from numpy cimport float64_t, int64_t, ndarray + import numpy as np + cnp.import_array() import pytz -from pandas._libs.util cimport ( - is_datetime64_object, - is_float_object, - is_integer_object, -) - from pandas._libs.tslibs.np_datetime cimport ( _string_to_dts, check_dts_bounds, @@ -34,9 +31,9 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.util cimport 
is_datetime64_object, is_float_object, is_integer_object from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime - from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.conversion cimport ( @@ -45,22 +42,18 @@ from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, get_datetime64_nanos, ) - from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, ) - from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.tzconversion cimport ( - tz_localize_to_utc_single, -) +from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single def _test_parse_iso8601(ts: str): diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 00cecd25e5225..6cce2f5e1fd95 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -5,7 +5,7 @@ Cython implementations of functions resembling the stdlib calendar module import cython -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 8cc3d25e86340..adf1dfbc1ac72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,44 +1,68 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int32_t, intp_t, ndarray +from numpy cimport int32_t, int64_t, intp_t, ndarray + cnp.import_array() import pytz # stdlib datetime imports -from cpython.datetime cimport (datetime, time, tzinfo, - PyDateTime_Check, PyDate_Check, - PyDateTime_IMPORT) + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + datetime, + time, + tzinfo, +) + PyDateTime_IMPORT from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, - _string_to_dts, npy_datetime, dt64_to_dtstruct, dtstruct_to_dt64, - get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64, - NPY_DATETIMEUNIT, NPY_FR_ns) -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + NPY_DATETIMEUNIT, + NPY_FR_ns, + _string_to_dts, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + get_datetime64_unit, + get_datetime64_value, + npy_datetime, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, + pydatetime_to_dt64, +) -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_integer_object, is_float_object) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - maybe_get_tz, tz_compare, + get_dst_info, + get_utcoffset, + is_fixed_offset, + is_tzlocal, + is_utc, + maybe_get_tz, + tz_compare, utc_pytz as UTC, ) +from pandas._libs.tslibs.util cimport ( + is_datetime64_object, + is_float_object, + is_integer_object, +) + from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - checknull_with_nat, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, ) - from pandas._libs.tslibs.tzconversion 
cimport ( tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 1d1f900bc18b3..16fa05c3801c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -6,26 +6,37 @@ from locale import LC_TIME import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t +from numpy cimport int8_t, int32_t, int64_t, ndarray, uint32_t + cnp.import_array() from pandas._config.localization import set_locale -from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL +from pandas._libs.tslibs.ccalendar import DAYS_FULL, MONTHS_FULL + from pandas._libs.tslibs.ccalendar cimport ( - get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year, get_iso_calendar, iso_calendar_t, - month_offset, + dayofweek, + get_day_of_year, + get_days_in_month, get_firstbday, + get_iso_calendar, get_lastbday, + get_week_of_year, + is_leapyear, + iso_calendar_t, + month_offset, ) -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, - td64_to_tdstruct) from pandas._libs.tslibs.nattype cimport NPY_NAT +from pandas._libs.tslibs.np_datetime cimport ( + dt64_to_dtstruct, + npy_datetimestruct, + pandas_timedeltastruct, + td64_to_tdstruct, +) + from pandas._libs.tslibs.strptime import LocaleTime diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 264013f928d22..79f50c7261905 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,3 +1,10 @@ +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.object cimport ( Py_EQ, Py_GE, @@ -8,28 +15,19 @@ from cpython.object cimport ( PyObject_RichCompare, ) -from cpython.datetime cimport ( - PyDateTime_Check, - PyDateTime_IMPORT, - PyDelta_Check, - datetime, - timedelta, -) PyDateTime_IMPORT from cpython.version cimport PY_MINOR_VERSION import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() -from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, - get_timedelta64_value, -) cimport pandas._libs.tslibs.util as util - +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # ---------------------------------------------------------------------- # Constants @@ -109,30 +107,25 @@ cdef class _NaT(datetime): __array_priority__ = 100 def __richcmp__(_NaT self, object other, int op): - cdef: - int ndim = getattr(other, "ndim", -1) + if util.is_datetime64_object(other) or PyDateTime_Check(other): + # We treat NaT as datetime-like for this comparison + return _nat_scalar_rules[op] - if ndim == -1: + elif util.is_timedelta64_object(other) or PyDelta_Check(other): + # We treat NaT as timedelta-like for this comparison return _nat_scalar_rules[op] elif util.is_array(other): - result = np.empty(other.shape, dtype=np.bool_) - result.fill(_nat_scalar_rules[op]) + if other.dtype.kind in "mM": + result = np.empty(other.shape, dtype=np.bool_) + result.fill(_nat_scalar_rules[op]) + elif other.dtype.kind == "O": + result = np.array([PyObject_RichCompare(self, x, op) for x in other]) + else: + return NotImplemented return result - elif ndim == 0: - if util.is_datetime64_object(other): - return _nat_scalar_rules[op] - else: - raise TypeError( - f"Cannot compare type {type(self).__name__} " - f"with type 
{type(other).__name__}" - ) - - # Note: instead of passing "other, self, _reverse_ops[op]", we observe - # that `_nat_scalar_rules` is invariant under `_reverse_ops`, - # rendering it unnecessary. - return PyObject_RichCompare(other, self, op) + return NotImplemented def __add__(self, other): if self is not c_NaT: diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 31cc55ad981bb..12aaaf4ce3977 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,5 +1,3 @@ -from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE - from cpython.datetime cimport ( PyDateTime_DATE_GET_HOUR, PyDateTime_DATE_GET_MICROSECOND, @@ -10,11 +8,15 @@ from cpython.datetime cimport ( PyDateTime_GET_YEAR, PyDateTime_IMPORT, ) +from cpython.object cimport Py_EQ, Py_GE, Py_GT, Py_LE, Py_LT, Py_NE + PyDateTime_IMPORT from numpy cimport int64_t + from pandas._libs.tslibs.util cimport get_c_string_buf_and_size + cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, npy_datetimestruct *b) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 9a7ca15a2a1c2..ac2725fc58aee 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,39 +1,51 @@ -import cython - import operator import re import time from typing import Any import warnings -from cpython.datetime cimport (PyDateTime_IMPORT, - PyDateTime_Check, - PyDate_Check, - PyDelta_Check, - datetime, timedelta, date, - time as dt_time) + +import cython + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + date, + datetime, + time as dt_time, + timedelta, +) + PyDateTime_IMPORT -from dateutil.relativedelta import relativedelta from dateutil.easter import easter - +from dateutil.relativedelta import relativedelta import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util from pandas._libs.tslibs.util cimport ( - is_integer_object, is_datetime64_object, is_float_object, + is_integer_object, ) from pandas._libs.tslibs.ccalendar import ( - MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, + MONTH_ALIASES, + MONTH_TO_CAL_NUM, + int_to_weekday, + weekday_to_int, ) + from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, dayofweek, @@ -47,17 +59,20 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, pydate_to_dtstruct, ) from pandas._libs.tslibs.tzconversion cimport tz_convert_from_utc_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds + from .timedeltas import Timedelta + from .timestamps cimport _Timestamp + from .timestamps import Timestamp # --------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c4f369d0d3b3f..8429aebbd85b8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -9,39 +9,44 @@ from libc.string cimport strchr import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_Str - from cpython.datetime 
cimport datetime, datetime_new, import_datetime, tzinfo +from cpython.object cimport PyObject_Str from cpython.version cimport PY_VERSION_HEX + import_datetime() import numpy as np + cimport numpy as cnp -from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, - PyArray_IterNew, flatiter, float64_t) +from numpy cimport ( + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, + flatiter, + float64_t, +) + cnp.import_array() # dateutil compat -from dateutil.tz import (tzoffset, - tzlocal as _dateutil_tzlocal, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) + +from dateutil.parser import DEFAULTPARSER, parse as du_parse from dateutil.relativedelta import relativedelta -from dateutil.parser import DEFAULTPARSER -from dateutil.parser import parse as du_parse +from dateutil.tz import ( + tzlocal as _dateutil_tzlocal, + tzoffset, + tzstr as _dateutil_tzstr, + tzutc as _dateutil_tzutc, +) from pandas._config import get_option from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.nattype cimport ( - c_nat_strings as nat_strings, - c_NaT as NaT, -) -from pandas._libs.tslibs.util cimport ( - is_array, - get_c_string_buf_and_size, -) +from pandas._libs.tslibs.nattype cimport c_NaT as NaT, c_nat_strings as nat_strings from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size, is_array + cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 20961c6da56bd..86b6533f5caf5 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,96 +1,98 @@ import warnings -from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompareBool +from numpy cimport import_array, int64_t, ndarray -from numpy cimport int64_t, import_array, ndarray import numpy as np + import_array() from libc.stdlib cimport free, malloc +from libc.string cimport memset, strlen from libc.time cimport strftime, tm -from libc.string cimport strlen, memset import cython from cpython.datetime cimport ( - datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, PyDelta_Check, + datetime, ) + # import datetime C API PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, - dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, - check_dts_bounds, NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, ) + cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.timedeltas import Timedelta -from pandas._libs.tslibs.timedeltas cimport ( - delta_to_nanoseconds, - is_any_td_scalar, -) +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.ccalendar cimport ( + c_MONTH_NUMBERS, dayofweek, get_day_of_year, - is_leapyear, - get_week_of_year, get_days_in_month, + get_week_of_year, + is_leapyear, ) -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.conversion import ensure_datetime64ns from 
pandas._libs.tslibs.dtypes cimport ( - PeriodDtypeBase, - FR_UND, FR_ANN, - FR_QTR, - FR_MTH, - FR_WK, FR_BUS, FR_DAY, FR_HR, FR_MIN, - FR_SEC, FR_MS, - FR_US, + FR_MTH, FR_NS, + FR_QTR, + FR_SEC, + FR_UND, + FR_US, + FR_WK, + PeriodDtypeBase, attrname_to_abbrevs, ) - from pandas._libs.tslibs.parsing cimport get_rule_month + from pandas._libs.tslibs.parsing import parse_time_string + from pandas._libs.tslibs.nattype cimport ( - _nat_scalar_rules, NPY_NAT, - is_null_datetimelike, + _nat_scalar_rules, c_NaT as NaT, c_nat_strings as nat_strings, + is_null_datetimelike, ) from pandas._libs.tslibs.offsets cimport ( BaseOffset, - to_offset, - is_tick_object, is_offset_object, + is_tick_object, + to_offset, ) -from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG cdef: enum: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 660b582f73e6e..d2690be905a68 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,27 +1,30 @@ """Strptime-related classes and functions. """ -import time -import locale import calendar +import locale import re +import time from cpython.datetime cimport date, tzinfo from _thread import allocate_lock as _thread_allocate_lock +import numpy as np import pytz -import numpy as np from numpy cimport int64_t -from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_nat_strings as nat_strings, + checknull_with_nat, ) +from pandas._libs.tslibs.np_datetime cimport ( + check_dts_bounds, + dtstruct_to_dt64, + npy_datetimestruct, +) + cdef dict _parse_code_table = {'y': 0, 'Y': 1, diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8f3a599bf107c..ee32ed53a908b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2,39 +2,47 @@ import collections import cython -from cpython.object cimport Py_NE, Py_EQ, PyObject_RichCompare +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() -from cpython.datetime cimport (timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + timedelta, +) + PyDateTime_IMPORT cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - is_timedelta64_object, is_datetime64_object, is_integer_object, - is_float_object, is_array -) - from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.conversion cimport cast_from_unit - -from pandas._libs.tslibs.np_datetime cimport ( - cmp_scalar, td64_to_tdstruct, pandas_timedeltastruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, +) +from pandas._libs.tslibs.np_datetime cimport ( + cmp_scalar, + pandas_timedeltastruct, + td64_to_tdstruct, ) from pandas._libs.tslibs.offsets cimport is_tick_object +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8cef685933863..bddfc30d86a53 100644 
--- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -9,54 +9,66 @@ shadows the python class, where we do any heavy lifting. import warnings import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int8_t, uint8_t, ndarray -cnp.import_array() +from numpy cimport int8_t, int64_t, ndarray, uint8_t -from cpython.object cimport (PyObject_RichCompareBool, PyObject_RichCompare, - Py_EQ, Py_NE) +cnp.import_array() -from cpython.datetime cimport ( - datetime, - time, - tzinfo, - tzinfo as tzinfo_type, # alias bc `tzinfo` is a kwarg below +from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below PyDateTime_Check, + PyDateTime_IMPORT, PyDelta_Check, PyTZInfo_Check, - PyDateTime_IMPORT, -) -PyDateTime_IMPORT - -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, - is_timedelta64_object, is_array, + datetime, + time, + tzinfo as tzinfo_type, ) +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare, PyObject_RichCompareBool -from pandas._libs.tslibs.base cimport ABCTimestamp +PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar - +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, - convert_to_tsobject, convert_datetime_to_tsobject, + convert_to_tsobject, normalize_i8_stamp, ) -from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) + +from pandas._libs.tslibs.fields import get_date_name_field, get_start_end_field + from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct, + check_dts_bounds, cmp_scalar, + dt64_to_dtstruct, + npy_datetimestruct, pydatetime_to_dt64, ) + from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas._libs.tslibs.offsets cimport to_offset, is_offset_object -from pandas._libs.tslibs.timedeltas cimport is_any_td_scalar, delta_to_nanoseconds + +from pandas._libs.tslibs.offsets cimport is_offset_object, to_offset +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.timezones cimport ( - is_utc, maybe_get_tz, treat_tz_as_pytz, utc_pytz as UTC, - get_timezone, tz_compare, + get_timezone, + is_utc, + maybe_get_tz, + treat_tz_as_pytz, + tz_compare, + utc_pytz as UTC, ) from pandas._libs.tslibs.tzconversion cimport ( tz_convert_from_utc_single, diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index a8c785704d8e8..b82291a71057e 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,27 +1,31 @@ from datetime import timezone + from cpython.datetime cimport datetime, timedelta, tzinfo # dateutil compat + from dateutil.tz import ( gettz as dateutil_gettz, tzfile as _dateutil_tzfile, tzlocal as _dateutil_tzlocal, tzutc as _dateutil_tzutc, ) - - -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo import pytz +from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + UTC = pytz.utc import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport is_integer_object, get_nat +from pandas._libs.tslibs.util 
cimport get_nat, is_integer_object + cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 606639af16a18..4c62b16d430bd 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -5,21 +5,27 @@ import cython from cython import Py_ssize_t from cpython.datetime cimport ( - PyDateTime_IMPORT, PyDelta_Check, datetime, timedelta, tzinfo) + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, + tzinfo, +) + PyDateTime_IMPORT -import pytz from dateutil.tz import tzutc - import numpy as np +import pytz + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, intp_t +from numpy cimport int64_t, intp_t, ndarray, uint8_t + cnp.import_array() from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) +from pandas._libs.tslibs.np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from pandas._libs.tslibs.timezones cimport ( get_dst_info, get_utcoffset, @@ -404,7 +410,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): return val + deltas[pos] -def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): +def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the values (in i8) from UTC to tz @@ -429,7 +435,7 @@ def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): +cdef int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the given values (in i8) either to UTC or from UTC. @@ -451,7 +457,7 @@ cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): str typ if is_utc(tz): - converted = vals + converted = vals.copy() elif is_tzlocal(tz): converted = np.empty(n, dtype=np.int64) for i in range(n): diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c8f8daf6724c2..b23f8255a76ac 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,18 +1,21 @@ import cython -from cpython.datetime cimport datetime, date, time, tzinfo +from cpython.datetime cimport date, datetime, time, tzinfo import numpy as np + from numpy cimport int64_t, intp_t, ndarray from .conversion cimport normalize_i8_stamp + from .dtypes import Resolution + from .nattype cimport NPY_NAT, c_NaT as NaT -from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from .offsets cimport to_offset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from .timezones cimport is_utc, is_tzlocal, get_dst_info +from .timezones cimport get_dst_info, is_tzlocal, is_utc from .tzconversion cimport tz_convert_utc_to_tzlocal # ------------------------------------------------------------------------- @@ -208,49 +211,40 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None): int reso = RESO_DAY, curr_reso ndarray[int64_t] trans int64_t[:] deltas - Py_ssize_t[:] pos - int64_t local_val, delta + intp_t[:] pos + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False if is_utc(tz) or tz is None: - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = 
curr_reso + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + delta, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + + for i in range(n): + if stamps[i] == NPY_NAT: + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + dt64_to_dtstruct(local_val, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso return Resolution(reso) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 362d0e6263697..3ec4547d223ce 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,13 +2,15 @@ import cython from cython import Py_ssize_t -from libcpp.deque cimport deque -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc +from libcpp.deque cimport deque import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, float64_t, float32_t, uint8_t +from numpy cimport float32_t, float64_t, int64_t, ndarray, uint8_t + cnp.import_array() @@ -22,6 +24,7 @@ from pandas._libs.algos import is_monotonic from pandas._libs.util cimport numeric + cdef extern from "../src/skiplist.h": ctypedef struct node_t: node_t **next diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 8a1e7feb57ace..9af1159a805ec 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,7 +1,8 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import numpy as np -from numpy cimport ndarray, int64_t + +from numpy cimport int64_t, ndarray # Cython routines for window indexers diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 2d5b31d7ccbcf..40c39aabb7a7a 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -5,8 +5,8 @@ from cpython.bytes cimport PyBytes_GET_SIZE from cpython.unicode cimport PyUnicode_GET_SIZE import numpy as np -from numpy cimport ndarray, uint8_t +from numpy cimport ndarray, uint8_t ctypedef fused pandas_string: str diff --git a/pandas/_testing.py b/pandas/_testing.py index 1cf9304ed2715..713f29466f097 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -535,7 +535,7 @@ def rands(nchars): def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close + from matplotlib.pyplot import close as _close, get_fignums if fignum is None: for fignum in get_fignums(): @@ -1339,10 +1339,8 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact: - if not is_numeric_dtype(left.dtype): - raise 
AssertionError("check_exact may only be used with numeric Series") - + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, diff --git a/pandas/_typing.py b/pandas/_typing.py index 8e98833ad37f7..76ec527e6e258 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -24,13 +24,15 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 - from pandas.core.arrays.base import ExtensionArray # noqa: F401 + from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 - from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 - from pandas.core.series import Series # noqa: F401 + from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.frame import DataFrame # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 + from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 # array-like diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 0484de3fa165d..ef9f36705a7ee 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -14,7 +14,7 @@ from pandas import Index if TYPE_CHECKING: - from pandas import Series, DataFrame + from pandas import DataFrame, Series def load_reduce(self): @@ -64,7 +64,7 @@ class _LoadSparseSeries: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "Series", but must return # a subtype of "_LoadSparseSeries") - def __new__(cls) -> "Series": # type: ignore + def __new__(cls) -> "Series": # type: ignore[misc] from pandas import Series warnings.warn( @@ -82,7 +82,7 @@ class _LoadSparseFrame: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "DataFrame", but must # return a subtype of "_LoadSparseFrame") - def __new__(cls) -> "DataFrame": # type: ignore + def __new__(cls) -> "DataFrame": # type: ignore[misc] from pandas import DataFrame warnings.warn( @@ -181,7 +181,7 @@ def __new__(cls) -> "DataFrame": # type: ignore # functions for compat and uses a non-public class of the pickle module. 
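
A recurring change throughout this patch is narrowing bare ``# type: ignore`` comments to
error-code-qualified ones. A minimal sketch of the difference, assuming mypy >= 0.730 (the
first release with error codes); the function itself is hypothetical::

    from typing import Optional

    def first_char(s: Optional[str]) -> str:
        # A bare "# type: ignore" would silence every mypy error on this
        # line; the code-qualified form silences only [index], so any new,
        # unrelated error on the same line is still reported.
        return s[0]  # type: ignore[index]
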
# error: Name 'pkl._Unpickler' is not defined -class Unpickler(pkl._Unpickler): # type: ignore +class Unpickler(pkl._Unpickler): # type: ignore[name-defined] def find_class(self, module, name): # override superclass key = (module, name) diff --git a/pandas/conftest.py b/pandas/conftest.py index e0adb37e7d2f5..c1925b4f5ca3b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -359,7 +359,7 @@ def multiindex_year_month_day_dataframe_random_data(): tdf = tm.makeTimeDataFrame(100) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index = ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels]) ymd.index.set_names(["year", "month", "day"], inplace=True) return ymd diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e3ca4cc53363..befde7c355818 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -427,7 +427,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps.isin(values) # type: ignore + # error: "ExtensionArray" has no attribute "isin" [attr-defined] + return comps.isin(values) # type: ignore[attr-defined] comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 733dbeed34b72..6b8d7dc35fe95 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -15,7 +15,7 @@ from pandas.core.construction import create_series_with_explicit_dtype if TYPE_CHECKING: - from pandas import DataFrame, Series, Index + from pandas import DataFrame, Index, Series ResType = Dict[int, Any] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2553a65aed07b..921927325a144 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -22,7 +22,7 @@ from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -1273,7 +1273,7 @@ def convert_values(param): ovalues = [param] * len(self) return ovalues - if isinstance(other, (ABCSeries, ABCIndexClass)): + if isinstance(other, (ABCSeries, ABCIndexClass, ABCDataFrame)): # rely on pandas to unbox and dispatch to us return NotImplemented diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dbce71b77a425..bd4bdc5ecb46f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -20,7 +20,6 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -559,13 +558,10 @@ def all(self, skipna: bool = True, **kwargs): @classmethod def _create_logical_method(cls, op): + @ops.unpack_zerodim_and_defer(op.__name__) def logical_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. 
- return NotImplemented assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} - other = lib.item_from_zerodim(other) other_is_booleanarray = isinstance(other, BooleanArray) other_is_scalar = lib.is_scalar(other) mask = None @@ -605,16 +601,14 @@ def logical_method(self, other): @classmethod def _create_comparison_method(cls, op): + @ops.unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): from pandas.arrays import IntegerArray - if isinstance( - other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) - ): + if isinstance(other, IntegerArray): # Rely on pandas to unbox and dispatch to us. return NotImplemented - other = lib.item_from_zerodim(other) mask = None if isinstance(other, BooleanArray): @@ -693,13 +687,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @ops.unpack_zerodim_and_defer(op_name) def boolean_arithmetic_method(self, other): - - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented - - other = lib.item_from_zerodim(other) mask = None if isinstance(other, BooleanArray): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index db9cfd9d7fc59..6e5c7bc699962 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -520,7 +520,7 @@ def _from_inferred_categories( ------- Categorical """ - from pandas import Index, to_numeric, to_datetime, to_timedelta + from pandas import Index, to_datetime, to_numeric, to_timedelta cats = Index(inferred_categories) known_categories = ( @@ -1403,7 +1403,7 @@ def value_counts(self, dropna=True): -------- Series.value_counts """ - from pandas import Series, CategoricalIndex + from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories ncat, mask = len(cat), 0 <= code diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ee4d43fdb3bc2..1b5e1d81f00d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -468,6 +468,9 @@ def _ndarray(self) -> np.ndarray: def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` + # error: Unexpected keyword argument "dtype" for "NDArrayBackedExtensionArray" + # TODO: add my error code + # https://github.com/python/mypy/issues/7384 return type(self)(arr, dtype=self.dtype) # type: ignore # ------------------------------------------------------------------ @@ -809,7 +812,8 @@ def _validate_scalar( value = NaT elif isinstance(value, self._recognized_scalars): - value = self._scalar_type(value) # type: ignore + # error: Too many arguments for "object" [call-arg] + value = self._scalar_type(value) # type: ignore[call-arg] else: if msg is None: @@ -959,7 +963,7 @@ def value_counts(self, dropna=False): ------- Series """ - from pandas import Series, Index + from pandas import Index, Series if dropna: values = self[~self.isna()]._data @@ -1129,7 +1133,8 @@ def resolution(self) -> str: """ Returns day, hour, minute, second, millisecond or microsecond """ - return self._resolution_obj.attrname # type: ignore + # error: Item "None" of "Optional[Any]" has no attribute "attrname" + return self._resolution_obj.attrname # type: ignore[union-attr] @classmethod def _validate_frequency(cls, index, freq, **kwargs): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d674b1c476d2c..8b2bb7832b5d0 100644 --- 
a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -418,9 +418,9 @@ def _generate_range( # index is localized datetime64 array -> have to convert # start/end as well to compare if start is not None: - start = start.tz_localize(tz).asm8 + start = start.tz_localize(tz, ambiguous, nonexistent).asm8 if end is not None: - end = end.tz_localize(tz).asm8 + end = end.tz_localize(tz, ambiguous, nonexistent).asm8 else: # Create a linearly spaced date_range in local time # Nanosecond-granularity timestamps aren't always correctly diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b0958af41158c..57df067c7b16e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -116,6 +116,7 @@ def __from_arrow__( Construct IntegerArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask pyarrow_type = pyarrow.from_numpy_dtype(self.type) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c861d25afd13f..d76e0fd628a48 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1057,7 +1057,7 @@ def mid(self): # https://github.com/python/mypy/issues/1362 # Mypy does not support decorated properties - @property # type: ignore + @property # type: ignore[misc] @Appender( _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs ) @@ -1105,6 +1105,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType try: diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index f6dfb1f0f1e62..05f901518d82f 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -11,12 +11,11 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna from pandas import compat -from pandas.core import nanops +from pandas.core import nanops, ops from pandas.core.algorithms import searchsorted from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -436,11 +435,9 @@ def __invert__(self): @classmethod def _create_arithmetic_method(cls, op): + @ops.unpack_zerodim_and_defer(op.__name__) def arithmetic_method(self, other): - if isinstance(other, (ABCIndexClass, ABCSeries)): - return NotImplemented - - elif isinstance(other, cls): + if isinstance(other, cls): other = other._ndarray with np.errstate(all="ignore"): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8d5cb12d60e4d..ddaf6d39f1837 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -278,8 +278,8 @@ def _check_compatible_with(self, other, setitem: bool = False): def dtype(self) -> PeriodDtype: return self._dtype - # error: Read-only property cannot override read-write property [misc] - @property # type: ignore + # error: Read-only property cannot override read-write property + @property # type: ignore[misc] def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. @@ -300,6 +300,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. 
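
The ``_generate_range`` fix above passes ``ambiguous`` and ``nonexistent`` positionally
into ``Timestamp.tz_localize(tz, ambiguous, nonexistent)``, so range endpoints that fall
in a DST gap are now resolved instead of always raising. For reference, the
``tz_localize`` behavior the fix relies on::

    import pandas as pd

    # 02:30 on 2020-03-08 does not exist in US/Eastern (clocks jump from
    # 02:00 to 03:00); shifting forward resolves the gap instead of raising.
    ts = pd.Timestamp("2020-03-08 02:30")
    print(ts.tz_localize("US/Eastern", nonexistent="shift_forward"))
    # 2020-03-08 03:00:00-04:00
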
""" import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType if type is not None: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8a30d2b954b55..da8d695c59b9e 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -87,8 +87,8 @@ def from_coo(cls, A, dense_index=False): 1 0 3.0 dtype: Sparse[float64, nan] """ - from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series result = _coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) @@ -253,9 +253,10 @@ def from_spmatrix(cls, data, index=None, columns=None): 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import DataFrame from pandas._libs.sparse import IntIndex + from pandas import DataFrame + data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape @@ -354,8 +355,8 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): - import pandas.core.indexes.base as ibase from pandas.core.indexes.api import ensure_index + import pandas.core.indexes.base as ibase N, K = data.shape if index is None: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fddd3af858f77..bb55c3cdea45c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -7,7 +7,6 @@ from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas import compat @@ -312,15 +311,14 @@ def memory_usage(self, deep=False): @classmethod def _create_arithmetic_method(cls, op): # Note: this handles both arithmetic and comparison methods. 
+ + @ops.unpack_zerodim_and_defer(op.__name__) def method(self, other): from pandas.arrays import BooleanArray assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS - if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): - return NotImplemented - - elif isinstance(other, cls): + if isinstance(other, cls): other = other._ndarray mask = isna(self) | isna(other) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a378423df788b..3e21d01355dda 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -29,7 +29,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr -from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays import IntegerArray, datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.core.construction import extract_array @@ -628,7 +628,7 @@ def __floordiv__(self, other): result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.int64) + result = result.astype(np.float64) result[mask] = np.nan return result @@ -921,6 +921,8 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data + elif isinstance(data, IntegerArray): + data = data.to_numpy("int64", na_value=tslibs.iNaT) # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 0e9077e6d557e..05a5538a88772 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -227,7 +227,8 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b) # type: ignore + # error: "None" not callable + return _evaluate(op, op_str, a, b) # type: ignore[misc] return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index c7c7103654a65..86e125b6b909b 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -37,7 +37,9 @@ def create_valid_python_identifier(name: str) -> str: special_characters_replacements = { char: f"_{token.tok_name[tokval]}_" # The ignore here is because of a bug in mypy that is resolved in 0.740 - for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + for char, tokval in ( + tokenize.EXACT_TOKEN_TYPES.items() # type: ignore[attr-defined] + ) } special_characters_replacements.update( { diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 001eb1789007f..f1b11a6869c2b 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -63,7 +63,7 @@ def _resolve_name(self): return self.name # read-only property overwriting read/write property - @property # type: ignore + @property # type: ignore[misc] def value(self): return self._value diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 86f6be77bc505..0c23f1b4bcdf2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -327,7 +327,7 @@ def is_terminal() -> bool: """ try: # error: Name 'get_ipython' is not defined - ip = get_ipython() # type: ignore + ip 
= get_ipython() # type: ignore[name-defined] except NameError: # assume standard Python interpreter in a terminal return True else: @@ -662,8 +662,10 @@ def register_plotting_backend_cb(key): def register_converter_cb(key): - from pandas.plotting import register_matplotlib_converters - from pandas.plotting import deregister_matplotlib_converters + from pandas.plotting import ( + deregister_matplotlib_converters, + register_matplotlib_converters, + ) if cf.get_option(key): register_matplotlib_converters() diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6c58698989e96..47f10f1f65f4a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -48,9 +48,9 @@ import pandas.core.common as com if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 - from pandas.core.indexes.api import Index # noqa: F401 from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.indexes.api import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 def array( @@ -255,14 +255,14 @@ def array( ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, BooleanArray, + DatetimeArray, IntegerArray, IntervalArray, PandasArray, - DatetimeArray, - TimedeltaArray, StringArray, + TimedeltaArray, + period_array, ) if lib.is_scalar(data): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6b84f0e81f48b..228329898b6a4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1244,6 +1244,7 @@ def try_datetime(v): # if so coerce to a DatetimeIndex; if they are not the same, # then these stay as object dtype, xref GH19671 from pandas._libs.tslibs import conversion + from pandas import DatetimeIndex try: @@ -1303,8 +1304,8 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime + from pandas.core.tools.timedeltas import to_timedelta if dtype is not None: if isinstance(dtype, str): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a2ca4d84b2bf6..1e70ff90fcd44 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,7 +9,7 @@ from pandas._libs import Interval, Period, algos from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, DtypeObj +from pandas._typing import ArrayLike, DtypeObj, Optional from pandas.core.dtypes.base import registry from pandas.core.dtypes.dtypes import ( @@ -136,11 +136,13 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: """ # TODO: GH27506 potential bug with ExtensionArrays try: - return arr.astype("int64", copy=copy, casting="safe") # type: ignore + # error: Unexpected keyword argument "casting" for "astype" + return arr.astype("int64", copy=copy, casting="safe") # type: ignore[call-arg] except TypeError: pass try: - return arr.astype("uint64", copy=copy, casting="safe") # type: ignore + # error: Unexpected keyword argument "casting" for "astype" + return arr.astype("uint64", copy=copy, casting="safe") # type: ignore[call-arg] except TypeError: if is_extension_array_dtype(arr.dtype): return arr.to_numpy(dtype="float64", na_value=np.nan) @@ -1730,6 +1732,32 @@ def _validate_date_like_dtype(dtype) -> None: ) +def validate_all_hashable(*args, error_name: Optional[str] = None) -> None: + """ + Return None if all 
args are hashable, else raise a TypeError.
+
+    Parameters
+    ----------
+    *args
+        Arguments to validate.
+    error_name : str, optional
+        The name to use in the error message if not all arguments are hashable.
+
+    Raises
+    ------
+    TypeError : If an argument is not hashable
+
+    Returns
+    -------
+    None
+    """
+    if not all(is_hashable(arg) for arg in args):
+        if error_name:
+            raise TypeError(f"{error_name} must be a hashable type")
+        else:
+            raise TypeError("All elements must be hashable")
+
+
 def pandas_dtype(dtype) -> DtypeObj:
     """
     Convert input into a pandas only dtype object or a numpy dtype object.
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 22480fbc47508..8dc500dddeafa 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -30,12 +30,13 @@
 if TYPE_CHECKING:
     import pyarrow  # noqa: F401
+
+    from pandas import Categorical  # noqa: F401
     from pandas.core.arrays import (  # noqa: F401
+        DatetimeArray,
         IntervalArray,
         PeriodArray,
-        DatetimeArray,
     )
-    from pandas import Categorical  # noqa: F401
 
     str_type = str
 
@@ -391,12 +392,13 @@ def __repr__(self) -> str_type:
 
     @staticmethod
     def _hash_categories(categories, ordered: Ordered = True) -> int:
+        from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype
+
         from pandas.core.util.hashing import (
-            hash_array,
             _combine_hash_arrays,
+            hash_array,
             hash_tuples,
         )
-        from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE
 
         if len(categories) and isinstance(categories[0], tuple):
             # assumes if any individual category is a tuple, then all our. ATM
@@ -633,7 +635,8 @@ class DatetimeTZDtype(PandasExtensionDtype):
 
     def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None):
         if isinstance(unit, DatetimeTZDtype):
-            unit, tz = unit.unit, unit.tz  # type: ignore
+            # error: "str" has no attribute "tz"
+            unit, tz = unit.unit, unit.tz  # type: ignore[attr-defined]
 
         if unit != "ns":
             if isinstance(unit, str) and tz is None:
@@ -939,6 +942,7 @@ def __from_arrow__(
         Construct PeriodArray from pyarrow Array/ChunkedArray.
         """
         import pyarrow  # noqa: F811
+
         from pandas.core.arrays import PeriodArray
         from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask
 
@@ -1136,6 +1140,7 @@ def __from_arrow__(
         Construct IntervalArray from pyarrow Array/ChunkedArray.
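
The new ``validate_all_hashable`` helper above (imported by ``pandas/core/indexes/base.py``
later in this patch) is a thin guard; a usage sketch following the code shown in the hunk::

    from pandas.core.dtypes.common import validate_all_hashable

    validate_all_hashable("name", 0, ("a", 1))  # all hashable -> returns None

    try:
        validate_all_hashable(["unhashable"], error_name="Series.name")
    except TypeError as err:
        print(err)  # Series.name must be a hashable type
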
""" import pyarrow # noqa: F811 + from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 36eff214fc314..1f1017cfc1929 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -7,7 +7,7 @@ def create_pandas_abc_type(name, attr, comp): # https://github.com/python/mypy/issues/1006 # error: 'classmethod' used with a non-method - @classmethod # type: ignore + @classmethod # type: ignore[misc] def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8551ce9f14e6c..f59bb31af2828 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,7 +10,7 @@ from pandas._libs import lib import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from pandas._typing import DtypeObj +from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -484,6 +484,18 @@ def _array_equivalent_object(left, right, strict_nan): return True +def array_equals(left: ArrayLike, right: ArrayLike) -> bool: + """ + ExtensionArray-compatible implementation of array_equivalent. + """ + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ABCExtensionArray): + return left.equals(right) + else: + return array_equivalent(left, right, dtype_equal=True) + + def _infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f634c1e6e1ff..b66b6b92336f2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -150,6 +150,7 @@ if TYPE_CHECKING: from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.io.formats.style import Styler # --------------------------------------------------------------------- @@ -1370,6 +1371,8 @@ def to_numpy( result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) + if result.dtype is not dtype: + result = np.array(result, dtype=dtype, copy=False) return result @@ -2163,10 +2166,14 @@ def to_stata( from pandas.io.stata import StataWriter as statawriter elif version == 117: # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriter117 as statawriter # type: ignore + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriter117 as statawriter, + ) else: # versions 118 and 119 # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriterUTF8 as statawriter # type: ignore + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriterUTF8 as statawriter, + ) kwargs: Dict[str, Any] = {} if version is None or version >= 117: @@ -2177,7 +2184,7 @@ def to_stata( kwargs["version"] = version # mypy: Too many arguments for "StataWriter" - writer = statawriter( # type: ignore + writer = statawriter( # type: ignore[call-arg] path, self, convert_dates=convert_dates, @@ -3577,7 +3584,13 @@ def extract_unique_dtypes_from_dtypes_set( extracted_dtypes = [ unique_dtype for unique_dtype in unique_dtypes - if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore + # error: Argument 1 to "tuple" has incompatible type + # "FrozenSet[Union[ExtensionDtype, str, Any, Type[str], + # Type[float], Type[int], Type[complex], Type[bool]]]"; + # expected "Iterable[Union[type, Tuple[Any, ...]]]" + if issubclass( + 
+                unique_dtype.type, tuple(dtypes_set)  # type: ignore[arg-type]
+            )
         ]
 
         return extracted_dtypes
@@ -4192,7 +4205,7 @@ def rename(
         Parameters
         ----------
         mapper : dict-like or function
-            Dict-like or functions transformations to apply to
+            Dict-like or function transformations to apply to
             that axis' values. Use either ``mapper`` and ``axis`` to
             specify the axis to target with ``mapper``, or ``index`` and
             ``columns``.
@@ -5205,8 +5218,9 @@ def duplicated(
         4     True
         dtype: bool
         """
+        from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64
+
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
 
         if self.empty:
             return self._constructor_sliced(dtype=bool)
@@ -5248,7 +5262,8 @@ def f(vals):
     # TODO: Just move the sort_values doc here.
     @Substitution(**_shared_doc_kwargs)
     @Appender(NDFrame.sort_values.__doc__)
-    def sort_values(  # type: ignore[override] # NOQA # issue 27237
+    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
+    def sort_values(  # type: ignore[override]
         self,
         by,
         axis=0,
@@ -7868,8 +7883,8 @@ def join(
     def _join_compat(
         self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
     ):
-        from pandas.core.reshape.merge import merge
         from pandas.core.reshape.concat import concat
+        from pandas.core.reshape.merge import merge
 
         if isinstance(other, Series):
             if other.name is None:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index e46fde1f59f16..fcb7e2a949205 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -589,9 +589,9 @@ def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries:
 
         # ignore needed because of NDFrame constructor is different than
         # DataFrame/Series constructors.
-        return self._constructor(new_values, *new_axes).__finalize__(  # type: ignore
-            self, method="swapaxes"
-        )
+        return self._constructor(
+            new_values, *new_axes  # type: ignore[arg-type]
+        ).__finalize__(self, method="swapaxes")
 
     def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries:
         """
@@ -1201,9 +1201,11 @@ def equals(self, other):
 
         This function allows two Series or DataFrames to be compared against
         each other to see if they have the same shape and elements. NaNs in
-        the same location are considered equal. The column headers do not
-        need to have the same type, but the elements within the columns must
-        be the same dtype.
+        the same location are considered equal.
+
+        The row/column indexes do not need to have the same type, as long
+        as the values are considered equal. Corresponding columns must be of
+        the same dtype.
 
         Parameters
         ----------
@@ -1232,13 +1234,6 @@ def equals(self, other):
         numpy.array_equal : Return True if two arrays have the same shape and
             elements, False otherwise.
 
-        Notes
-        -----
-        This function requires that the elements have the same dtype as their
-        respective elements in the other Series or DataFrame. However, the
-        column labels do not need to have the same type, as long as they are
-        still considered equal.
-
         Examples
         --------
         >>> df = pd.DataFrame({1: [10], 2: [20]})
@@ -1777,7 +1772,28 @@ def empty(self) -> bool_t:
     def __array__(self, dtype=None) -> np.ndarray:
         return np.asarray(self._values, dtype=dtype)
 
-    def __array_wrap__(self, result, context=None):
+    def __array_wrap__(
+        self,
+        result: np.ndarray,
+        context: Optional[Tuple[Callable, Tuple[Any, ...], int]] = None,
+    ):
+        """
+        Gets called after a ufunc and other functions.
+
+        Parameters
+        ----------
+        result : np.ndarray
+            The result of the ufunc or other function called on the NumPy
+            array returned by __array__.
+        context : tuple of (func, tuple, int)
+            This parameter is returned by ufuncs as a 3-element tuple: (name
+            of the ufunc, arguments of the ufunc, domain of the ufunc), but
+            is not set by other numpy functions.
+
+        Notes
+        -----
+        Series implements __array_ufunc__, so this is not called for ufuncs
+        on Series.
+        """
         result = lib.item_from_zerodim(result)
         if is_scalar(result):
             # e.g. we get here with np.ptp(series)
@@ -2840,6 +2856,7 @@ def to_latex(
         multirow=None,
         caption=None,
         label=None,
+        position=None,
     ):
         r"""
         Render object to a LaTeX tabular, longtable, or nested table/tabular.
@@ -2925,6 +2942,9 @@
             This is used with ``\ref{}`` in the main ``.tex`` file.
 
             .. versionadded:: 1.0.0
+        position : str, optional
+            The LaTeX positional argument for tables, to be placed after
+            ``\begin{}`` in the output.
         %(returns)s
         See Also
         --------
@@ -2986,6 +3006,7 @@
             multirow=multirow,
             caption=caption,
             label=label,
+            position=position,
         )
 
     def to_csv(
@@ -3021,13 +3042,18 @@
         ----------
         path_or_buf : str or file handle, default None
             File path or object, if None is provided the result is returned as
-            a string. If a file object is passed it should be opened with
-            `newline=''`, disabling universal newlines.
+            a string. If a non-binary file object is passed, it should be opened
+            with `newline=''`, disabling universal newlines. If a binary
+            file object is passed, `mode` needs to contain a `'b'`.
 
             .. versionchanged:: 0.24.0
 
                Was previously named "path" for Series.
 
+            .. versionchanged:: 1.2.0
+
+               Support for binary file objects was introduced.
+
         sep : str, default ','
             String of length 1. Field delimiter for the output file.
         na_rep : str, default ''
@@ -3056,7 +3082,8 @@
             Python write mode, default 'w'.
         encoding : str, optional
             A string representing the encoding to use in the output file,
-            defaults to 'utf-8'.
+            defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
+            is a non-binary file object.
         compression : str or dict, default 'infer'
             If str, represents compression mode. If dict, value at 'method' is
             the compression mode. Compression mode may be any of the following
@@ -3080,6 +3107,10 @@
             supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
 
+            .. versionchanged:: 1.2.0
+
+               Compression is supported for binary file objects.
+
         quoting : optional constant from csv module
             Defaults to csv.QUOTE_MINIMAL.
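
The binary-file-handle support documented above can be exercised as follows; a sketch
assuming this branch (1.2.0 development), where a binary handle requires ``mode`` to
contain ``'b'``::

    import io

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # Non-binary handle: open with newline="" as the docstring advises.
    with open("out.csv", "w", newline="") as f:
        df.to_csv(f)

    # Binary handle (new in 1.2.0): mode must contain "b"; to_csv then
    # applies `encoding` itself before writing.
    buf = io.BytesIO()
    df.to_csv(buf, mode="wb")
    print(buf.getvalue())
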
If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -3477,7 +3508,10 @@ class animal locomotion index = self.index if isinstance(index, MultiIndex): - loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + try: + loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + except TypeError as e: + raise TypeError(f"Expected label or tuple of labels, got {key}") from e else: loc = self.index.get_loc(key) @@ -4011,7 +4045,11 @@ def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) # type: ignore + # error: Incompatible return value type (got "Optional[FrameOrSeries]", + # expected "FrameOrSeries") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + return self.rename(**mapper) # type: ignore[return-value, arg-type] def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ @@ -4070,7 +4108,11 @@ def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) # type: ignore + # error: Incompatible return value type (got "Optional[FrameOrSeries]", + # expected "FrameOrSeries") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + return self.rename(**mapper) # type: ignore[return-value, arg-type] def sort_values( self, @@ -9397,7 +9439,7 @@ def truncate( if before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec7b14f27c5a1..53242c0332a8c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -35,11 +35,11 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( + find_common_type, maybe_cast_result, maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, - maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -513,7 +513,6 @@ def _transform_general( """ Transform with a non-str `func`. 
""" - if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_transform" @@ -535,24 +534,23 @@ def _transform_general( if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - indexer = self._get_index(name) - ser = klass(res, indexer) - results.append(ser) + results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat - result = concat(results).sort_index() + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): - result = maybe_downcast_to_dtype(result, dtype) + if is_numeric_dtype(result.dtype): + common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) + if common_dtype is result.dtype: + result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name result.index = self._selected_obj.index @@ -681,8 +679,8 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level @@ -1031,11 +1029,36 @@ def _cython_agg_blocks( agg_blocks: List[Block] = [] new_items: List[np.ndarray] = [] deleted_items: List[np.ndarray] = [] - # Some object-dtype blocks might be split into List[Block[T], Block[U]] - split_items: List[np.ndarray] = [] - split_frames: List[DataFrame] = [] no_result = object() + + def cast_result_block(result, block: "Block", how: str) -> "Block": + # see if we can cast the block to the desired dtype + # this may not be the original dtype + assert not isinstance(result, DataFrame) + assert result is not no_result + + dtype = maybe_cast_result_dtype(block.dtype, how) + result = maybe_downcast_numeric(result, dtype) + + if block.is_extension and isinstance(result, np.ndarray): + # e.g. block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + # TODO(EA2D): special casing not needed with 2D EAs + assert result.ndim == 1 or result.shape[0] == 1 + try: + # Cast back if feasible + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype + ) + except (ValueError, TypeError): + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) + + agg_block: Block = block.make_block(result) + return agg_block + for block in data.blocks: # Avoid inheriting result from earlier in the loop result = no_result @@ -1067,9 +1090,9 @@ def _cython_agg_blocks( # not try to add missing categories if grouping over multiple # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. 
See GH#34951 - s = get_groupby(obj, self.grouper, observed=True) + sgb = get_groupby(obj, self.grouper, observed=True) try: - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: # we may have an exception in trying to aggregate # continue and exclude the block @@ -1083,54 +1106,26 @@ def _cython_agg_blocks( # about a single block input returning a single block output # is a lie. To keep the code-path for the typical non-split case # clean, we choose to clean up this mess later on. - split_items.append(locs) - split_frames.append(result) - continue - - assert len(result._mgr.blocks) == 1 - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - - assert not isinstance(result, DataFrame) - - if result is not no_result: - # see if we can cast the block to the desired dtype - # this may not be the original dtype - dtype = maybe_cast_result_dtype(block.dtype, how) - result = maybe_downcast_numeric(result, dtype) - - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical - # and result is ndarray[object] - # TODO(EA2D): special casing not needed with 2D EAs - assert result.ndim == 1 or result.shape[0] == 1 - try: - # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype - ) - except (ValueError, TypeError): - # reshape to be valid for non-Extension Block - result = result.reshape(1, -1) - - agg_block: Block = block.make_block(result) - - new_items.append(locs) - agg_blocks.append(agg_block) + assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + new_items.append(np.array([loc], dtype=locs.dtype)) + agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_blocks.append(agg_block) + else: + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_items.append(locs) + agg_blocks.append(agg_block) + else: + agg_block = cast_result_block(result, block, how) + new_items.append(locs) + agg_blocks.append(agg_block) - if not (agg_blocks or split_frames): + if not agg_blocks: raise DataError("No numeric types to aggregate") - if split_items: - # Clean up the mess left over from split blocks. - for locs, result in zip(split_items, split_frames): - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - new_items.append(np.array([loc], dtype=locs.dtype)) - agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0]) - # reset the locs in the blocks to correspond to our # current ordering indexer = np.concatenate(new_items) @@ -1829,7 +1824,13 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - return self._wrap_agged_blocks(blocks, items=data.items) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _wrap_agged_blocks() returns. 
GH 35028 + with com.temp_setattr(self, "observed", True): + result = self._wrap_agged_blocks(blocks, items=data.items) + + return self._reindex_output(result, fill_value=0) def nunique(self, dropna: bool = True): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ac45222625569..4597afeeaddbf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -463,8 +463,10 @@ def _group_selection_context(groupby): Set / reset the _group_selection_context. """ groupby._set_group_selection() - yield groupby - groupby._reset_group_selection() + try: + yield groupby + finally: + groupby._reset_group_selection() _KeysArgType = Union[ @@ -734,13 +736,12 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name): assert name in self._apply_allowlist - self._set_group_selection() - - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(self._obj_with_exclusions, name) - if not isinstance(f, types.MethodType): - return self.apply(lambda self: getattr(self, name)) + with _group_selection_context(self): + # need to setup the selection + # as are not passed directly but in the grouper + f = getattr(self._obj_with_exclusions, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) f = getattr(type(self._obj_with_exclusions), name) sig = inspect.signature(f) @@ -990,28 +991,28 @@ def _agg_general( alias: str, npfunc: Callable, ): - self._set_group_selection() - - # try a cython aggregation if we can - try: - return self._cython_agg_general( - how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, - ) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes + with _group_selection_context(self): + # try a cython aggregation if we can + try: + return self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) + except DataError: pass - else: - raise - - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + except NotImplementedError as err: + if "function is not implemented for this dtype" in str( + err + ) or "category dtype not supported" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1534,9 +1535,19 @@ def size(self) -> FrameOrSeriesUnion: @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): - return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum - ) + + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. 
@@ -1928,29 +1939,31 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
             nth_values = list(set(n))

             nth_array = np.array(nth_values, dtype=np.intp)
-            self._set_group_selection()
+            with _group_selection_context(self):

-            mask_left = np.in1d(self._cumcount_array(), nth_array)
-            mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
-            mask = mask_left | mask_right
+                mask_left = np.in1d(self._cumcount_array(), nth_array)
+                mask_right = np.in1d(
+                    self._cumcount_array(ascending=False) + 1, -nth_array
+                )
+                mask = mask_left | mask_right

-            ids, _, _ = self.grouper.group_info
+                ids, _, _ = self.grouper.group_info

-            # Drop NA values in grouping
-            mask = mask & (ids != -1)
+                # Drop NA values in grouping
+                mask = mask & (ids != -1)

-            out = self._selected_obj[mask]
-            if not self.as_index:
-                return out
+                out = self._selected_obj[mask]
+                if not self.as_index:
+                    return out

-            result_index = self.grouper.result_index
-            out.index = result_index[ids[mask]]
+                result_index = self.grouper.result_index
+                out.index = result_index[ids[mask]]

-            if not self.observed and isinstance(result_index, CategoricalIndex):
-                out = out.reindex(result_index)
+                if not self.observed and isinstance(result_index, CategoricalIndex):
+                    out = out.reindex(result_index)

-            out = self._reindex_output(out)
-            return out.sort_index() if self.sort else out
+                out = self._reindex_output(out)
+                return out.sort_index() if self.sort else out

         # dropna is truthy
         if isinstance(n, valid_containers):
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 67003dffb90bb..8239a792c65dd 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -237,7 +237,6 @@ def __new__(cls, *args, **kwargs):
         #  core/groupby/grouper.py::Grouper
         # raising these warnings from TimeGrouper directly would fail the test:
         #  tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base
-
         # hacky way to set the stacklevel: if cls is TimeGrouper it means
         # that the call comes from a pandas internal call of resample,
         # otherwise it comes from pd.Grouper
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 3aaeef3b63760..64eb413fe78fa 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -50,7 +50,7 @@
 from pandas.core.sorting import (
     compress_group_index,
     decons_obs_group_ids,
-    get_flattened_iterator,
+    get_flattened_list,
     get_group_index,
     get_group_index_sorter,
     get_indexer_dict,
@@ -153,7 +153,7 @@ def _get_group_keys(self):
         comp_ids, _, ngroups = self.group_info

         # provide "flattened" iterator for multi-group setting
-        return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes)
+        return get_flattened_list(comp_ids, ngroups, self.levels, self.codes)

     def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
         mutated = self.mutated
@@ -211,7 +211,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
             # group might be modified
             group_axes = group.axes
             res = f(group)
-            if not _is_indexed_like(res, group_axes):
+            if not _is_indexed_like(res, group_axes, axis):
                 mutated = True
             result_values.append(res)

@@ -897,13 +897,13 @@ def agg_series(
         return grouper.get_result()


-def _is_indexed_like(obj, axes) -> bool:
+def _is_indexed_like(obj, axes, axis: int) -> bool:
     if isinstance(obj, Series):
         if len(axes) > 1:
             return False
-        return obj.index.equals(axes[0])
+        return obj.axes[axis].equals(axes[axis])
     elif isinstance(obj, DataFrame):
-        return obj.index.equals(axes[0])
+        return obj.axes[axis].equals(axes[axis])

     return False

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 4c5a70f4088ee..30cc8cf480dcf 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -218,9 +218,8 @@ def conv(i):
             return result
     elif kind == "array":
         index = indexes[0]
-        for other in indexes[1:]:
-            if not index.equals(other):
-                return _unique_indices(indexes)
+        if not all(index.equals(other) for other in indexes[1:]):
+            index = _unique_indices(indexes)

         name = get_consensus_names(indexes)[0]
         if name != index.name:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 986d6323e704e..ecd3670e724a1 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -58,6 +58,7 @@
     is_timedelta64_dtype,
     is_unsigned_integer_dtype,
     pandas_dtype,
+    validate_all_hashable,
 )
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.generic import (
@@ -574,7 +575,7 @@ def __array__(self, dtype=None) -> np.ndarray:

     def __array_wrap__(self, result, context=None):
         """
-        Gets called after a ufunc.
+        Gets called after a ufunc and other functions.
         """
         result = lib.item_from_zerodim(result)
         if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1:
@@ -812,13 +813,11 @@ def copy(self, name=None, deep=False, dtype=None, names=None):
         In most cases, there should be no functional difference from using
         ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
         """
+        name = self._validate_names(name=name, names=names, deep=deep)[0]
         if deep:
-            new_index = self._shallow_copy(self._data.copy())
+            new_index = self._shallow_copy(self._data.copy(), name=name)
         else:
-            new_index = self._shallow_copy()
-
-        names = self._validate_names(name=name, names=names, deep=deep)
-        new_index = new_index.set_names(names)
+            new_index = self._shallow_copy(name=name)

         if dtype:
             new_index = new_index.astype(dtype)
@@ -885,7 +884,8 @@ def _format_data(self, name=None) -> str_t:
         if self.inferred_type == "string":
             is_justify = False
         elif self.inferred_type == "categorical":
-            if is_object_dtype(self.categories):  # type: ignore
+            # error: "Index" has no attribute "categories"
+            if is_object_dtype(self.categories):  # type: ignore[attr-defined]
                 is_justify = False

         return format_object_summary(
@@ -940,7 +940,8 @@ def _format_with_header(self, header, na_rep="NaN") -> List[str_t]:
             if mask.any():
                 result = np.array(result)
                 result[mask] = na_rep
-                result = result.tolist()  # type: ignore
+                # error: "List[str]" has no attribute "tolist"
+                result = result.tolist()  # type: ignore[attr-defined]
         else:
             result = trim_front(format_array(values, None, justify="left"))
         return header + result
@@ -1184,7 +1185,7 @@ def name(self, value):
         maybe_extract_name(value, None, type(self))
         self._name = value

-    def _validate_names(self, name=None, names=None, deep: bool = False):
+    def _validate_names(self, name=None, names=None, deep: bool = False) -> List[Label]:
         """
         Handles the quirks of having a singular 'name' parameter for general
         Index and plural 'names' parameter for MultiIndex.
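A short, self-contained sketch (not part of the patch) of the validation that the ``_validate_names``/``_set_names`` hunks above and below introduce; the error messages are taken from the diff, and exact reprs may vary::

    import pandas as pd

    idx = pd.Index([1, 2, 3], name="a")

    print(idx.copy(name="b").name)     # 'b'
    print(idx.copy(names=["b"]).name)  # 'b' (list-like 'names' also accepted)

    try:
        idx.copy(names=["b", "c"])     # wrong length for a flat Index
    except ValueError as err:
        print(err)                     # Length of new names must be 1, got 2

    try:
        idx.names = [["not", "hashable"]]  # rejected by validate_all_hashable
    except TypeError as err:
        print(err)                     # Index.name must be a hashable type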
@@ -1194,15 +1195,25 @@ def _validate_names(self, name=None, names=None, deep: bool = False):
         if names is not None and name is not None:
             raise TypeError("Can only provide one of `names` and `name`")
         elif names is None and name is None:
-            return deepcopy(self.names) if deep else self.names
+            new_names = deepcopy(self.names) if deep else self.names
         elif names is not None:
             if not is_list_like(names):
                 raise TypeError("Must pass list-like as `names`.")
-            return names
+            new_names = names
+        elif not is_list_like(name):
+            new_names = [name]
         else:
-            if not is_list_like(name):
-                return [name]
-            return name
+            new_names = name
+
+        if len(new_names) != len(self.names):
+            raise ValueError(
+                f"Length of new names must be {len(self.names)}, got {len(new_names)}"
+            )
+
+        # All items in 'new_names' need to be hashable
+        validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name")
+
+        return new_names

     def _get_names(self):
         return FrozenList((self.name,))
@@ -1230,9 +1241,8 @@ def _set_names(self, values, level=None):

         # GH 20527
         # All items in 'name' need to be hashable:
-        for name in values:
-            if not is_hashable(name):
-                raise TypeError(f"{type(self).__name__}.name must be a hashable type")
+        validate_all_hashable(*values, error_name=f"{type(self).__name__}.name")
+
         self._name = values[0]

     names = property(fset=_set_names, fget=_get_names)
@@ -2375,31 +2385,10 @@ def _get_unique_index(self, dropna: bool = False):
     # --------------------------------------------------------------------
     # Arithmetic & Logical Methods

-    def __add__(self, other):
-        if isinstance(other, (ABCSeries, ABCDataFrame)):
-            return NotImplemented
-        from pandas import Series
-
-        return Index(Series(self) + other)
-
-    def __radd__(self, other):
-        from pandas import Series
-
-        return Index(other + Series(self))
-
     def __iadd__(self, other):
         # alias for __add__
         return self + other

-    def __sub__(self, other):
-        return Index(np.array(self) - other)
-
-    def __rsub__(self, other):
-        # wrap Series to ensure we pin name correctly
-        from pandas import Series
-
-        return Index(other - Series(self))
-
     def __and__(self, other):
         return self.intersection(other)

@@ -4252,16 +4241,15 @@ def equals(self, other: Any) -> bool:
         if not isinstance(other, Index):
             return False

-        if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
-            # if other is not object, use other's logic for coercion
-            return other.equals(self)
-
-        if isinstance(other, ABCMultiIndex):
-            # d-level MultiIndex can equal d-tuple Index
-            return other.equals(self)
-
-        if is_extension_array_dtype(other.dtype):
-            # All EA-backed Index subclasses override equals
+        # If other is a subclass of self and defines its own equals method, we
+        # dispatch to the subclass method. For instance for a MultiIndex,
+        # a d-level MultiIndex can equal d-tuple Index.
+        # Note: All EA-backed Index subclasses override equals
+        if (
+            isinstance(other, type(self))
+            and type(other) is not type(self)
+            and other.equals is not self.equals
+        ):
             return other.equals(self)

         return array_equivalent(self._values, other._values)
@@ -5292,38 +5280,6 @@ def _add_comparison_methods(cls):
         cls.__le__ = _make_comparison_op(operator.le, cls)
         cls.__ge__ = _make_comparison_op(operator.ge, cls)

-    @classmethod
-    def _add_numeric_methods_add_sub_disabled(cls):
-        """
-        Add in the numeric add/sub methods to disable.
- """ - cls.__add__ = make_invalid_op("__add__") - cls.__radd__ = make_invalid_op("__radd__") - cls.__iadd__ = make_invalid_op("__iadd__") - cls.__sub__ = make_invalid_op("__sub__") - cls.__rsub__ = make_invalid_op("__rsub__") - cls.__isub__ = make_invalid_op("__isub__") - - @classmethod - def _add_numeric_methods_disabled(cls): - """ - Add in numeric methods to disable other than add/sub. - """ - cls.__pow__ = make_invalid_op("__pow__") - cls.__rpow__ = make_invalid_op("__rpow__") - cls.__mul__ = make_invalid_op("__mul__") - cls.__rmul__ = make_invalid_op("__rmul__") - cls.__floordiv__ = make_invalid_op("__floordiv__") - cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") - cls.__truediv__ = make_invalid_op("__truediv__") - cls.__rtruediv__ = make_invalid_op("__rtruediv__") - cls.__mod__ = make_invalid_op("__mod__") - cls.__divmod__ = make_invalid_op("__divmod__") - cls.__neg__ = make_invalid_op("__neg__") - cls.__pos__ = make_invalid_op("__pos__") - cls.__abs__ = make_invalid_op("__abs__") - cls.__inv__ = make_invalid_op("__inv__") - @classmethod def _add_numeric_methods_binary(cls): """ @@ -5339,11 +5295,12 @@ def _add_numeric_methods_binary(cls): cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) - # TODO: rmod? rdivmod? cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__rmod__ = _make_arithmetic_op(ops.rmod, cls) cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) cls.__divmod__ = _make_arithmetic_op(divmod, cls) + cls.__rdivmod__ = _make_arithmetic_op(ops.rdivmod, cls) cls.__mul__ = _make_arithmetic_op(operator.mul, cls) cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) @@ -5503,7 +5460,7 @@ def shape(self): return self._values.shape -Index._add_numeric_methods_disabled() +Index._add_numeric_methods() Index._add_logical_methods() Index._add_comparison_methods() @@ -5731,9 +5688,9 @@ def _maybe_cast_data_without_dtype(subarr): """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, TimedeltaArray, ) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b0b008de69a94..fb283cbe02954 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -20,7 +20,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -348,12 +348,12 @@ def _format_attrs(self): return attrs def _format_with_header(self, header, na_rep="NaN") -> List[str]: - from pandas.io.formats.format import format_array + from pandas.io.formats.printing import pprint_thing - formatted_values = format_array( - self._values, formatter=None, na_rep=na_rep, justify="left" - ) - result = ibase.trim_front(formatted_values) + result = [ + pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep + for x in self._values + ] return header + result # -------------------------------------------------------------------- @@ -770,6 +770,4 @@ def _wrap_joined_index( return self._create_from_codes(joined, name=name) -CategoricalIndex._add_numeric_methods_add_sub_disabled() -CategoricalIndex._add_numeric_methods_disabled() CategoricalIndex._add_logical_methods_disabled() diff --git 
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 15a7e25238983..8ccdab21339df 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -54,7 +54,8 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True):
     Create the join wrapper methods.
     """

-    @staticmethod  # type: ignore
+    # error: 'staticmethod' used with a non-method
+    @staticmethod  # type: ignore[misc]
     def wrapper(left, right):
         if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)):
             left = left.view("i8")
@@ -95,7 +96,10 @@ class DatetimeIndexOpsMixin(ExtensionIndex):
     _bool_ops: List[str] = []
     _field_ops: List[str] = []

-    hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget)  # type: ignore
+    # error: "Callable[[Any], Any]" has no attribute "fget"
+    hasnans = cache_readonly(
+        DatetimeLikeArrayMixin._hasnans.fget  # type: ignore[attr-defined]
+    )
     _hasnans = hasnans  # for index / array -agnostic code

     @property
@@ -112,7 +116,7 @@ def values(self):

     def __array_wrap__(self, result, context=None):
         """
-        Gets called after a ufunc.
+        Gets called after a ufunc and other functions.
         """
         result = lib.item_from_zerodim(result)
         if is_bool_dtype(result) or lib.is_scalar(result):
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 6d2e592f024ed..f71fd0d406c54 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -842,7 +842,6 @@ def indexer_between_time(
         return mask.nonzero()[0]


-DatetimeIndex._add_numeric_methods_disabled()
 DatetimeIndex._add_logical_methods_disabled()

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 235da89083d0a..13927dede5542 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -50,6 +50,7 @@
 from pandas.core.indexes.frozen import FrozenList
 from pandas.core.indexes.numeric import Int64Index
 import pandas.core.missing as missing
+from pandas.core.ops.invalid import make_invalid_op
 from pandas.core.sorting import (
     get_group_index,
     indexer_from_factorized,
@@ -739,7 +740,7 @@ def _set_levels(
         self._tuples = None
         self._reset_cache()

-    def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
+    def set_levels(self, levels, level=None, inplace=None, verify_integrity=True):
         """
         Set new levels on MultiIndex. Defaults to returning new index.

@@ -751,6 +752,8 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
             Level(s) to set (None for all levels).
         inplace : bool
             If True, mutates in place.
+
+            .. deprecated:: 1.2.0
         verify_integrity : bool, default True
             If True, checks that levels and codes are compatible.

@@ -821,6 +824,15 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
         >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels
         FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]])
         """
+        if inplace is not None:
+            warnings.warn(
+                "inplace is deprecated and will be removed in a future version.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        else:
+            inplace = False
+
         if is_list_like(levels) and not isinstance(levels, Index):
             levels = list(levels)

@@ -897,7 +909,7 @@ def _set_codes(
         self._tuples = None
         self._reset_cache()

-    def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
+    def set_codes(self, codes, level=None, inplace=None, verify_integrity=True):
         """
         Set new codes on MultiIndex. Defaults to returning new index.
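A minimal sketch (not part of the patch) of the deprecation added in the ``set_levels`` hunk above and the ``set_codes`` hunk below; the warning text is taken verbatim from the diff::

    import warnings
    import pandas as pd

    mi = pd.MultiIndex.from_tuples(
        [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"]
    )

    # Preferred: leave 'inplace' unset and rebind the result.
    mi2 = mi.set_levels(["a", "b"], level=0)

    # Passing inplace at all (True or False) now emits a FutureWarning,
    # since the default sentinel changed from False to None.
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        mi.set_levels(["a", "b"], level=0, inplace=False)
        print(w[-1].category.__name__)  # FutureWarning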
@@ -913,6 +925,8 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
             Level(s) to set (None for all levels).
         inplace : bool
             If True, mutates in place.
+
+            .. deprecated:: 1.2.0
         verify_integrity : bool (default True)
             If True, checks that levels and codes are compatible.

@@ -957,6 +971,15 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
                     (1, 'two')],
                    names=['foo', 'bar'])
         """
+        if inplace is not None:
+            warnings.warn(
+                "inplace is deprecated and will be removed in a future version.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        else:
+            inplace = False
+
         if level is not None and not is_list_like(level):
             if not is_list_like(codes):
                 raise TypeError("Codes must be list-like")
@@ -3606,6 +3629,40 @@ def isin(self, values, level=None):
             return np.zeros(len(levs), dtype=np.bool_)
         return levs.isin(values)

+    @classmethod
+    def _add_numeric_methods_add_sub_disabled(cls):
+        """
+        Add in the numeric add/sub methods to disable.
+        """
+        cls.__add__ = make_invalid_op("__add__")
+        cls.__radd__ = make_invalid_op("__radd__")
+        cls.__iadd__ = make_invalid_op("__iadd__")
+        cls.__sub__ = make_invalid_op("__sub__")
+        cls.__rsub__ = make_invalid_op("__rsub__")
+        cls.__isub__ = make_invalid_op("__isub__")
+
+    @classmethod
+    def _add_numeric_methods_disabled(cls):
+        """
+        Add in numeric methods to disable other than add/sub.
+        """
+        cls.__pow__ = make_invalid_op("__pow__")
+        cls.__rpow__ = make_invalid_op("__rpow__")
+        cls.__mul__ = make_invalid_op("__mul__")
+        cls.__rmul__ = make_invalid_op("__rmul__")
+        cls.__floordiv__ = make_invalid_op("__floordiv__")
+        cls.__rfloordiv__ = make_invalid_op("__rfloordiv__")
+        cls.__truediv__ = make_invalid_op("__truediv__")
+        cls.__rtruediv__ = make_invalid_op("__rtruediv__")
+        cls.__mod__ = make_invalid_op("__mod__")
+        cls.__rmod__ = make_invalid_op("__rmod__")
+        cls.__divmod__ = make_invalid_op("__divmod__")
+        cls.__rdivmod__ = make_invalid_op("__rdivmod__")
+        cls.__neg__ = make_invalid_op("__neg__")
+        cls.__pos__ = make_invalid_op("__pos__")
+        cls.__abs__ = make_invalid_op("__abs__")
+        cls.__inv__ = make_invalid_op("__inv__")
+

 MultiIndex._add_numeric_methods_disabled()
 MultiIndex._add_numeric_methods_add_sub_disabled()
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 03e11b652477f..11334803d4583 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -345,10 +345,13 @@ def _int64index(self) -> Int64Index:

     def __array_wrap__(self, result, context=None):
         """
-        Gets called after a ufunc. Needs additional handling as
-        PeriodIndex stores internal data as int dtype
+        Gets called after a ufunc and other functions.
-        Replace this to __numpy_ufunc__ in future version
+        Needs additional handling as PeriodIndex stores internal data as int
+        dtype
+
+        Replace this with __numpy_ufunc__ in a future version and implement
+        __array_function__ for Indexes
         """
         if isinstance(context, tuple) and len(context) > 0:
             func = context[0]
@@ -724,7 +727,6 @@ def memory_usage(self, deep=False):
         return result


-PeriodIndex._add_numeric_methods_disabled()
 PeriodIndex._add_logical_methods_disabled()

diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index e5e98039ff77b..3577a7aacc008 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -1,7 +1,7 @@
 from datetime import timedelta
 import operator
 from sys import getsizeof
-from typing import Any, List, Optional
+from typing import Any
 import warnings

 import numpy as np
@@ -33,8 +33,6 @@
 from pandas.core.indexes.numeric import Int64Index
 from pandas.core.ops.common import unpack_zerodim_and_defer

-from pandas.io.formats.printing import pprint_thing
-
 _empty_range = range(0)

@@ -80,8 +78,6 @@ class RangeIndex(Int64Index):
     _engine_type = libindex.Int64Engine
     _range: range

-    # check whether self._data has been called
-    _cached_data: Optional[np.ndarray] = None
-
     # --------------------------------------------------------------------
     # Constructors

@@ -152,20 +148,14 @@ def _constructor(self):
         """ return the class to use for construction """
         return Int64Index

-    @property
+    @cache_readonly
     def _data(self):
         """
         An int array that for performance reasons is created only when needed.

-        The constructed array is saved in ``_cached_data``. This allows us to
-        check if the array has been created without accessing ``_data`` and
-        triggering the construction.
+        The constructed array is saved in ``_cache``.
         """
-        if self._cached_data is None:
-            self._cached_data = np.arange(
-                self.start, self.stop, self.step, dtype=np.int64
-            )
-        return self._cached_data
+        return np.arange(self.start, self.stop, self.step, dtype=np.int64)

     @cache_readonly
     def _int64index(self) -> Int64Index:
@@ -197,9 +187,6 @@ def _format_data(self, name=None):
         # we are formatting thru the attributes
         return None

-    def _format_with_header(self, header, na_rep="NaN") -> List[str]:
-        return header + [pprint_thing(x) for x in self._range]
-
     # --------------------------------------------------------------------
     _deprecation_message = (
         "RangeIndex.{} is deprecated and will be "
@@ -398,11 +385,12 @@ def _shallow_copy(self, values=None, name: Label = no_default):
         return Int64Index._simple_new(values, name=name)

     @doc(Int64Index.copy)
-    def copy(self, name=None, deep=False, dtype=None, **kwargs):
+    def copy(self, name=None, deep=False, dtype=None, names=None):
         self._validate_dtype(dtype)
-        if name is None:
-            name = self.name
-        return self.from_range(self._range, name=name)
+
+        name = self._validate_names(name=name, names=names, deep=deep)[0]
+        new_index = self._shallow_copy(name=name)
+        return new_index

     def _minmax(self, meth: str):
         no_steps = len(self) - 1
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 04d1dbceb3342..dd81823055390 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -255,6 +255,8 @@ def loc(self) -> "_LocIndexer":
         - A boolean array of the same length as the axis being sliced,
           e.g. ``[True, False, True]``.
+        - An alignable boolean Series. The index of the key will be aligned before
+          masking.
         - A ``callable`` function with one argument (the calling Series or
           DataFrame) and that returns valid output for indexing (one of the above)

@@ -264,6 +266,8 @@ def loc(self) -> "_LocIndexer":
         ------
         KeyError
             If any items are not found.
+        IndexingError
+            If an indexed key is passed and its index is unalignable to the frame index.

         See Also
         --------
@@ -319,6 +323,13 @@ def loc(self) -> "_LocIndexer":
                     max_speed  shield
         sidewinder          7       8

+        Alignable boolean Series:
+
+        >>> df.loc[pd.Series([False, True, False],
+        ...                  index=['viper', 'sidewinder', 'cobra'])]
+                    max_speed  shield
+        sidewinder          7       8
+
         Conditional that returns a boolean Series

         >>> df.loc[df['shield'] > 6]
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 6ca6eca1ff829..f3286b3c20965 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -105,7 +105,6 @@ class Block(PandasObject):
     is_extension = False
     _can_hold_na = False
     _can_consolidate = True
-    _verify_integrity = True
     _validate_ndim = True

     @classmethod
@@ -1525,7 +1524,6 @@ class ExtensionBlock(Block):
     """

     _can_consolidate = False
-    _verify_integrity = False
     _validate_ndim = False
     is_extension = True

@@ -2613,7 +2611,6 @@ def _replace_coerce(
 class CategoricalBlock(ExtensionBlock):
     __slots__ = ()
     is_categorical = True
-    _verify_integrity = True
     _can_hold_na = True

     should_store = Block.should_store
@@ -2744,7 +2741,8 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
         # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023
         # block.shape is incorrect for "2D" ExtensionArrays
         # We can't, and don't need to, reshape.
-        values = values.reshape(tuple((1,) + shape))  # type: ignore
+        # error: "ExtensionArray" has no attribute "reshape"
+        values = values.reshape(tuple((1,) + shape))  # type: ignore[attr-defined]
     return values

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 895385b170c91..aa74d173d69b3 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -28,10 +28,9 @@
 from pandas.core.dtypes.concat import concat_compat
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
-from pandas.core.dtypes.missing import array_equivalent, isna
+from pandas.core.dtypes.missing import array_equals, isna

 import pandas.core.algorithms as algos
-from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse import SparseDtype
 from pandas.core.base import PandasObject
 import pandas.core.common as com
@@ -49,7 +48,7 @@
     get_block_type,
     make_block,
 )
-from pandas.core.internals.ops import operate_blockwise
+from pandas.core.internals.ops import blockwise_all, operate_blockwise

 # TODO: flexible with index=None and/or items=None

@@ -312,7 +311,7 @@ def _verify_integrity(self) -> None:
         mgr_shape = self.shape
         tot_items = sum(len(x.mgr_locs) for x in self.blocks)
         for block in self.blocks:
-            if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
+            if block.shape[1:] != mgr_shape[1:]:
                 raise construction_error(tot_items, block.shape[1:], self.axes)
         if len(self.items) != tot_items:
             raise AssertionError(
@@ -551,6 +550,24 @@ def interpolate(self, **kwargs) -> "BlockManager":
         return self.apply("interpolate", **kwargs)

     def shift(self, periods: int, axis: int, fill_value) -> "BlockManager":
+        if axis == 0 and self.ndim == 2 and self.nblocks > 1:
+            # GH#35488 we need to watch out for multi-block cases
+            ncols = self.shape[0]
+            if periods > 0:
+                indexer = [-1] * periods + list(range(ncols - periods))
+            else:
+                nper = abs(periods)
+                indexer = list(range(nper, ncols)) + [-1] * nper
+            result = self.reindex_indexer(
+                self.items,
+                indexer,
+                axis=0,
+                fill_value=fill_value,
+                allow_dups=True,
+                consolidate=False,
+            )
+            return result
+
         return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)

     def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager":
@@ -849,6 +866,8 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
             dtype = dtype.subtype
         elif is_extension_array_dtype(dtype):
             dtype = "object"
+        elif is_dtype_equal(dtype, str):
+            dtype = "object"

         result = np.empty(self.shape, dtype=dtype)

@@ -1213,6 +1232,7 @@ def reindex_indexer(
         fill_value=None,
         allow_dups: bool = False,
         copy: bool = True,
+        consolidate: bool = True,
     ) -> T:
         """
         Parameters
@@ -1223,7 +1243,8 @@ def reindex_indexer(
         fill_value : object, default None
         allow_dups : bool, default False
         copy : bool, default True
-
+        consolidate : bool, default True
+            Whether to consolidate inplace before reindexing.
         pandas-indexer with -1's only.
         """
@@ -1236,7 +1257,8 @@ def reindex_indexer(
             result.axes[axis] = new_axis
             return result

-        self._consolidate_inplace()
+        if consolidate:
+            self._consolidate_inplace()

         # some axes don't allow reindexing with dups
         if not allow_dups:
@@ -1428,26 +1450,9 @@ def equals(self, other: "BlockManager") -> bool:
             return False
         left = self.blocks[0].values
         right = other.blocks[0].values
-        if not is_dtype_equal(left.dtype, right.dtype):
-            return False
-        elif isinstance(left, ExtensionArray):
-            return left.equals(right)
-        else:
-            return array_equivalent(left, right)
+        return array_equals(left, right)

-        for i in range(len(self.items)):
-            # Check column-wise, return False if any column doesn't match
-            left = self.iget_values(i)
-            right = other.iget_values(i)
-            if not is_dtype_equal(left.dtype, right.dtype):
-                return False
-            elif isinstance(left, ExtensionArray):
-                if not left.equals(right):
-                    return False
-            else:
-                if not array_equivalent(left, right, dtype_equal=True):
-                    return False
-        return True
+        return blockwise_all(self, other, array_equals)

     def unstack(self, unstacker, fill_value) -> "BlockManager":
         """
diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py
index fd9a9a5ef6c93..ae4892c720d5b 100644
--- a/pandas/core/internals/ops.py
+++ b/pandas/core/internals/ops.py
@@ -1,21 +1,26 @@
-from typing import TYPE_CHECKING, List, Tuple
+from collections import namedtuple
+from typing import TYPE_CHECKING, Iterator, List, Tuple

 import numpy as np

 from pandas._typing import ArrayLike

 if TYPE_CHECKING:
-    from pandas.core.internals.managers import BlockManager  # noqa:F401
     from pandas.core.internals.blocks import Block  # noqa:F401
+    from pandas.core.internals.managers import BlockManager  # noqa:F401


-def operate_blockwise(
-    left: "BlockManager", right: "BlockManager", array_op
-) -> "BlockManager":
+BlockPairInfo = namedtuple(
+    "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"],
+)
+
+
+def _iter_block_pairs(
+    left: "BlockManager", right: "BlockManager"
+) -> Iterator[BlockPairInfo]:
     # At this point we have already checked the parent DataFrames for
     #  assert rframe._indexed_same(lframe)

-    res_blks: List["Block"] = []
     for n, blk in enumerate(left.blocks):
         locs = blk.mgr_locs
         blk_vals = blk.values
@@ -34,21 +39,32 @@
             right_ea = not isinstance(rblk.values, np.ndarray)

             lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
+            info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
+            yield info

-            res_values = array_op(lvals, rvals)
-            if left_ea and not right_ea and hasattr(res_values, "reshape"):
-                res_values = res_values.reshape(1, -1)
-            nbs = rblk._split_op_result(res_values)

-            # Assertions are disabled for performance, but should hold:
-            # if right_ea or left_ea:
-            #    assert len(nbs) == 1
-            # else:
-            #    assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
+def operate_blockwise(
+    left: "BlockManager", right: "BlockManager", array_op
+) -> "BlockManager":
+    # At this point we have already checked the parent DataFrames for
+    #  assert rframe._indexed_same(lframe)
+
+    res_blks: List["Block"] = []
+    for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
+        res_values = array_op(lvals, rvals)
+        if left_ea and not right_ea and hasattr(res_values, "reshape"):
+            res_values = res_values.reshape(1, -1)
+        nbs = rblk._split_op_result(res_values)
+
+        # Assertions are disabled for performance, but should hold:
+        # if right_ea or left_ea:
+        #    assert len(nbs) == 1
+        # else:
+        #    assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)

-            _reset_block_mgr_locs(nbs, locs)
+        _reset_block_mgr_locs(nbs, locs)

-            res_blks.extend(nbs)
+        res_blks.extend(nbs)

     # Assertions are disabled for performance, but should hold:
     #  slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
@@ -85,7 +101,7 @@ def _get_same_shape_values(
     # Require that the indexing into lvals be slice-like
     assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs

-    # TODO(EA2D): with 2D EAs pnly this first clause would be needed
+    # TODO(EA2D): with 2D EAs only this first clause would be needed
     if not (left_ea or right_ea):
         lvals = lvals[rblk.mgr_locs.indexer, :]
         assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
@@ -102,3 +118,14 @@ def _get_same_shape_values(
         rvals = rvals[0, :]

     return lvals, rvals
+
+
+def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool:
+    """
+    Blockwise `all` reduction.
+ """ + for info in _iter_block_pairs(left, right): + res = op(info.lvals, info.rvals) + if not res: + return False + return True diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 3379ee56b6ad0..aab10cea33632 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -349,7 +349,8 @@ def fill_bool(x, left=None): filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool res_values = na_logical_op(lvalues, rvalues, op) - res_values = filler(res_values) # type: ignore + # error: Cannot call function of unknown type + res_values = filler(res_values) # type: ignore[operator] return res_values diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bfdfc65723433..e82a1d4d2cda8 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -967,7 +967,7 @@ def __init__(self, obj, *args, **kwargs): setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) # error: Too many arguments for "__init__" of "object" - super().__init__(None) # type: ignore + super().__init__(None) # type: ignore[call-arg] self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/pandas/core/series.py b/pandas/core/series.py index ef3be854bc3bb..93368ea1e515f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,6 +54,7 @@ is_list_like, is_object_dtype, is_scalar, + validate_all_hashable, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -491,8 +492,7 @@ def name(self) -> Label: @name.setter def name(self, value: Label) -> None: - if not is_hashable(value): - raise TypeError("Series.name must be a hashable type") + validate_all_hashable(value, error_name=f"{type(self).__name__}.name") object.__setattr__(self, "_name", value) @property @@ -571,7 +571,8 @@ def _values(self): """ return self._mgr.internal_values() - @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore + # error: Decorated property not supported + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def array(self) -> ExtensionArray: return self._mgr._block.array_values() @@ -4921,7 +4922,10 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_index = self.index.to_timestamp(freq=freq, how=how) # type: ignore + # error: "PeriodIndex" has no attribute "to_timestamp" + new_index = self.index.to_timestamp( # type: ignore[attr-defined] + freq=freq, how=how + ) return self._constructor(new_values, index=new_index).__finalize__( self, method="to_timestamp" ) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ee73aa42701b0..8bdd466ae6f33 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,5 +1,6 @@ """ miscellaneous sorting / groupby utilities """ -from typing import Callable, Optional +from collections import defaultdict +from typing import TYPE_CHECKING, Callable, DefaultDict, Iterable, List, Optional, Tuple import numpy as np @@ -18,6 +19,9 @@ import pandas.core.algorithms as algorithms from pandas.core.construction import extract_array +if TYPE_CHECKING: + from pandas.core.indexes.base import Index # noqa:F401 + _INT64_MAX = np.iinfo(np.int64).max @@ -409,7 +413,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): levels : Optional[List], if values is a MultiIndex, list of levels to apply the key to. 
""" - from pandas.core.indexes.api import Index + from pandas.core.indexes.api import Index # noqa:F811 if not key: return values @@ -440,36 +444,21 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -class _KeyMapper: - """ - Map compressed group id -> key tuple. - """ - - def __init__(self, comp_ids, ngroups: int, levels, labels): - self.levels = levels - self.labels = labels - self.comp_ids = comp_ids.astype(np.int64) - - self.k = len(labels) - self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] - - self._populate_tables() - - def _populate_tables(self): - for labs, table in zip(self.labels, self.tables): - table.map(self.comp_ids, labs.astype(np.int64)) - - def get_key(self, comp_id): - return tuple( - level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels) - ) - - -def get_flattened_iterator(comp_ids, ngroups, levels, labels): - # provide "flattened" iterator for multi-group setting - mapper = _KeyMapper(comp_ids, ngroups, levels, labels) - return [mapper.get_key(i) for i in range(ngroups)] +def get_flattened_list( + comp_ids: np.ndarray, + ngroups: int, + levels: Iterable["Index"], + labels: Iterable[np.ndarray], +) -> List[Tuple]: + """Map compressed group id -> key tuple.""" + comp_ids = comp_ids.astype(np.int64, copy=False) + arrays: DefaultDict[int, List[int]] = defaultdict(list) + for labs, level in zip(labels, levels): + table = hashtable.Int64HashTable(ngroups) + table.map(comp_ids, labs.astype(np.int64, copy=False)) + for i in range(ngroups): + arrays[i].append(level[table.get_item(i)]) + return [tuple(array) for array in arrays.values()] def get_indexer_dict(label_list, keys): diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..6702bf519c52e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -155,7 +155,7 @@ def _map_stringarray( an ndarray. """ - from pandas.arrays import IntegerArray, StringArray, BooleanArray + from pandas.arrays import BooleanArray, IntegerArray, StringArray mask = isna(arr) @@ -2186,7 +2186,7 @@ def _wrap_result( returns_string=True, ): - from pandas import Index, Series, MultiIndex + from pandas import Index, MultiIndex, Series # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -2292,7 +2292,7 @@ def _get_series_list(self, others): list of Series Others transformed into list of Series. 
""" - from pandas import Series, DataFrame + from pandas import DataFrame, Series # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0adab143f6052..3c1fe6bacefcf 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -53,9 +53,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 from pandas._libs.tslibs.nattype import NaTType # noqa:F401 + from pandas import Series # noqa:F401 + # --------------------------------------------------------------------- # types used in annotations @@ -308,7 +309,7 @@ def _convert_listlike_datetimes( if tz == "utc": # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has # no attribute "tz_convert" - arg = arg.tz_convert(None).tz_localize(tz) # type: ignore + arg = arg.tz_convert(None).tz_localize(tz) # type: ignore[union-attr] return arg elif is_datetime64_ns_dtype(arg_dtype): @@ -876,7 +877,7 @@ def _assemble_from_unit_mappings(arg, errors, tz): ------- Series """ - from pandas import to_timedelta, to_numeric, DataFrame + from pandas import DataFrame, to_numeric, to_timedelta arg = DataFrame(arg) if not arg.columns.is_unique: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1b56b6d5a46fa..d79b9f4092325 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -275,7 +275,7 @@ def hash_array( # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: - from pandas import factorize, Categorical, Index + from pandas import Categorical, Index, factorize codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 58e7841d4dde5..51a067427e867 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -52,7 +52,7 @@ def __init__(self, obj, *args, **kwargs): kwargs.pop("parent", None) groupby = kwargs.pop("groupby", None) if groupby is None: - groupby, obj = obj, obj.obj + groupby, obj = obj, obj._selected_obj self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 7a2d8e84bec76..c57c434dd3040 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -12,9 +12,7 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.common import is_datetime64_ns_dtype -from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.base import DataError import pandas.core.common as common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt from pandas.core.window.rolling import _flex_binary_moment, _Rolling @@ -302,30 +300,13 @@ def _apply(self, func): ------- y : same type as input argument """ - blocks, obj = self._create_blocks(self._selected_obj) - block_list = list(blocks) - - results = [] - exclude = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError) as err: - if isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") from err + def homogeneous_func(values: np.ndarray): if values.size == 
-                results.append(values.copy())
-                continue
+                return values.copy()
+            return np.apply_along_axis(func, self.axis, values)

-            results.append(np.apply_along_axis(func, self.axis, values))
-
-        return self._wrap_results(results, block_list, obj, exclude)
+        return self._apply_blockwise(homogeneous_func)

     @Substitution(name="ewm", func_name="mean")
     @Appender(_doc_template)
diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py
index 0898836ed2e0e..bc36bdca982e8 100644
--- a/pandas/core/window/indexers.py
+++ b/pandas/core/window/indexers.py
@@ -319,4 +319,10 @@ def get_window_bounds(
             end_arrays.append(window_indicies.take(end))
         start = np.concatenate(start_arrays)
         end = np.concatenate(end_arrays)
+        # GH 35552: Need to adjust start and end based on the nans appended to values
+        # when center=True
+        if num_values > len(start):
+            offset = num_values - len(start)
+            start = np.concatenate([start, np.array([end[-1]] * offset)])
+            end = np.concatenate([end, np.array([end[-1]] * offset)])
         return start, end
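A minimal usage sketch (not part of the patch) of the grouped-rolling case that the GH 35552 fix above targets: with ``center=True`` the adjusted start/end arrays could previously come back shorter than the input, corrupting the result. Assuming a build with this patch::

    import pandas as pd

    df = pd.DataFrame({"g": ["a"] * 5, "x": range(5)})

    # Each group's window bounds are now padded back out to num_values,
    # so the centered result aligns with the original rows.
    result = df.groupby("g")["x"].rolling(3, center=True).mean()
    print(result)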
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 445f179248226..7347d5686aabc 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -12,7 +12,7 @@
 from pandas._libs.tslibs import BaseOffset, to_offset
 import pandas._libs.window.aggregations as window_aggregations
-from pandas._typing import Axis, FrameOrSeries, Scalar
+from pandas._typing import ArrayLike, Axis, FrameOrSeries, Scalar
 from pandas.compat._optional import import_optional_dependency
 from pandas.compat.numpy import function as nv
 from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
@@ -487,6 +487,38 @@ def _get_window_indexer(self, window: int) -> BaseIndexer:
             return VariableWindowIndexer(index_array=self._on.asi8, window_size=window)
         return FixedWindowIndexer(window_size=window)

+    def _apply_blockwise(
+        self, homogeneous_func: Callable[..., ArrayLike]
+    ) -> FrameOrSeries:
+        """
+        Apply the given function to the DataFrame broken down into homogeneous
+        sub-frames.
+        """
+        # This isn't quite blockwise, since `blocks` is actually a collection
+        # of homogeneous DataFrames.
+        blocks, obj = self._create_blocks(self._selected_obj)
+
+        skipped: List[int] = []
+        results: List[ArrayLike] = []
+        exclude: List[Scalar] = []
+        for i, b in enumerate(blocks):
+            try:
+                values = self._prep_values(b.values)
+
+            except (TypeError, NotImplementedError) as err:
+                if isinstance(obj, ABCDataFrame):
+                    skipped.append(i)
+                    exclude.extend(b.columns)
+                    continue
+                else:
+                    raise DataError("No numeric types to aggregate") from err
+
+            result = homogeneous_func(values)
+            results.append(result)
+
+        block_list = [blk for i, blk in enumerate(blocks) if i not in skipped]
+        return self._wrap_results(results, block_list, obj, exclude)
+
     def _apply(
         self,
         func: Callable,
@@ -524,30 +556,14 @@ def _apply(
         """
         win_type = self._get_win_type(kwargs)
         window = self._get_window(win_type=win_type)
-
-        blocks, obj = self._create_blocks(self._selected_obj)
-        block_list = list(blocks)
         window_indexer = self._get_window_indexer(window)

-        results = []
-        exclude: List[Scalar] = []
-        for i, b in enumerate(blocks):
-            try:
-                values = self._prep_values(b.values)
-
-            except (TypeError, NotImplementedError) as err:
-                if isinstance(obj, ABCDataFrame):
-                    exclude.extend(b.columns)
-                    del block_list[i]
-                    continue
-                else:
-                    raise DataError("No numeric types to aggregate") from err
+        def homogeneous_func(values: np.ndarray):
+            # calculation function

             if values.size == 0:
-                results.append(values.copy())
-                continue
+                return values.copy()

-            # calculation function
             offset = calculate_center_offset(window) if center else 0
             additional_nans = np.array([np.nan] * offset)
@@ -594,9 +610,9 @@ def calc(x):
             if center:
                 result = self._center_window(result, window)

-            results.append(result)
+            return result

-        return self._wrap_results(results, block_list, obj, exclude)
+        return self._apply_blockwise(homogeneous_func)

     def aggregate(self, func, *args, **kwargs):
         result, how = self._aggregate(func, *args, **kwargs)
@@ -2196,7 +2212,7 @@ def _apply(
         # Cannot use _wrap_outputs because we calculate the result all at once
         # Compose MultiIndex result from grouping levels then rolling level
         # Aggregate the MultiIndex data as tuples then the level names
-        grouped_object_index = self._groupby._selected_obj.index
+        grouped_object_index = self.obj.index
         grouped_index_name = [grouped_object_index.name]
         groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings]
         result_index_names = groupby_keys + grouped_index_name
@@ -2258,7 +2274,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer:
         rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]]
         if self.is_freq_type:
             rolling_indexer = VariableWindowIndexer
-            index_array = self._groupby._selected_obj.index.asi8
+            index_array = self.obj.index.asi8
         else:
             rolling_indexer = FixedWindowIndexer
             index_array = None
@@ -2275,7 +2291,7 @@ def _gotitem(self, key, ndim, subset=None):
         # here so our index is carried thru to the selected obj
         # when we do the splitting for the groupby
         if self.on is not None:
-            self._groupby.obj = self._groupby.obj.set_index(self._on)
+            self.obj = self.obj.set_index(self._on)
             self.on = None
         return super()._gotitem(key, ndim, subset=subset)

diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py
index 40bff5a75709b..d16955a98b62f 100644
--- a/pandas/io/clipboard/__init__.py
+++ b/pandas/io/clipboard/__init__.py
@@ -311,17 +311,17 @@ def init_windows_clipboard():
     global HGLOBAL, LPVOID, DWORD, LPCSTR, INT
     global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE
     from ctypes.wintypes import (
-        HGLOBAL,
-        LPVOID,
+        BOOL,
         DWORD,
-        LPCSTR,
-        INT,
-        HWND,
+        HANDLE,
+        HGLOBAL,
         HINSTANCE,
         HMENU,
-        BOOL,
+        HWND,
+        INT,
+        LPCSTR,
+        LPVOID,
         UINT,
-        HANDLE,
     )

     windll = ctypes.windll
@@ -528,8 +528,8 @@ def determine_clipboard():
     # Setup for the MAC OS X platform:
     if os.name == "mac" or platform.system() == "Darwin":
         try:
-            import Foundation  # check if pyobjc is installed
             import AppKit
+            import Foundation  # check if pyobjc is installed
         except ImportError:
             return init_osx_pbcopy_clipboard()
         else:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 6ac8051f35b6f..34e4425c657f1 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -121,7 +121,15 @@ def stringify_path(
     """
     if hasattr(filepath_or_buffer, "__fspath__"):
         # https://github.com/python/mypy/issues/1424
-        return filepath_or_buffer.__fspath__()  # type: ignore
+        # error: Item "str" of "Union[str, Path, IO[str]]" has no attribute
+        # "__fspath__"  [union-attr]
+        # error: Item "IO[str]" of "Union[str, Path, IO[str]]" has no attribute
+        # "__fspath__"  [union-attr]
+        # error: Item "str" of "Union[str, Path, IO[bytes]]" has no attribute
+        # "__fspath__"  [union-attr]
+        # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no
+        # attribute "__fspath__"  [union-attr]
+        return filepath_or_buffer.__fspath__()  # type: ignore[union-attr]
     elif isinstance(filepath_or_buffer, pathlib.Path):
         return str(filepath_or_buffer)
     return _expand_user(filepath_or_buffer)
@@ -399,8 +407,9 @@ def get_handle(
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
-        whether file/buffer is in text format (csv, json, etc.), or in binary
-        mode (pickle, etc.).
+        Whether the type of the content passed to the file/buffer is string or
+        bytes. This is not the same as `"b" not in mode`. If a string content is
+        passed to a binary file/buffer, a wrapper is inserted.
     errors : str, default 'strict'
         Specifies how encoding and decoding errors are to be handled.
         See the errors argument for :func:`open` for a full list
@@ -441,14 +450,14 @@ def get_handle(
         if is_path:
            f = gzip.open(path_or_buf, mode, **compression_args)
         else:
-            f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
+            f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)

     # BZ Compression
     elif compression == "bz2":
         if is_path:
             f = bz2.BZ2File(path_or_buf, mode, **compression_args)
         else:
-            f = bz2.BZ2File(path_or_buf, **compression_args)
+            f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

     # ZIP Compression
     elif compression == "zip":
@@ -481,10 +490,14 @@ def get_handle(
             handles.append(f)

     elif is_path:
-        if encoding:
+        # Check whether the filename is to be opened in binary mode.
+        # Binary mode does not support 'encoding' and 'newline'.
+        is_binary_mode = "b" in mode
+
+        if encoding and not is_binary_mode:
             # Encoding
             f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
-        elif is_text:
+        elif is_text and not is_binary_mode:
             # No explicit encoding
             f = open(path_or_buf, mode, errors="replace", newline="")
         else:
@@ -516,7 +529,19 @@ def get_handle(
     return f, handles


-class _BytesZipFile(zipfile.ZipFile, BytesIO):  # type: ignore
+# error: Definition of "__exit__" in base class "ZipFile" is incompatible with
+# definition in base class "BytesIO"  [misc]
+# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
+# definition in base class "BytesIO"  [misc]
+# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
+# definition in base class "BinaryIO"  [misc]
+# error: Definition of "__enter__" in base class "ZipFile" is incompatible with
+# definition in base class "IO"  [misc]
+# error: Definition of "read" in base class "ZipFile" is incompatible with
+# definition in base class "BytesIO"  [misc]
+# error: Definition of "read" in base class "ZipFile" is incompatible with
+# definition in base class "IO"  [misc]
class _BytesZipFile(zipfile.ZipFile, BytesIO):  # type: ignore[misc]
     """
     Wrapper for standard library class ZipFile and allow the returned file-like
     handle to accept byte strings via `write` method.
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 2a12f779230b2..b1bbda4a4b7e0 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -834,8 +834,8 @@ class ExcelFile:
     from pandas.io.excel._odfreader import _ODFReader
     from pandas.io.excel._openpyxl import _OpenpyxlReader
-    from pandas.io.excel._xlrd import _XlrdReader
     from pandas.io.excel._pyxlsb import _PyxlsbReader
+    from pandas.io.excel._xlrd import _XlrdReader

     _engines = {
         "xlrd": _XlrdReader,
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index 85ec9afaaec25..44abaf5d3b3c9 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -191,9 +191,9 @@ def _get_cell_string_value(self, cell) -> str:
         Find and decode OpenDocument text:s tags that represent
         a run length encoded sequence of space characters.
""" - from odf.element import Text, Element - from odf.text import S, P + from odf.element import Element, Text from odf.namespaces import TEXTNS + from odf.text import P, S text_p = P().qname text_s = S().qname diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..03a30cbd62f9a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -225,7 +225,7 @@ def _convert_to_fill(cls, fill_dict): ------- fill : openpyxl.styles.Fill """ - from openpyxl.styles import PatternFill, GradientFill + from openpyxl.styles import GradientFill, PatternFill _pattern_fill_key_map = { "patternType": "fill_type", diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..af82c15fd6b66 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -48,11 +48,11 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): from xlrd import ( - xldate, + XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_ERROR, - XL_CELL_BOOLEAN, XL_CELL_NUMBER, + xldate, ) epoch1904 = self.book.datemode diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5bd51dc8351f6..b10946a20d041 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,11 +3,10 @@ """ import csv as csvlib -from io import StringIO +from io import StringIO, TextIOWrapper import os from typing import Hashable, List, Mapping, Optional, Sequence, Union import warnings -from zipfile import ZipFile import numpy as np @@ -159,38 +158,29 @@ def save(self) -> None: """ Create the writer & save. """ - # GH21227 internal compression is not used when file-like passed. - if self.compression and hasattr(self.path_or_buf, "write"): + # GH21227 internal compression is not used for non-binary handles. + if ( + self.compression + and hasattr(self.path_or_buf, "write") + and "b" not in self.mode + ): warnings.warn( - "compression has no effect when passing file-like object as input.", + "compression has no effect when passing a non-binary object as input.", RuntimeWarning, stacklevel=2, ) - - # when zip compression is called. - is_zip = isinstance(self.path_or_buf, ZipFile) or ( - not hasattr(self.path_or_buf, "write") and self.compression == "zip" + self.compression = None + + # get a handle or wrap an existing handle to take care of 1) compression and + # 2) text -> byte conversion + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=dict(self.compression_args, method=self.compression), ) - if is_zip: - # zipfile doesn't support writing string to archive. uses string - # buffer to receive csv writing and dump into zip compression - # file handle. GH21241, GH21118 - f = StringIO() - close = False - elif hasattr(self.path_or_buf, "write"): - f = self.path_or_buf - close = False - else: - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=dict(self.compression_args, method=self.compression), - ) - close = True - try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( @@ -206,29 +196,23 @@ def save(self) -> None: self._save() finally: - if is_zip: - # GH17778 handles zip compression separately. 
-                buf = f.getvalue()
-                if hasattr(self.path_or_buf, "write"):
-                    self.path_or_buf.write(buf)
-                else:
-                    compression = dict(self.compression_args, method=self.compression)
-
-                    f, handles = get_handle(
-                        self.path_or_buf,
-                        self.mode,
-                        encoding=self.encoding,
-                        errors=self.errors,
-                        compression=compression,
-                    )
-                    f.write(buf)
-                    close = True
-            if close:
+            if self.should_close:
                 f.close()
-                for _fh in handles:
-                    _fh.close()
-            elif self.should_close:
+            elif (
+                isinstance(f, TextIOWrapper)
+                and not f.closed
+                and f != self.path_or_buf
+                and hasattr(self.path_or_buf, "write")
+            ):
+                # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper
+                # closes the wrapped handle if it is not detached.
+                f.flush()  # make sure everything is written
+                f.detach()  # makes f unusable
+                del f
+            elif f != self.path_or_buf:
                 f.close()
+            for _fh in handles:
+                _fh.close()

     def _save_header(self):
         writer = self.writer
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index fe85eab4bfbf5..81990b3d505e1 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -72,7 +72,7 @@
 from pandas.io.formats.printing import adjoin, justify, pprint_thing

 if TYPE_CHECKING:
-    from pandas import Series, DataFrame, Categorical
+    from pandas import Categorical, DataFrame, Series

 FormattersType = Union[
     List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable]
@@ -931,6 +931,7 @@ def to_latex(
         multirow: bool = False,
         caption: Optional[str] = None,
         label: Optional[str] = None,
+        position: Optional[str] = None,
     ) -> Optional[str]:
         """
         Render a DataFrame to a LaTeX tabular/longtable environment output.
@@ -946,6 +947,7 @@ def to_latex(
             multirow=multirow,
             caption=caption,
             label=label,
+            position=position,
         ).get_result(buf=buf, encoding=encoding)

     def _format_col(self, i: int) -> List[str]:
diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py
index 13f0ab1e8a52c..c89189f1e679a 100644
--- a/pandas/io/formats/html.py
+++ b/pandas/io/formats/html.py
@@ -86,8 +86,9 @@ def _get_columns_formatted_values(self) -> Iterable:
         return self.columns

     # https://github.com/python/mypy/issues/1237
+    # error: Signature of "is_truncated" incompatible with supertype "TableFormatter"
     @property
-    def is_truncated(self) -> bool:  # type: ignore
+    def is_truncated(self) -> bool:  # type: ignore[override]
         return self.fmt.is_truncated

     @property
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
index 3a3ca84642d51..5d6f0a08ef2b5 100644
--- a/pandas/io/formats/latex.py
+++ b/pandas/io/formats/latex.py
@@ -38,6 +38,7 @@ def __init__(
         multirow: bool = False,
         caption: Optional[str] = None,
         label: Optional[str] = None,
+        position: Optional[str] = None,
     ):
         self.fmt = formatter
         self.frame = self.fmt.frame
@@ -50,6 +51,8 @@ def __init__(
         self.caption = caption
         self.label = label
         self.escape = self.fmt.escape
+        self.position = position
+        self._table_float = any(p is not None for p in (caption, label, position))

     def write_result(self, buf: IO[str]) -> None:
         """
@@ -284,7 +287,7 @@ def _write_tabular_begin(self, buf, column_format: str):
             `__ e.g 'rcl' for 3 columns
         """
-        if self.caption is not None or self.label is not None:
+        if self._table_float:
             # then write output in a nested table/tabular environment
             if self.caption is None:
                 caption_ = ""
@@ -296,7 +299,12 @@ def _write_tabular_begin(self, buf, column_format: str):
             else:
                 label_ = f"\n\\label{{{self.label}}}"

-            buf.write(f"\\begin{{table}}\n\\centering{caption_}{label_}\n")
+            if self.position is None:
+                position_ = ""
+            else:
+                position_ = f"[{self.position}]"
+
+            buf.write(f"\\begin{{table}}{position_}\n\\centering{caption_}{label_}\n")
         else:
             # then write output only in a tabular environment
             pass
@@ -317,7 +325,7 @@ def _write_tabular_end(self, buf):
         """
         buf.write("\\bottomrule\n")
         buf.write("\\end{tabular}\n")
-        if self.caption is not None or self.label is not None:
+        if self._table_float:
             buf.write("\\end{table}\n")
         else:
             pass
@@ -337,25 +345,29 @@ def _write_longtable_begin(self, buf, column_format: str):
             `__ e.g 'rcl' for 3 columns
         """
-        buf.write(f"\\begin{{longtable}}{{{column_format}}}\n")
+        if self.caption is None:
+            caption_ = ""
+        else:
+            caption_ = f"\\caption{{{self.caption}}}"

-        if self.caption is not None or self.label is not None:
-            if self.caption is None:
-                pass
-            else:
-                buf.write(f"\\caption{{{self.caption}}}")
+        if self.label is None:
+            label_ = ""
+        else:
+            label_ = f"\\label{{{self.label}}}"

-            if self.label is None:
-                pass
-            else:
-                buf.write(f"\\label{{{self.label}}}")
+        if self.position is None:
+            position_ = ""
+        else:
+            position_ = f"[{self.position}]"

+        buf.write(
+            f"\\begin{{longtable}}{position_}{{{column_format}}}\n{caption_}{label_}"
+        )
+        if self.caption is not None or self.label is not None:
             # a double-backslash is required at the end of the line
             # as discussed here:
             # https://tex.stackexchange.com/questions/219138
             buf.write("\\\\\n")
-        else:
-            pass

     @staticmethod
     def _write_longtable_end(buf):
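A short sketch (not part of the patch) of the new ``position`` plumbing in the latex.py hunks above, assuming it is exposed through ``DataFrame.to_latex`` (the public entry point is outside this excerpt)::

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})

    # position feeds the float specifier: \begin{table}[htb] ... \end{table}
    print(df.to_latex(caption="demo", label="tab:demo", position="htb"))

    # With longtable=True it becomes \begin{longtable}[htb]{...}
    print(df.to_latex(longtable=True, position="htb"))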
position_ = f"[{self.position}]" + + buf.write(f"\\begin{{table}}{position_}\n\\centering{caption_}{label_}\n") else: # then write output only in a tabular environment pass @@ -317,7 +325,7 @@ def _write_tabular_end(self, buf): """ buf.write("\\bottomrule\n") buf.write("\\end{tabular}\n") - if self.caption is not None or self.label is not None: + if self._table_float: buf.write("\\end{table}\n") else: pass @@ -337,25 +345,29 @@ def _write_longtable_begin(self, buf, column_format: str): `__ e.g 'rcl' for 3 columns """ - buf.write(f"\\begin{{longtable}}{{{column_format}}}\n") + if self.caption is None: + caption_ = "" + else: + caption_ = f"\\caption{{{self.caption}}}" - if self.caption is not None or self.label is not None: - if self.caption is None: - pass - else: - buf.write(f"\\caption{{{self.caption}}}") + if self.label is None: + label_ = "" + else: + label_ = f"\\label{{{self.label}}}" - if self.label is None: - pass - else: - buf.write(f"\\label{{{self.label}}}") + if self.position is None: + position_ = "" + else: + position_ = f"[{self.position}]" + buf.write( + f"\\begin{{longtable}}{position_}{{{column_format}}}\n{caption_}{label_}" + ) + if self.caption is not None or self.label is not None: # a double-backslash is required at the end of the line # as discussed here: # https://tex.stackexchange.com/questions/219138 buf.write("\\\\\n") - else: - pass @staticmethod def _write_longtable_end(buf): diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 1cf79dc105901..23daab725ec65 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -499,7 +499,7 @@ def _justify( # error: Incompatible return value type (got "Tuple[List[Sequence[str]], # List[Sequence[str]]]", expected "Tuple[List[Tuple[str, ...]], # List[Tuple[str, ...]]]") - return head, tail # type: ignore + return head, tail # type: ignore[return-value] def format_object_attrs( @@ -524,14 +524,16 @@ def format_object_attrs( attrs: List[Tuple[str, Union[str, int]]] = [] if hasattr(obj, "dtype") and include_dtype: # error: "Sequence[Any]" has no attribute "dtype" - attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore + attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined] if getattr(obj, "name", None) is not None: # error: "Sequence[Any]" has no attribute "name" - attrs.append(("name", default_pprint(obj.name))) # type: ignore + attrs.append(("name", default_pprint(obj.name))) # type: ignore[attr-defined] # error: "Sequence[Any]" has no attribute "names" - elif getattr(obj, "names", None) is not None and any(obj.names): # type: ignore + elif getattr(obj, "names", None) is not None and any( + obj.names # type: ignore[attr-defined] + ): # error: "Sequence[Any]" has no attribute "names" - attrs.append(("names", default_pprint(obj.names))) # type: ignore + attrs.append(("names", default_pprint(obj.names))) # type: ignore[attr-defined] max_seq_items = get_option("display.max_seq_items") or len(obj) if len(obj) > max_seq_items: attrs.append(("length", len(obj))) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d11144938eb26..584f42a6cab12 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -42,8 +42,8 @@ try: - import matplotlib.pyplot as plt from matplotlib import colors + import matplotlib.pyplot as plt has_mpl = True except ImportError: @@ -390,7 +390,7 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style - if self.cell_ids or not (len(ctx[r, c]) == 1 and 
ctx[r, c][0] == ""): + if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) row_es.append(row_dict) props = [] diff --git a/pandas/io/html.py b/pandas/io/html.py index 3193f52d239f1..8354cf413814e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -707,8 +707,8 @@ def _build_doc(self): -------- pandas.io.html._HtmlFrameParser._build_doc """ - from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + from lxml.html import HTMLParser, fromstring, parse parser = HTMLParser(recover=True, encoding=self.encoding) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ff37c36962aec..0b06a26d4aa3c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -115,7 +115,8 @@ def __init__( self.obj = obj if orient is None: - orient = self._default_orient # type: ignore + # error: "Writer" has no attribute "_default_orient" + orient = self._default_orient # type: ignore[attr-defined] self.orient = orient self.date_format = date_format diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b67a1c5781d91..aeb7b3e044794 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -57,7 +57,7 @@ from pandas.io.formats.printing import adjoin, pprint_thing if TYPE_CHECKING: - from tables import File, Node, Col # noqa:F401 + from tables import Col, File, Node # noqa:F401 # versioning attribute @@ -289,7 +289,15 @@ def read_hdf( Read from the store, close it if we opened it. Retrieve pandas object stored in file, optionally based on where - criteria + criteria. + + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. Parameters ---------- @@ -445,6 +453,14 @@ class HDFStore: Either Fixed or Table format. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- path : str @@ -789,6 +805,14 @@ def select( """ Retrieve pandas object stored in file, optionally based on where criteria. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -852,6 +876,15 @@ def select_as_coordinates( """ return the selection as an Index + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + + Parameters ---------- key : str @@ -876,6 +909,14 @@ def select_column( return a single column from the table. This is generally only useful to select an indexable + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. 
+ + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -912,6 +953,14 @@ def select_as_multiple( """ Retrieve pandas objects from multiple tables. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- keys : a list of the tables @@ -2280,7 +2329,8 @@ def _get_atom(cls, values: ArrayLike) -> "Col": Get an appropriately typed and shaped pytables.Col object for values. """ dtype = values.dtype - itemsize = dtype.itemsize # type: ignore + # error: "ExtensionDtype" has no attribute "itemsize" + itemsize = dtype.itemsize # type: ignore[attr-defined] shape = values.shape if values.ndim == 1: @@ -3349,9 +3399,9 @@ def queryables(self) -> Dict[str, Any]: (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) ] - return dict(d1 + d2 + d3) # type: ignore - # error: List comprehension has incompatible type - # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]] + # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" + # and "List[Tuple[str, None]]") + return dict(d1 + d2 + d3) # type: ignore[operator] def index_cols(self): """ return a list of my index cols """ diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 0038e39e2ffcc..17b41fd2b4379 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,8 +1,8 @@ # cython: profile=False # cython: boundscheck=False, initializedcheck=False from cython import Py_ssize_t - import numpy as np + import pandas.io.sas.sas_constants as const ctypedef signed long long int64_t diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9177696ca13d6..51888e5021d80 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -439,7 +439,8 @@ def read_sql( con : SQLAlchemy connectable, str, or sqlite3 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible - for engine disposal and connection closure for the SQLAlchemy connectable. See + for engine disposal and connection closure for the SQLAlchemy connectable; str + connections are closed automatically. See `here `_. index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). 
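
Editor's note: the read_sql docstring change above draws a distinction worth pinning down — pandas only manages the connection lifecycle when given a string URI; an engine or DBAPI connection passed in directly must be disposed/closed by the caller. A minimal sketch with the stdlib sqlite3 driver (the table name "demo" is made up for illustration):

    import sqlite3

    import pandas as pd

    con = sqlite3.connect(":memory:")
    pd.DataFrame({"a": [1, 2, 3]}).to_sql("demo", con, index=False)
    df = pd.read_sql("SELECT a FROM demo", con)
    # pandas does not close a connection it did not open itself:
    con.close()

The import block above also follows the isort ordering this PR enforces elsewhere (stdlib first, blank line, then third-party).
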
@@ -937,7 +938,7 @@ def _get_column_names_and_types(self, dtype_mapper): return column_names_and_types def _create_table_setup(self): - from sqlalchemy import Table, Column, PrimaryKeyConstraint + from sqlalchemy import Column, PrimaryKeyConstraint, Table column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) @@ -1026,15 +1027,15 @@ def _sqlalchemy_type(self, col): col_type = lib.infer_dtype(col, skipna=True) from sqlalchemy.types import ( + TIMESTAMP, BigInteger, - Integer, - Float, - Text, Boolean, - DateTime, Date, + DateTime, + Float, + Integer, + Text, Time, - TIMESTAMP, ) if col_type == "datetime64" or col_type == "datetime": @@ -1079,7 +1080,7 @@ def _sqlalchemy_type(self, col): return Text def _get_dtype(self, sqltype): - from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP + from sqlalchemy.types import TIMESTAMP, Boolean, Date, DateTime, Float, Integer if isinstance(sqltype, Float): return float @@ -1374,7 +1375,7 @@ def to_sql( dtype = {col_name: dtype for col_name in frame} if dtype is not None: - from sqlalchemy.types import to_instance, TypeEngine + from sqlalchemy.types import TypeEngine, to_instance for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7677d8a94d521..cb23b781a7ad2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1643,8 +1643,7 @@ def read( data = self._insert_strls(data) - cols_ = np.where(self.dtyplist)[0] - + cols_ = np.where([dtyp is not None for dtyp in self.dtyplist])[0] # Convert columns (if needed) to match input type ix = data.index requires_type_conversion = False @@ -1953,7 +1952,10 @@ def _open_file_binary_write( """ if hasattr(fname, "write"): # See https://github.com/python/mypy/issues/1424 for hasattr challenges - return fname, False, None # type: ignore + # error: Incompatible return value type (got "Tuple[Union[str, Path, + # IO[Any]], bool, None]", expected "Tuple[BinaryIO, bool, Union[str, + # Mapping[str, str], None]]") + return fname, False, None # type: ignore[return-value] elif isinstance(fname, (str, Path)): # Extract compression mode as given, if dict compression_typ, compression_args = get_compression_method(compression) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 353bc8a8936a5..b490e07e43753 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1149,8 +1149,8 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): @classmethod def _ts_plot(cls, ax, x, data, style=None, **kwds): from pandas.plotting._matplotlib.timeseries import ( - _maybe_resample, _decorate_axes, + _maybe_resample, format_dateaxis, ) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 8f3571cf13cbc..eef4276f0ed09 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -24,7 +24,7 @@ from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod if TYPE_CHECKING: - from pandas import Series, Index # noqa:F401 + from pandas import Index, Series # noqa:F401 # --------------------------------------------------------------------- @@ -45,7 +45,10 @@ def _maybe_resample(series: "Series", ax, kwargs): if ax_freq is not None and freq != ax_freq: if is_superperiod(freq, ax_freq): # upsample input series = series.copy() - series.index = series.index.asfreq(ax_freq, how="s") # 
type: ignore + # error: "Index" has no attribute "asfreq" + series.index = series.index.asfreq( # type: ignore[attr-defined] + ax_freq, how="s" + ) freq = ax_freq elif _is_sup(freq, ax_freq): # one is weekly how = kwargs.pop("how", "last") @@ -222,7 +225,8 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]: if freq is None: freq = getattr(index, "inferred_freq", None) if freq == "B": - weekdays = np.unique(index.dayofweek) # type: ignore + # error: "Index" has no attribute "dayofweek" + weekdays = np.unique(index.dayofweek) # type: ignore[attr-defined] if (5 in weekdays) or (6 in weekdays): freq = None diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ecd20796b6f21..caa348d3a1fb9 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -267,9 +267,10 @@ def test_sparsearray(): def test_np(): - import numpy as np import warnings + import numpy as np + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) assert (pd.np.arange(0, 10) == np.arange(0, 10)).all() diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 2155846b271fc..484f83deb0f55 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -548,20 +548,6 @@ class TestMultiplicationDivision: # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ # for non-timestamp/timedelta/period dtypes - @pytest.mark.parametrize( - "box", - [ - pytest.param( - pd.Index, - marks=pytest.mark.xfail( - reason="Index.__div__ always raises", raises=TypeError - ), - ), - pd.Series, - pd.DataFrame, - ], - ids=lambda x: x.__name__, - ) def test_divide_decimal(self, box): # resolves issue GH#9787 ser = Series([Decimal(10)]) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index f94408d657ae5..64d3d5b6d684d 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1733,6 +1733,23 @@ def test_tdarr_div_length_mismatch(self, box_with_array): # ------------------------------------------------------------------ # __floordiv__, __rfloordiv__ + def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): + # GH#35529 + box = box_with_array + + left = pd.Series([1000, 222330, 30], dtype="timedelta64[ns]") + right = pd.Series([1000, 222330, None], dtype="timedelta64[ns]") + + left = tm.box_expected(left, box) + right = tm.box_expected(right, box) + + expected = np.array([1.0, 1.0, np.nan], dtype=np.float64) + expected = tm.box_expected(expected, box) + + result = left // right + + tm.assert_equal(result, expected) + def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): # GH#18831 td1 = Series([timedelta(minutes=5, seconds=3)] * 3) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d517eaaec68d2..0176755b54dd1 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -142,6 +142,7 @@ def test_repr(): @pyarrow_skip def test_arrow_extension_type(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType p1 = ArrowIntervalType(pa.int64(), "left") @@ -158,6 +159,7 @@ def test_arrow_extension_type(): @pyarrow_skip def test_arrow_array(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array @@ -187,6 +189,7 @@ def test_arrow_array(): 
@pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) @@ -221,6 +224,7 @@ def test_arrow_array_missing(): ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks(breaks) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 8887dd0278afe..0d81e8e733842 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -359,6 +359,7 @@ def test_arrow_extension_type(): ) def test_arrow_array(data, freq): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType periods = period_array(data, freq=freq) @@ -384,6 +385,7 @@ def test_arrow_array(data, freq): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") @@ -399,6 +401,7 @@ def test_arrow_array_missing(): @pyarrow_skip def test_arrow_table_roundtrip(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ce12718e48d0d..a6c526fcb008a 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -746,3 +746,13 @@ def test_astype_object_preserves_datetime_na(from_type): result = astype_nansafe(arr, dtype="object") assert isna(result)[0] + + +def test_validate_allhashable(): + assert com.validate_all_hashable(1, "a") is None + + with pytest.raises(TypeError, match="All elements must be hashable"): + com.validate_all_hashable([]) + + with pytest.raises(TypeError, match="list must be a hashable type"): + com.validate_all_hashable([], error_name="list") diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 359acf230ce14..c93603398977e 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -114,10 +114,13 @@ def test_error(self, data, all_arithmetic_operators): with pytest.raises(AttributeError): getattr(data, op_name) - def test_direct_arith_with_series_returns_not_implemented(self, data): - # EAs should return NotImplemented for ops with Series. + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): + # EAs should return NotImplemented for ops with Series/DataFrame # Pandas takes care of unboxing the series and calling the EA's op. other = pd.Series(data) + if box is pd.DataFrame: + other = other.to_frame() if hasattr(data, "__add__"): result = data.__add__(other) assert result is NotImplemented @@ -156,10 +159,14 @@ def test_compare_array(self, data, all_compare_operators): other = pd.Series([data[0]] * len(data)) self._compare_other(s, data, op_name, other) - def test_direct_arith_with_series_returns_not_implemented(self, data): - # EAs should return NotImplemented for ops with Series. + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): + # EAs should return NotImplemented for ops with Series/DataFrame # Pandas takes care of unboxing the series and calling the EA's op. 
other = pd.Series(data) + if box is pd.DataFrame: + other = other.to_frame() + if hasattr(data, "__eq__"): result = data.__eq__(other) assert result is NotImplemented diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index b1eb276bfc227..817881e00fa99 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -126,9 +126,13 @@ def test_add_series_with_extension_array(self, data): def test_error(self): pass - def test_direct_arith_with_series_returns_not_implemented(self, data): + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): # Override to use __sub__ instead of __add__ other = pd.Series(data) + if box is pd.DataFrame: + other = other.to_frame() + result = data.__sub__(other) assert result is NotImplemented diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 9ec029a6c4304..8f6902eca816f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -145,6 +145,33 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) + def test_shift_axis1_multiple_blocks(self): + # GH#35488 + df1 = pd.DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = pd.DataFrame(np.random.randint(1000, size=(5, 2))) + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + + result = df3.shift(2, axis=1) + + expected = df3.take([-1, -1, 0, 1, 2], axis=1) + expected.iloc[:, :2] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + # rebuild df3 because `take` call above consolidated + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + result = df3.shift(-2, axis=1) + + expected = df3.take([2, 3, 4, -1, -1], axis=1) + expected.iloc[:, -2:] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_frame): # TODO: remove this test when tshift deprecation is enforced diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 5216c3be116e0..dcc33428d18a5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -555,8 +555,8 @@ def test_sort_index_and_reconstruction(self): ), ) - df.columns.set_levels( - pd.to_datetime(df.columns.levels[1]), level=1, inplace=True + df.columns = df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1 ) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9d6b9f39a0578..52a1e3aae9058 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -287,7 +287,7 @@ def test_stat_op_api(self, float_frame, float_string_frame): assert_stat_op_api("median", float_frame, float_string_frame) try: - from scipy.stats import skew, kurtosis # noqa:F401 + from scipy.stats import kurtosis, skew # noqa:F401 assert_stat_op_api("skew", float_frame, float_string_frame) assert_stat_op_api("kurt", float_frame, float_string_frame) @@ -370,7 +370,7 @@ def kurt(x): ) try: - from scipy import skew, kurtosis # noqa:F401 + from scipy import kurtosis, skew # noqa:F401 
assert_stat_op_calc("skew", skewness, float_frame_with_na) assert_stat_op_calc("kurt", kurt, float_frame_with_na) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b79fc8cd3406..cc57a3970d18b 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -367,6 +367,13 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr + def test_to_numpy_mixed_dtype_to_str(self): + # https://github.com/pandas-dev/pandas/issues/35455 + df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]]) + result = df.to_numpy(dtype=str) + expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) + tm.assert_numpy_array_equal(result, expected) + def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5)) tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a4ed548264d39..d0f774344a33d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1618,6 +1618,42 @@ def test_constructor_Series_differently_indexed(self): tm.assert_index_equal(df2.index, other_index) tm.assert_frame_equal(df2, exp2) + @pytest.mark.parametrize( + "name_in1,name_in2,name_in3,name_out", + [ + ("idx", "idx", "idx", "idx"), + ("idx", "idx", None, "idx"), + ("idx", None, None, "idx"), + ("idx1", "idx2", None, None), + ("idx1", "idx1", "idx2", None), + ("idx1", "idx2", "idx3", None), + (None, None, None, None), + ], + ) + def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out): + # GH13475 + indices = [ + pd.Index(["a", "b", "c"], name=name_in1), + pd.Index(["b", "c", "d"], name=name_in2), + pd.Index(["c", "d", "e"], name=name_in3), + ] + series = { + c: pd.Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"]) + } + result = pd.DataFrame(series) + + exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out) + expected = pd.DataFrame( + { + "x": [0, 1, 2, np.nan, np.nan], + "y": [np.nan, 0, 1, 2, np.nan], + "z": [np.nan, np.nan, 0, 1, 2], + }, + index=exp_ind, + ) + + tm.assert_frame_equal(result, expected) + def test_constructor_manager_resize(self, float_frame): index = list(float_frame.index[:5]) columns = list(float_frame.columns[:3]) @@ -2619,6 +2655,12 @@ class DatetimeSubclass(datetime): data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) assert data.datetime.dtype == "datetime64[ns]" + def test_with_mismatched_index_length_raises(self): + # GH#33437 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + with pytest.raises(ValueError, match="Shape of passed values"): + DataFrame(dti, index=range(4)) + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 264cf40dc6984..e8cd6017a117c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -486,13 +486,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] + assert ts == 
grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..ee38722ffb8ce 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import date, datetime from io import StringIO import numpy as np @@ -63,15 +63,8 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object." -) def test_apply_trivial_fail(): # GH 20066 - # trivial apply fails if the constant dataframe has the same index - # with the one used to create GroupBy object. df = pd.DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], @@ -1014,3 +1007,82 @@ def test_apply_with_timezones_aware(): result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) tm.assert_frame_equal(result1, result2) + + +def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): + # GH #34656 + # GH #34271 + df = DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + } + ) + + expected = pd.DataFrame( + {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + index=pd.Index([88, 99], name="a"), + ) + + # Check output when no other methods are called before .apply() + grp = df.groupby(by="a") + result = grp.apply(sum) + tm.assert_frame_equal(result, expected) + + # Check output when another method is called before .apply() + grp = df.groupby(by="a") + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + _ = getattr(grp, reduction_func)(*args) + result = grp.apply(sum) + tm.assert_frame_equal(result, expected) + + +def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): + # GH 29617 + + df = pd.DataFrame( + { + "A": ["a", "a", "a", "b"], + "B": [ + date(2020, 1, 10), + date(2020, 1, 10), + date(2020, 2, 10), + date(2020, 2, 10), + ], + "C": [1, 2, 3, 4], + }, + index=pd.Index([100, 101, 102, 103], name="idx"), + ) + + grp = df.groupby(["A", "B"]) + result = grp.apply(lambda x: x.head(1)) + + expected = df.iloc[[0, 2, 3]] + expected = expected.reset_index() + expected.index = pd.MultiIndex.from_frame(expected[["A", "B", "idx"]]) + expected = expected.drop(columns="idx") + + tm.assert_frame_equal(result, expected) + for val in result.index.levels[1]: + assert type(val) is date + + +def test_apply_by_cols_equals_apply_by_rows_transposed(): + # GH 16646 + # Operating on the columns, or transposing and operating on the rows + # should give the same result. 
There was previously a bug where the + # by_rows operation would work fine, but by_cols would throw a ValueError + + df = pd.DataFrame( + np.random.random([6, 4]), + columns=pd.MultiIndex.from_product([["A", "B"], [1, 2]]), + ) + + by_rows = df.T.groupby(axis=0, level=0).apply( + lambda x: x.droplevel(axis=0, level=0) + ) + by_cols = df.groupby(axis=1, level=0).apply(lambda x: x.droplevel(axis=1, level=0)) + + tm.assert_frame_equal(by_cols, by_rows.T) + tm.assert_frame_equal(by_cols, df) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0d447a70b540d..c74c1529eb537 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -19,7 +19,7 @@ import pandas._testing as tm -def cartesian_product_for_groupers(result, args, names): +def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): """ Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -33,7 +33,7 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) - return result.reindex(index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() _results_for_groupbys_with_missing_categories = dict( @@ -309,7 +309,7 @@ def test_observed(observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0 ) tm.assert_frame_equal(result, expected) @@ -319,7 +319,9 @@ def test_observed(observed): expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) + expected = cartesian_product_for_groupers( + expected, [cat1, cat2], list("AB"), fill_value=0 + ) tm.assert_frame_equal(result, expected) @@ -1189,6 +1191,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): ).sortlevel() expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + if operation == "agg": + expected = expected.fillna(0, downcast="infer") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1338,15 +1342,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( ) request.node.add_marker(mark) - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1367,8 +1362,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) - # If we expect unobserved values to be zero, we also expect the dtype to be int - if zero_or_nan == 0: + # If we expect unobserved values to be zero, we also expect the dtype to be int. + # Except for .sum(). If the observed categories sum to dtype=float (i.e. their + # sums have decimals), then the zeros for the missing categories should also be + # floats. 
+ if zero_or_nan == 0 and reduction_func != "sum": assert np.issubdtype(result.dtype, np.integer) @@ -1410,24 +1408,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "count": # GH 35028 - mark = pytest.mark.xfail( - reason=( - "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0. See GH 35028" - ) - ) - request.node.add_marker(mark) - - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e693962e57ac3..42945be923fa0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -940,10 +940,6 @@ def test_frame_describe_multikey(tsframe): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T - expected.index = pd.MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) tm.assert_frame_equal(result, expected) @@ -992,6 +988,68 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +def test_describe_with_duplicate_output_column_names(as_index): + # GH 35314 + df = pd.DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a", "b", "b"], + ) + + expected = ( + pd.DataFrame.from_records( + [ + ("a", "count", 3.0, 3.0), + ("a", "mean", 88.0, 99.0), + ("a", "std", 0.0, 0.0), + ("a", "min", 88.0, 99.0), + ("a", "25%", 88.0, 99.0), + ("a", "50%", 88.0, 99.0), + ("a", "75%", 88.0, 99.0), + ("a", "max", 88.0, 99.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + expected.index = pd.Index([88, 99], name="a") + + if as_index: + expected = expected.drop(columns=["a"], level=0) + else: + expected = expected.reset_index(drop=True) + + result = df.groupby("a", as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = pd.DataFrame( diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ebce5b0ef0a66..8c51ebf89f5c0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2055,3 +2055,17 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_group_on_two_row_multiindex_returns_one_tuple_key(): + # GH 18451 + df = pd.DataFrame([{"a": 1, "b": 
2, "c": 99}, {"a": 1, "b": 2, "c": 88}]) + df = df.set_index(["a", "b"]) + + grp = df.groupby(["a", "b"]) + result = grp.indices + expected = {(1, 2): np.array([0, 1], dtype=np.int64)} + + assert len(result) == 1 + key = (1, 2) + assert (result[key] == expected[key]).all() diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1a525d306e9f5..adf62c4723526 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,6 +162,40 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "dropna,df_expected,s_expected", + [ + pytest.param( + True, + pd.DataFrame({"B": [2, 2, 1]}), + pd.Series(data=[2, 2, 1], name="B"), + marks=pytest.mark.xfail(raises=ValueError), + ), + ( + False, + pd.DataFrame({"B": [2, 2, 1, 1]}), + pd.Series(data=[2, 2, 1, 1], name="B"), + ), + ], +) +def test_slice_groupby_then_transform(dropna, df_expected, s_expected): + # GH35014 + + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=dropna) + + res = gb.transform(len) + tm.assert_frame_equal(res, df_expected) + + gb_slice = gb[["B"]] + res = gb_slice.transform(len) + tm.assert_frame_equal(res, df_expected) + + gb_slice = gb["B"] + res = gb["B"].transform(len) + tm.assert_series_equal(res, s_expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index efcd22f9c0c82..40b4ce46e550b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -191,13 +191,15 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) - tm.assert_frame_equal(result, expected) - g = df.groupby(pd.Grouper(key="A", axis=0)) result = g.sum() tm.assert_frame_equal(result, expected) + result = g.apply(lambda x: x.sum()) + expected["A"] = [0, 2, 4] + expected = expected.loc[:, ["A", "B"]] + tm.assert_frame_equal(result, expected) + # GH14334 # pd.Grouper(key=...) 
may be passed in a list df = DataFrame( diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 7f30a77872bc1..b325edb321ed4 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -43,7 +43,14 @@ def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) - msg = f"cannot perform {op_name} with this index type: CategoricalIndex" + cat_or_list = "'(Categorical|list)' and '(Categorical|list)'" + msg = "|".join( + [ + f"cannot perform {op_name} with this index type: CategoricalIndex", + "can only concatenate list", + rf"unsupported operand type\(s\) for [\+-]: {cat_or_list}", + ] + ) with pytest.raises(TypeError, match=msg): func(idx) @@ -478,3 +485,9 @@ def test_reindex_base(self): def test_map_str(self): # See test_map.py pass + + def test_format_different_scalar_lengths(self): + # GH35439 + idx = CategoricalIndex(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c8b780455f862..98f7c0eadb4bb 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,6 +5,7 @@ import pytest from pandas._libs import iNaT +from pandas.compat.numpy import _is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ -145,22 +146,41 @@ def test_numeric_compat(self): # Check that this doesn't cover MultiIndex case, if/when it does, # we can remove multi.test_compat.test_numeric_compat assert not isinstance(idx, MultiIndex) + if type(idx) is Index: + return - with pytest.raises(TypeError, match="cannot perform __mul__"): + typ = type(idx._data).__name__ + lmsg = "|".join( + [ + rf"unsupported operand type\(s\) for \*: '{typ}' and 'int'", + "cannot perform (__mul__|__truediv__|__floordiv__) with " + f"this index type: {typ}", + ] + ) + with pytest.raises(TypeError, match=lmsg): idx * 1 - with pytest.raises(TypeError, match="cannot perform __rmul__"): + rmsg = "|".join( + [ + rf"unsupported operand type\(s\) for \*: 'int' and '{typ}'", + "cannot perform (__rmul__|__rtruediv__|__rfloordiv__) with " + f"this index type: {typ}", + ] + ) + with pytest.raises(TypeError, match=rmsg): 1 * idx - div_err = "cannot perform __truediv__" + div_err = lmsg.replace("*", "/") with pytest.raises(TypeError, match=div_err): idx / 1 - - div_err = div_err.replace(" __", " __r") + div_err = rmsg.replace("*", "/") with pytest.raises(TypeError, match=div_err): 1 / idx - with pytest.raises(TypeError, match="cannot perform __floordiv__"): + + floordiv_err = lmsg.replace("*", "//") + with pytest.raises(TypeError, match=floordiv_err): idx // 1 - with pytest.raises(TypeError, match="cannot perform __rfloordiv__"): + floordiv_err = rmsg.replace("*", "//") + with pytest.raises(TypeError, match=floordiv_err): 1 // idx def test_logical_compat(self): @@ -250,6 +270,20 @@ def test_copy_name(self, index): s3 = s1 * s2 assert s3.index.name == "mario" + def test_name2(self, index): + # gh-35592 + if isinstance(index, MultiIndex): + return + + assert index.copy(name="mario").name == "mario" + + with pytest.raises(ValueError, match="Length of new names must be 1, got 2"): + index.copy(name=["mario", "luigi"]) + + msg = f"{type(index).__name__}.name must be a hashable type" + with pytest.raises(TypeError, match=msg): + index.copy(name=[["mario"]]) + 
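
Editor's note: the new test_name2 above (gh-35592) documents the validation that Index.copy applies to its name/names arguments after this change. A short sketch of the behavior the test pins down, on a throwaway index; both error messages are taken from the test code itself:

    import pandas as pd

    idx = pd.Index(["a", "b", "c"])
    assert idx.copy(name="mario").name == "mario"

    # A names list of the wrong length raises ValueError
    # ("Length of new names must be 1, got 2"), and a non-hashable
    # name raises TypeError ("Index.name must be a hashable type"):
    try:
        idx.copy(name=[["mario"]])
    except TypeError as err:
        print(err)
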
def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 @@ -417,7 +451,7 @@ def test_set_ops_error_cases(self, case, method, index): with pytest.raises(TypeError, match=msg): getattr(index, method)(case) - def test_intersection_base(self, index): + def test_intersection_base(self, index, request): if isinstance(index, CategoricalIndex): return @@ -434,6 +468,15 @@ def test_intersection_base(self, index): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: + # https://github.com/pandas-dev/pandas/issues/35481 + if ( + _is_numpy_dev + and isinstance(case, Series) + and isinstance(index, UInt64Index) + ): + mark = pytest.mark.xfail(reason="gh-35481") + request.node.add_marker(mark) + result = first.intersection(case) assert tm.equalContents(result, second) @@ -632,6 +675,12 @@ def test_equals_op(self): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [str(x) for x in idx] + assert idx.format() == expected + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index c150e7901c86a..9a855a1624520 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -787,6 +787,65 @@ def test_construction_with_nat_and_tzlocal(self): expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) tm.assert_index_equal(result, expected) + def test_constructor_with_ambiguous_keyword_arg(self): + # GH 35297 + + expected = DatetimeIndex( + ["2020-11-01 01:00:00", "2020-11-02 01:00:00"], + dtype="datetime64[ns, America/New_York]", + freq="D", + ambiguous=False, + ) + + # ambiguous keyword in start + timezone = "America/New_York" + start = pd.Timestamp(year=2020, month=11, day=1, hour=1).tz_localize( + timezone, ambiguous=False + ) + result = pd.date_range(start=start, periods=2, ambiguous=False) + tm.assert_index_equal(result, expected) + + # ambiguous keyword in end + timezone = "America/New_York" + end = pd.Timestamp(year=2020, month=11, day=2, hour=1).tz_localize( + timezone, ambiguous=False + ) + result = pd.date_range(end=end, periods=2, ambiguous=False) + tm.assert_index_equal(result, expected) + + def test_constructor_with_nonexistent_keyword_arg(self): + # GH 35297 + + timezone = "Europe/Warsaw" + + # nonexistent keyword in start + start = pd.Timestamp("2015-03-29 02:30:00").tz_localize( + timezone, nonexistent="shift_forward" + ) + result = pd.date_range(start=start, periods=2, freq="H") + expected = DatetimeIndex( + [ + pd.Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), + pd.Timestamp("2015-03-29 04:00:00+02:00", tz=timezone), + ] + ) + + tm.assert_index_equal(result, expected) + + # nonexistent keyword in end + end = pd.Timestamp("2015-03-29 02:30:00").tz_localize( + timezone, nonexistent="shift_forward" + ) + result = pd.date_range(end=end, periods=2, freq="H") + expected = DatetimeIndex( + [ + pd.Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), + pd.Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), + ] + ) + + tm.assert_index_equal(result, expected) + def test_constructor_no_precision_raises(self): # GH-24753, GH-24739 diff --git a/pandas/tests/indexes/datetimes/test_datetime.py 
b/pandas/tests/indexes/datetimes/test_datetime.py index ec4162f87010f..7bb1d98086a91 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -59,6 +59,7 @@ def test_reindex_with_same_tz(self): def test_time_loc(self): # GH8667 from datetime import time + from pandas._libs.index import _SIZE_CUTOFF ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 7345ae3032463..a5abf2946feda 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -20,6 +20,12 @@ def index(self, request): def create_index(self) -> DatetimeIndex: return date_range("20130101", periods=5) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [f"{x:%Y-%m-%d}" for x in idx] + assert idx.format() == expected + def test_shift(self): pass # handled in test_ops diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index d1f66af4a8e83..b2500efef9e03 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -84,7 +84,8 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(mi1.values, vals) # Inplace should kill _tuples - mi1.set_levels(levels2, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi1.set_levels(levels2, inplace=True) tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too @@ -103,7 +104,8 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(exp_values, new_values) # ...and again setting inplace should kill _tuples, etc - mi2.set_codes(codes2, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi2.set_codes(codes2, inplace=True) tm.assert_almost_equal(mi2.values, new_values) diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index e48731b9c8099..9add4b478da47 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -91,7 +91,8 @@ def test_duplicate_multiindex_codes(): mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) msg = r"Level values must be unique: \[[AB', ]+\] on level 0" with pytest.raises(ValueError, match=msg): - mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) @pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 063ede028add7..b48f09457b96c 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -192,10 +192,12 @@ def test_is_(): mi4 = mi3.view() # GH 17464 - Remove duplicate MultiIndex levels - mi4.set_levels([list(range(10)), list(range(10))], inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi4.set_levels([list(range(10)), list(range(10))], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() - mi5.set_levels(mi5.levels, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi5.set_levels(mi5.levels, inplace=True) assert not mi5.is_(mi) diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 
8a3deca0236e4..b9132f429905d 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -93,7 +93,8 @@ def test_set_levels(idx): # level changing [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels, inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) @@ -113,20 +114,23 @@ def test_set_levels(idx): # level changing specific level [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) assert inplace_return is None assert_matching(ind2.levels, [new_levels[0], levels[1]]) assert_matching(idx.levels, levels) ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) assert inplace_return is None assert_matching(ind2.levels, [levels[0], new_levels[1]]) assert_matching(idx.levels, levels) # level changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) assert_matching(idx.levels, levels) @@ -136,19 +140,23 @@ def test_set_levels(idx): original_index = idx.copy() for inplace in [True, False]: with pytest.raises(ValueError, match="^On"): - idx.set_levels(["c"], level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_levels(["c"], level=0, inplace=inplace) assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(ValueError, match="^On"): - idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) assert_matching(idx.codes, original_index.codes, check_dtype=True) with pytest.raises(TypeError, match="^Levels"): - idx.set_levels("c", level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_levels("c", level=0, inplace=inplace) assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(TypeError, match="^Codes"): - idx.set_codes(1, level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_codes(1, level=0, inplace=inplace) assert_matching(idx.codes, original_index.codes, check_dtype=True) @@ -168,7 +176,8 @@ def test_set_codes(idx): # changing label w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes, inplace=True) assert inplace_return is None assert_matching(ind2.codes, new_codes) @@ -188,20 +197,23 @@ def test_set_codes(idx): # label changing specific level w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) assert inplace_return is None assert_matching(ind2.codes, [new_codes[0], codes[1]]) assert_matching(idx.codes, codes) ind2 = idx.copy() - 
-    inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True)
+    with tm.assert_produces_warning(FutureWarning):
+        inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True)
     assert inplace_return is None
     assert_matching(ind2.codes, [codes[0], new_codes[1]])
     assert_matching(idx.codes, codes)
 
     # codes changing multiple levels [w/ mutation]
     ind2 = idx.copy()
-    inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True)
+    with tm.assert_produces_warning(FutureWarning):
+        inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True)
     assert inplace_return is None
     assert_matching(ind2.codes, new_codes)
     assert_matching(idx.codes, codes)
@@ -217,7 +229,8 @@ def test_set_codes(idx):
 
     # [w/ mutation]
     result = ind.copy()
-    result.set_codes(codes=new_codes, level=1, inplace=True)
+    with tm.assert_produces_warning(FutureWarning):
+        result.set_codes(codes=new_codes, level=1, inplace=True)
     assert result.equals(expected)
 
@@ -329,3 +342,19 @@ def test_set_levels_with_iterable():
         [expected_sizes, colors], names=["size", "color"]
     )
     tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("inplace", [True, False])
+def test_set_codes_inplace_deprecated(idx, inplace):
+    new_codes = idx.codes[1][::-1]
+
+    with tm.assert_produces_warning(FutureWarning):
+        idx.set_codes(codes=new_codes, level=1, inplace=inplace)
+
+
+@pytest.mark.parametrize("inplace", [True, False])
+def test_set_levels_inplace_deprecated(idx, inplace):
+    new_level = idx.levels[1].copy()
+
+    with tm.assert_produces_warning(FutureWarning):
+        idx.set_levels(levels=new_level, level=1, inplace=inplace)
diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py
index fd150bb4d57a2..c776a33717ccd 100644
--- a/pandas/tests/indexes/multi/test_integrity.py
+++ b/pandas/tests/indexes/multi/test_integrity.py
@@ -220,7 +220,8 @@ def test_metadata_immutable(idx):
 def test_level_setting_resets_attributes():
     ind = pd.MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
     assert ind.is_monotonic
-    ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True)
+    with tm.assert_produces_warning(FutureWarning):
+        ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True)
     # if this fails, probably didn't reset the cache correctly.
     assert not ind.is_monotonic
diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py
index 479b5ef0211a0..f38da7ad2ae1c 100644
--- a/pandas/tests/indexes/multi/test_names.py
+++ b/pandas/tests/indexes/multi/test_names.py
@@ -75,6 +75,13 @@ def test_copy_names():
     assert multi_idx.names == ["MyName1", "MyName2"]
     assert multi_idx3.names == ["NewName1", "NewName2"]
 
+    # gh-35592
+    with pytest.raises(ValueError, match="Length of new names must be 2, got 1"):
+        multi_idx.copy(names=["mario"])
+
+    with pytest.raises(TypeError, match="MultiIndex.name must be a hashable type"):
+        multi_idx.copy(names=[["mario"], ["luigi"]])
+
 
 def test_names(idx, index_names):
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 5b6f9cb358b7d..ef4bb9a0869b0 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -137,53 +137,58 @@ def test_dtype(self):
         index = self.create_index()
         assert index.dtype == np.int64
 
-    def test_cached_data(self):
-        # GH 26565, GH26617
-        # Calling RangeIndex._data caches an int64 array of the same length at
-        # self._cached_data. This test checks whether _cached_data has been set
+    def test_cache(self):
+        # GH 26565, GH26617, GH35432
+        # This test checks whether _cache has been set.
+        # Calling RangeIndex._cache["_data"] creates an int64 array of the same length
+        # as the RangeIndex and stores it in _cache.
         idx = RangeIndex(0, 100, 10)
 
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         repr(idx)
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         str(idx)
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         idx.get_loc(20)
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
-        90 in idx
-        assert idx._cached_data is None
+        90 in idx  # True
+        assert idx._cache == {}
 
-        91 in idx
-        assert idx._cached_data is None
+        91 in idx  # False
+        assert idx._cache == {}
 
         idx.all()
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         idx.any()
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         df = pd.DataFrame({"a": range(10)}, index=idx)
 
         df.loc[50]
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         with pytest.raises(KeyError, match="51"):
             df.loc[51]
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         df.loc[10:50]
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
         df.iloc[5:10]
-        assert idx._cached_data is None
+        assert idx._cache == {}
 
-        # actually calling idx._data
+        # idx._cache should contain a _data entry after call to idx._data
+        idx._data
         assert isinstance(idx._data, np.ndarray)
-        assert isinstance(idx._cached_data, np.ndarray)
+        assert idx._data is idx._data  # check cached value is reused
+        assert len(idx._cache) == 4
+        expected = np.arange(0, 100, 10, dtype="int64")
+        tm.assert_numpy_array_equal(idx._cache["_data"], expected)
 
     def test_is_monotonic(self):
         index = RangeIndex(0, 20, 2)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index eaf48421dc071..70eb9e502f78a 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -1171,8 +1171,11 @@ def test_summary_bug(self):
         assert "~:{range}:0" in result
         assert "{other}%s" in result
 
-    def test_format(self, index):
-        self._check_method_works(Index.format, index)
+    def test_format_different_scalar_lengths(self):
+        # GH35439
+        idx = Index(["aaaaaaaaa", "b"])
+        expected = ["aaaaaaaaa", "b"]
+        assert idx.format() == expected
 
     def test_format_bug(self):
         # GH 14626
@@ -1511,23 +1514,24 @@ def test_slice_locs_na_raises(self):
     @pytest.mark.parametrize(
         "in_slice,expected",
         [
+            # error: Slice index must be an integer or None
             (pd.IndexSlice[::-1], "yxdcb"),
-            (pd.IndexSlice["b":"y":-1], ""),  # type: ignore
-            (pd.IndexSlice["b"::-1], "b"),  # type: ignore
-            (pd.IndexSlice[:"b":-1], "yxdcb"),  # type: ignore
-            (pd.IndexSlice[:"y":-1], "y"),  # type: ignore
-            (pd.IndexSlice["y"::-1], "yxdcb"),  # type: ignore
-            (pd.IndexSlice["y"::-4], "yb"),  # type: ignore
+            (pd.IndexSlice["b":"y":-1], ""),  # type: ignore[misc]
+            (pd.IndexSlice["b"::-1], "b"),  # type: ignore[misc]
+            (pd.IndexSlice[:"b":-1], "yxdcb"),  # type: ignore[misc]
+            (pd.IndexSlice[:"y":-1], "y"),  # type: ignore[misc]
+            (pd.IndexSlice["y"::-1], "yxdcb"),  # type: ignore[misc]
+            (pd.IndexSlice["y"::-4], "yb"),  # type: ignore[misc]
             # absent labels
-            (pd.IndexSlice[:"a":-1], "yxdcb"),  # type: ignore
-            (pd.IndexSlice[:"a":-2], "ydb"),  # type: ignore
-            (pd.IndexSlice["z"::-1], "yxdcb"),  # type: ignore
-            (pd.IndexSlice["z"::-3], "yc"),  # type: ignore
-            (pd.IndexSlice["m"::-1], "dcb"),  # type: ignore
-            (pd.IndexSlice[:"m":-1], "yx"),  # type: ignore
-            (pd.IndexSlice["a":"a":-1], ""),  # type: ignore
-            (pd.IndexSlice["z":"z":-1], ""),  # type: ignore
-            (pd.IndexSlice["m":"m":-1], ""),  # type: ignore
+            (pd.IndexSlice[:"a":-1], "yxdcb"),  # type: ignore[misc]
+            (pd.IndexSlice[:"a":-2], "ydb"),  # type: ignore[misc]
+            (pd.IndexSlice["z"::-1], "yxdcb"),  # type: ignore[misc]
+            (pd.IndexSlice["z"::-3], "yc"),  # type: ignore[misc]
+            (pd.IndexSlice["m"::-1], "dcb"),  # type: ignore[misc]
+            (pd.IndexSlice[:"m":-1], "yx"),  # type: ignore[misc]
+            (pd.IndexSlice["a":"a":-1], ""),  # type: ignore[misc]
+            (pd.IndexSlice["z":"z":-1], ""),  # type: ignore[misc]
+            (pd.IndexSlice["m":"m":-1], ""),  # type: ignore[misc]
         ],
     )
     def test_slice_locs_negative_step(self, in_slice, expected):
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index a7c5734ef9b02..bfcac5d433d2c 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -21,6 +21,13 @@ def test_can_hold_identifiers(self):
         key = idx[0]
         assert idx._can_hold_identifiers_and_holds_name(key) is False
 
+    def test_format(self):
+        # GH35439
+        idx = self.create_index()
+        max_width = max(len(str(x)) for x in idx)
+        expected = [str(x).ljust(max_width) for x in idx]
+        assert idx.format() == expected
+
     def test_numeric_compat(self):
         pass  # override Base method
diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py
index be193e0854d8d..d8e56661b7d61 100644
--- a/pandas/tests/indexing/multiindex/test_indexing_slow.py
+++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py
@@ -15,7 +15,7 @@ def test_multiindex_get_loc():  # GH7724, GH2646
     with warnings.catch_warnings(record=True):
 
         # test indexing into a multi-index before & past the lexsort depth
-        from numpy.random import randint, choice, randn
+        from numpy.random import choice, randint, randn
 
         cols = ["jim", "joe", "jolie", "joline", "jolia"]
 
diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py
index 572cb9da405d1..bafe5068e1418 100644
--- a/pandas/tests/indexing/multiindex/test_sorted.py
+++ b/pandas/tests/indexing/multiindex/test_sorted.py
@@ -43,9 +43,13 @@ def test_frame_getitem_not_sorted2(self, key):
         df2 = df.set_index(["col1", "col2"])
         df2_original = df2.copy()
-        return_value = df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True)
+        with tm.assert_produces_warning(FutureWarning):
+            return_value = df2.index.set_levels(
+                ["b", "d", "a"], level="col1", inplace=True
+            )
         assert return_value is None
-        return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True)
+        with tm.assert_produces_warning(FutureWarning):
+            return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True)
         assert return_value is None
 
         assert not df2.index.is_lexsorted()
         assert not df2.index.is_monotonic
diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py
index b807795b9c309..91be1d913001b 100644
--- a/pandas/tests/indexing/multiindex/test_xs.py
+++ b/pandas/tests/indexing/multiindex/test_xs.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range
+from pandas import DataFrame, Index, IndexSlice, MultiIndex, Series, concat, date_range
 import pandas._testing as tm
 import pandas.core.common as com
 
@@ -220,6 +220,27 @@ def test_xs_level_series_slice_not_implemented(
         s[2000, 3:4]
 
 
+def test_xs_IndexSlice_argument_not_implemented():
+    # GH 35301
+
+    index = MultiIndex(
0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) + + series = Series(np.random.randn(6), index=index) + frame = DataFrame(np.random.randn(6, 4), index=index) + + msg = ( + "Expected label or tuple of labels, got " + r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)" + ) + with pytest.raises(TypeError, match=msg): + frame.xs(IndexSlice[("foo", "qux", 0), :]) + with pytest.raises(TypeError, match=msg): + series.xs(IndexSlice[("foo", "qux", 0), :]) + + def test_series_getitem_multiindex_xs(): # GH6258 dt = list(date_range("20130903", periods=3)) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 30b13b6ea9fce..193800fae751f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _is_numpy_dev + import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range import pandas._testing as tm @@ -945,6 +947,7 @@ def test_loc_setitem_empty_append(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(_is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index e236b3da73c69..84805d06df4a8 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2141,6 +2141,15 @@ def test_dict_entries(self): assert "'a': 1" in val assert "'b': 2" in val + def test_categorical_columns(self): + # GH35439 + data = [[4, 2], [3, 2], [4, 3]] + cols = ["aaaaaaaaa", "b"] + df = pd.DataFrame(data, columns=cols) + df_cat_cols = pd.DataFrame(data, columns=pd.CategoricalIndex(cols)) + + assert df.to_string() == df_cat_cols.to_string() + def test_period(self): # GH 12615 df = pd.DataFrame( diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 9c6910637fa7e..3ef5157655e78 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1682,6 +1682,12 @@ def f(a, b, styler): result = styler.pipe((f, "styler"), a=1, b=2) assert result == (1, 2, styler) + def test_no_cell_ids(self): + # GH 35588 + df = pd.DataFrame(data=[[0]]) + s = Styler(df, uuid="_", cell_ids=False).render() + assert s.find('
+
 
 @td.skip_if_no_mpl
 class TestStylerMatplotlibDep:
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index 4c86e3a16b135..753b8b6eda9c5 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -607,3 +607,39 @@ def test_to_csv_errors(self, errors):
             ser.to_csv(path, errors=errors)
         # No use in reading back the data as it is not the same anymore
         # due to the error handling
+
+    def test_to_csv_binary_handle(self):
+        """
+        Binary file objects should work if 'mode' contains a 'b'.
+
+        GH 35058 and GH 19827
+        """
+        df = tm.makeDataFrame()
+        with tm.ensure_clean() as path:
+            with open(path, mode="w+b") as handle:
+                df.to_csv(handle, mode="w+b")
+            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
+    def test_to_csv_encoding_binary_handle(self):
+        """
+        Binary file objects should honor a specified encoding.
+
+        GH 23854 and GH 13068 with binary handles
+        """
+        # example from GH 23854
+        content = "a, b, 🐟".encode("utf-8-sig")
+        buffer = io.BytesIO(content)
+        df = pd.read_csv(buffer, encoding="utf-8-sig")
+
+        buffer = io.BytesIO()
+        df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False)
+        buffer.seek(0)  # tests whether file handle wasn't closed
+        assert buffer.getvalue().startswith(content)
+
+        # example from GH 13068
+        with tm.ensure_clean() as path:
+            with open(path, "w+b") as handle:
+                pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig")
+
+                handle.seek(0)
+                assert handle.read().startswith(b'\xef\xbb\xbf""')
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index 509e5bcb33304..93ad3739e59c7 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -573,6 +573,54 @@ def test_to_latex_longtable_caption_label(self):
 """
         assert result_cl == expected_cl
 
+    def test_to_latex_position(self):
+        the_position = "h"
+
+        df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+        # test when only the position is provided
+        result_p = df.to_latex(position=the_position)
+
+        expected_p = r"""\begin{table}[h]
+\centering
+\begin{tabular}{lrl}
+\toprule
+{} &  a &   b \\
+\midrule
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\bottomrule
+\end{tabular}
+\end{table}
+"""
+        assert result_p == expected_p
+
+    def test_to_latex_longtable_position(self):
+        the_position = "t"
+
+        df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+        # test when only the position is provided
+        result_p = df.to_latex(longtable=True, position=the_position)
+
+        expected_p = r"""\begin{longtable}[t]{lrl}
+\toprule
+{} &  a &   b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{3}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+0 &  1 &  b1 \\
+1 &  2 &  b2 \\
+\end{longtable}
+"""
+        assert result_p == expected_p
+
     def test_to_latex_escape_special_chars(self):
         special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"]
         df = DataFrame(data=special_characters)
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 12e73bae40eac..5154a9ba6fdf0 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -18,7 +18,7 @@
 from pandas.errors import DtypeWarning, EmptyDataError, ParserError
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
+from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context
 import pandas._testing as tm
 
 from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser
@@ -2179,3 +2179,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers):
     parser = all_parsers
     with pytest.raises(ValueError, match="Names should be an ordered collection."):
         parser.read_csv(StringIO(data), names=set("QAZ"))
+
+
+def test_read_csv_with_use_inf_as_na(all_parsers):
+    # https://github.com/pandas-dev/pandas/issues/35493
+    parser = all_parsers
+    data = "1.0\nNaN\n3.0"
+    with option_context("use_inf_as_na", True):
+        result = parser.read_csv(StringIO(data), header=None)
+    expected = DataFrame([1.0, np.nan, 3.0])
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index df014171be817..0942c79837e7c 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -1751,9 +1751,9 @@ def col(t, column):
 
             # try to index a col which isn't a data_column
             msg = (
-                f"column string2 is not a data_column.\n"
-                f"In order to read column string2 you must reload the dataframe \n"
-                f"into HDFStore and include string2 with the data_columns argument."
+                "column string2 is not a data_column.\n"
+                "In order to read column string2 you must reload the dataframe \n"
+                "into HDFStore and include string2 with the data_columns argument."
             )
             with pytest.raises(AttributeError, match=msg):
                 store.create_table_index("f", columns=["string2"])
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index dde38eb55ea7f..5ce2233bc0cd0 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -378,6 +378,17 @@ def test_unknown_engine(self):
             with pytest.raises(ValueError, match="Unknown engine"):
                 pd.read_csv(path, engine="pyt")
 
+    def test_binary_mode(self):
+        """
+        'encoding' shouldn't be passed to 'open' in binary mode.
+
+        GH 35058
+        """
+        with tm.ensure_clean() as path:
+            df = tm.makeDataFrame()
+            df.to_csv(path, mode="w+b")
+            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
 
 def test_is_fsspec_url():
     assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 59c9bd0a36d3d..902a3d5d2a397 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -114,6 +114,22 @@ def test_compression_warning(compression_only):
             df.to_csv(f, compression=compression_only)
 
 
+def test_compression_binary(compression_only):
+    """
+    Binary file handles support compression.
+
+    GH22555
+    """
+    df = tm.makeDataFrame()
+    with tm.ensure_clean() as path:
+        with open(path, mode="wb") as file:
+            df.to_csv(file, mode="wb", compression=compression_only)
+            file.seek(0)  # file shouldn't be closed
+        tm.assert_frame_equal(
+            df, pd.read_csv(path, index_col=0, compression=compression_only)
+        )
+
+
 def test_with_missing_lzma():
     """Tests if import pandas works when lzma is not present."""
     # https://github.com/pandas-dev/pandas/issues/27575
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index c397a61616c1c..a0723452ccb70 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -15,7 +15,8 @@
 )
 # the ignore on the following line accounts for to_csv returning Optional(str)
 # in general, but always str in the case we give no filename
-text = df1.to_csv(index=False).encode()  # type: ignore
+# error: Item "None" of "Optional[str]" has no attribute "encode"
+text = df1.to_csv(index=False).encode()  # type: ignore[union-attr]
 
 
 @pytest.fixture
@@ -37,8 +38,8 @@ def test_read_csv(cleared_fs):
 
 
 def test_reasonable_error(monkeypatch, cleared_fs):
-    from fsspec.registry import known_implementations
     from fsspec import registry
+    from fsspec.registry import known_implementations
 
     registry.target.clear()
     with pytest.raises(ValueError) as e:
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 4d93119ffa3f5..eacf4fa08545d 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -11,8 +11,7 @@
 
 @td.skip_if_no("gcsfs")
 def test_read_csv_gcs(monkeypatch):
-    from fsspec import AbstractFileSystem
-    from fsspec import registry
+    from fsspec import AbstractFileSystem, registry
 
     registry.target.clear()  # noqa  # remove state
 
@@ -37,8 +36,7 @@ def open(*args, **kwargs):
 
 @td.skip_if_no("gcsfs")
 def test_to_csv_gcs(monkeypatch):
-    from fsspec import AbstractFileSystem
-    from fsspec import registry
+    from fsspec import AbstractFileSystem, registry
 
     registry.target.clear()  # noqa  # remove state
     df1 = DataFrame(
@@ -76,8 +74,7 @@ def mock_get_filepath_or_buffer(*args, **kwargs):
 @td.skip_if_no("gcsfs")
 def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
     """Regression test for writing to a not-yet-existent GCS Parquet file."""
-    from fsspec import AbstractFileSystem
-    from fsspec import registry
+    from fsspec import AbstractFileSystem, registry
 
     registry.target.clear()  # noqa  # remove state
     df1 = DataFrame(
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 0991fae39138e..29b787d39c09d 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -48,10 +48,10 @@
 
 try:
     import sqlalchemy
-    import sqlalchemy.schema
-    import sqlalchemy.sql.sqltypes as sqltypes
     from sqlalchemy.ext import declarative
     from sqlalchemy.orm import session as sa_session
+    import sqlalchemy.schema
+    import sqlalchemy.sql.sqltypes as sqltypes
 
     SQLALCHEMY_INSTALLED = True
 except ImportError:
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 896d3278cdde1..3b1ff233c5ec1 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -13,7 +13,6 @@
 from pandas import DataFrame, Series
 import pandas._testing as tm
 
-
 """
 This is a common base class used for various plotting tests
 """
@@ -24,6 +23,7 @@ class TestPlotBase:
 
     def setup_method(self, method):
         import matplotlib as mpl
+
         from pandas.plotting._matplotlib import compat
 
         mpl.rcdefaults()
@@ -187,8 +187,8 @@ def _check_colors(
             Series used for color grouping key
             used for andrew_curves, parallel_coordinates, radviz test
         """
+        from matplotlib.collections import Collection, LineCollection, PolyCollection
         from matplotlib.lines import Line2D
-        from matplotlib.collections import Collection, PolyCollection, LineCollection
 
         conv = self.colorconverter
         if linecolors is not None:
diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index 317a994bd9a32..ee43e5d7072fe 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -2408,8 +2408,8 @@ def test_specified_props_kwd_plot_box(self, props, expected):
         assert result[expected][0].get_color() == "C1"
 
     def test_default_color_cycle(self):
-        import matplotlib.pyplot as plt
         import cycler
+        import matplotlib.pyplot as plt
 
         colors = list("rgbk")
         plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors)
@@ -2953,8 +2953,8 @@ def _check(axes):
     @td.skip_if_no_scipy
     def test_memory_leak(self):
         """ Check that every plot type gets properly collected. """
-        import weakref
         import gc
+        import weakref
 
         results = {}
         for kind in plotting.PlotAccessor._all_kinds:
@@ -3032,8 +3032,8 @@ def test_df_subplots_patterns_minorticks(self):
     @pytest.mark.slow
     def test_df_gridspec_patterns(self):
         # GH 10819
-        import matplotlib.pyplot as plt
         import matplotlib.gridspec as gridspec
+        import matplotlib.pyplot as plt
 
         ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10))
 
@@ -3422,9 +3422,9 @@ def test_xlabel_ylabel_dataframe_subplots(
 
 
 def _generate_4_axes_via_gridspec():
-    import matplotlib.pyplot as plt
     import matplotlib as mpl
     import matplotlib.gridspec  # noqa
+    import matplotlib.pyplot as plt
 
     gs = mpl.gridspec.GridSpec(2, 2)
     ax_tl = plt.subplot(gs[0, 0])
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
index b6a6c326c3df3..34c881855d16a 100644
--- a/pandas/tests/plotting/test_hist_method.py
+++ b/pandas/tests/plotting/test_hist_method.py
@@ -101,7 +101,7 @@ def test_hist_layout_with_by(self):
 
     @pytest.mark.slow
     def test_hist_no_overlap(self):
-        from matplotlib.pyplot import subplot, gcf
+        from matplotlib.pyplot import gcf, subplot
 
         x = Series(randn(2))
         y = Series(randn(2))
@@ -352,6 +352,7 @@ class TestDataFrameGroupByPlots(TestPlotBase):
     @pytest.mark.slow
     def test_grouped_hist_legacy(self):
         from matplotlib.patches import Rectangle
+
         from pandas.plotting._matplotlib.hist import _grouped_hist
 
         df = DataFrame(randn(500, 2), columns=["A", "B"])
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index 75eeede472fe9..f5c1c58f3f7ed 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -131,9 +131,10 @@ def test_scatter_matrix_axis(self):
 
     @pytest.mark.slow
     def test_andrews_curves(self, iris):
-        from pandas.plotting import andrews_curves
         from matplotlib import cm
 
+        from pandas.plotting import andrews_curves
+
         df = iris
 
         _check_plot_works(andrews_curves, frame=df, class_column="Name")
@@ -206,9 +207,10 @@ def test_andrews_curves(self, iris):
 
     @pytest.mark.slow
     def test_parallel_coordinates(self, iris):
-        from pandas.plotting import parallel_coordinates
         from matplotlib import cm
 
+        from pandas.plotting import parallel_coordinates
+
         df = iris
 
         ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name")
@@ -279,9 +281,10 @@ def test_parallel_coordinates_with_sorted_labels(self):
 
     @pytest.mark.slow
     def test_radviz(self, iris):
-        from pandas.plotting import radviz
         from matplotlib import cm
 
+        from pandas.plotting import radviz
+
         df = iris
 
         _check_plot_works(radviz, frame=df, class_column="Name")
@@ -397,6 +400,7 @@ def test_get_standard_colors_no_appending(self):
         # Make sure not to add more colors so that matplotlib can cycle
         # correctly.
         from matplotlib import cm
+
         from pandas.plotting._matplotlib.style import _get_standard_colors
 
         color_before = cm.gnuplot(range(5))
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 151bb3bed7207..cc00626e992f3 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -452,7 +452,7 @@ def test_hist_layout_with_by(self):
 
     @pytest.mark.slow
     def test_hist_no_overlap(self):
-        from matplotlib.pyplot import subplot, gcf
+        from matplotlib.pyplot import gcf, subplot
 
         x = Series(randn(2))
         y = Series(randn(2))
@@ -827,6 +827,7 @@ def test_standard_colors(self):
     @pytest.mark.slow
     def test_standard_colors_all(self):
         import matplotlib.colors as colors
+
         from pandas.plotting._matplotlib.style import _get_standard_colors
 
         # multiple colors like mediumaquamarine
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index 0159fabd04d59..38cf2cc2402a1 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -1279,6 +1279,43 @@ def test_concat_ignore_index(self, sort):
 
         tm.assert_frame_equal(v1, expected)
 
+    @pytest.mark.parametrize(
+        "name_in1,name_in2,name_in3,name_out",
+        [
+            ("idx", "idx", "idx", "idx"),
+            ("idx", "idx", None, "idx"),
+            ("idx", None, None, "idx"),
+            ("idx1", "idx2", None, None),
+            ("idx1", "idx1", "idx2", None),
+            ("idx1", "idx2", "idx3", None),
+            (None, None, None, None),
+        ],
+    )
+    def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
+        # GH13475
+        indices = [
+            pd.Index(["a", "b", "c"], name=name_in1),
+            pd.Index(["b", "c", "d"], name=name_in2),
+            pd.Index(["c", "d", "e"], name=name_in3),
+        ]
+        frames = [
+            pd.DataFrame({c: [0, 1, 2]}, index=i)
+            for i, c in zip(indices, ["x", "y", "z"])
+        ]
+        result = pd.concat(frames, axis=1)
+
+        exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out)
+        expected = pd.DataFrame(
+            {
+                "x": [0, 1, 2, np.nan, np.nan],
+                "y": [np.nan, 0, 1, 2, np.nan],
+                "z": [np.nan, np.nan, 0, 1, 2],
+            },
+            index=exp_ind,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
     def test_concat_multiindex_with_keys(self):
         index = MultiIndex(
             levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 2b75a1ec6ca6e..79879ef346f53 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -799,7 +799,7 @@ def test_invalid_separator(self):
         expected = expected.set_index(["id", "year"])[
             ["X", "A2010", "A2011", "B2010", "A", "B"]
         ]
-        expected.index.set_levels([0, 1], level=0, inplace=True)
+        expected.index = expected.index.set_levels([0, 1], level=0)
         result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
         tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
 
@@ -861,7 +861,7 @@ def test_invalid_suffixtype(self):
 
         expected = pd.DataFrame(exp_data).astype({"year": "int"})
         expected = expected.set_index(["id", "year"])
-        expected.index.set_levels([0, 1], level=0, inplace=True)
+        expected.index = expected.index.set_levels([0, 1], level=0)
         result = wide_to_long(df, ["A", "B"], i="id", j="year")
         tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index c07a5673fe503..67b3151b0ff9c 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
             ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
         )
         expected_columns = pd.Index(["a", "b"], name="C2")
-        expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
+        expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
         expected = pd.DataFrame(
             expected_data, index=expected_index, columns=expected_columns
         )
@@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
             values="Sales",
             index="Month",
             columns="Year",
-            dropna=observed,
+            observed=observed,
             aggfunc="sum",
         )
 
         expected_columns = pd.Int64Index([2013, 2014], name="Year")
         expected_index = pd.CategoricalIndex(
-            ["January"], categories=months, ordered=False, name="Month"
+            months, categories=months, ordered=False, name="Month"
         )
+        expected_data = [[320, 120]] + [[0, 0]] * 11
         expected = pd.DataFrame(
-            [[320, 120]], index=expected_index, columns=expected_columns
+            expected_data, index=expected_index, columns=expected_columns
        )
-        if not observed:
-            result = result.dropna().astype(np.int64)
+        if observed:
+            expected = expected.loc[["January"]]
 
         tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py
index e1e2ea1a5cec8..03830019affa1 100644
--- a/pandas/tests/scalar/test_nat.py
+++ b/pandas/tests/scalar/test_nat.py
@@ -513,11 +513,67 @@ def test_to_numpy_alias():
     assert isna(expected) and isna(result)
 
 
-@pytest.mark.parametrize("other", [Timedelta(0), Timestamp(0)])
+@pytest.mark.parametrize(
+    "other",
+    [
+        Timedelta(0),
+        Timedelta(0).to_pytimedelta(),
+        pytest.param(
+            Timedelta(0).to_timedelta64(),
+            marks=pytest.mark.xfail(
+                reason="td64 doesnt return NotImplemented, see numpy#17017"
+            ),
+        ),
+        Timestamp(0),
+        Timestamp(0).to_pydatetime(),
+        pytest.param(
+            Timestamp(0).to_datetime64(),
+            marks=pytest.mark.xfail(
+                reason="dt64 doesnt return NotImplemented, see numpy#17017"
+            ),
+        ),
+        Timestamp(0).tz_localize("UTC"),
+        NaT,
+    ],
+)
 def test_nat_comparisons(compare_operators_no_eq_ne, other):
     # GH 26039
-    assert getattr(NaT, compare_operators_no_eq_ne)(other) is False
-    assert getattr(other, compare_operators_no_eq_ne)(NaT) is False
+    opname = compare_operators_no_eq_ne
+
+    assert getattr(NaT, opname)(other) is False
+
+    op = getattr(operator, opname.strip("_"))
+    assert op(NaT, other) is False
+    assert op(other, NaT) is False
+
+
+@pytest.mark.parametrize("other", [np.timedelta64(0, "ns"), np.datetime64("now", "ns")])
+def test_nat_comparisons_numpy(other):
+    # Once numpy#17017 is fixed and the xfailed cases in test_nat_comparisons
+    # pass, this test can be removed
+    assert not NaT == other
+    assert NaT != other
+    assert not NaT < other
+    assert not NaT > other
+    assert not NaT <= other
+    assert not NaT >= other
+
+
+@pytest.mark.parametrize("other", ["foo", 2, 2.0])
+@pytest.mark.parametrize("op", [operator.le, operator.lt, operator.ge, operator.gt])
+def test_nat_comparisons_invalid(other, op):
+    # GH#35585
+    assert not NaT == other
+    assert not other == NaT
+
+    assert NaT != other
+    assert other != NaT
+
+    with pytest.raises(TypeError):
+        op(NaT, other)
+
+    with pytest.raises(TypeError):
+        op(other, NaT)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index 0b34fab7b80b1..088f8681feb99 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -11,7 +11,6 @@
 from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range
 import pandas._testing as tm
 
-
 """
 Also test support for datetime64[ns] in Series / DataFrame
 """
@@ -166,6 +165,7 @@ def test_getitem_setitem_datetime_tz_pytz():
 
 def test_getitem_setitem_datetime_tz_dateutil():
     from dateutil.tz import tzutc
+
     from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
 
     tz = (
diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py
index 19caf4eccf748..4b4ef5ea046be 100644
--- a/pandas/tests/series/methods/test_asof.py
+++ b/pandas/tests/series/methods/test_asof.py
@@ -90,7 +90,7 @@ def test_with_nan(self):
         tm.assert_series_equal(result, expected)
 
     def test_periodindex(self):
-        from pandas import period_range, PeriodIndex
+        from pandas import PeriodIndex, period_range
 
         # array or list or dates
         N = 50
diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py
index 7c82edbaec177..45592f8d99b93 100644
--- a/pandas/tests/series/methods/test_truncate.py
+++ b/pandas/tests/series/methods/test_truncate.py
@@ -141,3 +141,14 @@ def test_truncate_multiindex(self):
         expected = df.col
 
         tm.assert_series_equal(result, expected)
+
+    def test_truncate_one_element_series(self):
+        # GH 35544
+        series = pd.Series([0.1], index=pd.DatetimeIndex(["2020-08-04"]))
+        before = pd.Timestamp("2020-08-02")
+        after = pd.Timestamp("2020-08-04")
+
+        result = series.truncate(before=before, after=after)
+
+        # the input Series and the expected Series are the same
+        tm.assert_series_equal(result, series)
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index 5c8a0d224c4f9..ef2bafd4ea2ad 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -195,8 +195,8 @@ def test_add_with_duplicate_index(self):
         tm.assert_series_equal(result, expected)
 
     def test_add_na_handling(self):
-        from decimal import Decimal
         from datetime import date
+        from decimal import Decimal
 
         s = Series(
             [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)]
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index e718a6b759963..b32c5e91af295 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -90,7 +90,7 @@ def test_statsmodels():
 def test_scikit_learn(df):
 
     sklearn = import_module("sklearn")  # noqa
-    from sklearn import svm, datasets
+    from sklearn import datasets, svm
 
     digits = datasets.load_digits()
     clf = svm.SVC(gamma=0.001, C=100.0)
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 1ba73292dc0b4..724558bd49ea2 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -63,8 +63,8 @@ def setup_method(self, method):
         ).sum()
 
         # use Int64Index, to make sure things work
-        self.ymd.index.set_levels(
-            [lev.astype("i8") for lev in self.ymd.index.levels], inplace=True
+        self.ymd.index = self.ymd.index.set_levels(
+            [lev.astype("i8") for lev in self.ymd.index.levels]
         )
         self.ymd.index.set_names(["year", "month", "day"], inplace=True)
 
diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py
index 1e193f22a6698..f68d83f7f4d58 100644
--- a/pandas/tests/tools/test_to_timedelta.py
+++ b/pandas/tests/tools/test_to_timedelta.py
@@ -166,3 +166,16 @@ def test_to_timedelta_ignore_strings_unit(self):
         arr = np.array([1, 2, "error"], dtype=object)
         result = pd.to_timedelta(arr, unit="ns", errors="ignore")
         tm.assert_numpy_array_equal(result, arr)
+
+    def test_to_timedelta_nullable_int64_dtype(self):
+        # GH 35574
+        expected = Series([timedelta(days=1), timedelta(days=2)])
+        result = to_timedelta(Series([1, 2], dtype="Int64"), unit="days")
+
+        tm.assert_series_equal(result, expected)
+
+        # IntegerArray Series with nulls
+        expected = Series([timedelta(days=1), None])
+        result = to_timedelta(Series([1, None], dtype="Int64"), unit="days")
+
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py
index 4f184b78f34a1..87cd97f853f4d 100644
--- a/pandas/tests/tslibs/test_conversion.py
+++ b/pandas/tests/tslibs/test_conversion.py
@@ -78,6 +78,14 @@ def test_tz_convert_corner(arr):
     tm.assert_numpy_array_equal(result, arr)
 
 
+def test_tz_convert_readonly():
+    # GH#35530
+    arr = np.array([0], dtype=np.int64)
+    arr.setflags(write=False)
+    result = tzconversion.tz_convert_from_utc(arr, UTC)
+    tm.assert_numpy_array_equal(result, arr)
+
+
 @pytest.mark.parametrize("copy", [True, False])
 @pytest.mark.parametrize("dtype", ["M8[ns]", "M8[s]"])
 def test_length_zero_copy(dtype, copy):
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py
index 1284cc9d4f49b..a7b5aeac560e4 100644
--- a/pandas/tests/util/test_assert_series_equal.py
+++ b/pandas/tests/util/test_assert_series_equal.py
@@ -281,3 +281,18 @@ class MySeries(Series):
 
     with pytest.raises(AssertionError, match="Series classes are different"):
         tm.assert_series_equal(s3, s1, check_series_type=True)
+
+
+def test_series_equal_exact_for_nonnumeric():
+    # https://github.com/pandas-dev/pandas/issues/35446
+    s1 = Series(["a", "b"])
+    s2 = Series(["a", "b"])
+    s3 = Series(["b", "a"])
+
+    tm.assert_series_equal(s1, s2, check_exact=True)
+    tm.assert_series_equal(s2, s1, check_exact=True)
+
+    with pytest.raises(AssertionError):
+        tm.assert_series_equal(s1, s3, check_exact=True)
+    with pytest.raises(AssertionError):
+        tm.assert_series_equal(s3, s1, check_exact=True)
diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py
index 744ca264e91d9..e1dcac06c39cc 100644
--- a/pandas/tests/window/test_grouper.py
+++ b/pandas/tests/window/test_grouper.py
@@ -214,3 +214,144 @@ def foo(x):
             name="value",
         )
         tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_center_center(self):
+        # GH 35552
+        series = Series(range(1, 6))
+        result = series.groupby(series).rolling(center=True, window=3).mean()
+        expected = Series(
+            [np.nan] * 5,
+            index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))),
+        )
+        tm.assert_series_equal(result, expected)
+
+        series = Series(range(1, 5))
+        result = series.groupby(series).rolling(center=True, window=3).mean()
+        expected = Series(
+            [np.nan] * 4,
+            index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))),
+        )
+        tm.assert_series_equal(result, expected)
+
+        df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)})
+        result = df.groupby("a").rolling(center=True, window=3).mean()
+        expected = pd.DataFrame(
+            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan],
+            index=pd.MultiIndex.from_tuples(
+                (
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                    ("a", 4),
+                    ("b", 5),
+                    ("b", 6),
+                    ("b", 7),
+                    ("b", 8),
+                    ("b", 9),
+                    ("b", 10),
+                ),
+                names=["a", None],
+            ),
+            columns=["b"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)})
+        result = df.groupby("a").rolling(center=True, window=3).mean()
+        expected = pd.DataFrame(
+            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan],
+            index=pd.MultiIndex.from_tuples(
+                (
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                    ("a", 4),
+                    ("b", 5),
+                    ("b", 6),
+                    ("b", 7),
+                    ("b", 8),
+                    ("b", 9),
+                ),
+                names=["a", None],
+            ),
+            columns=["b"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_subselect_rolling(self):
+        # GH 35486
+        df = DataFrame(
+            {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [10, 20, 30, 20]}
+        )
+        result = df.groupby("a")[["b"]].rolling(2).max()
+        expected = DataFrame(
+            [np.nan, np.nan, 2.0, np.nan],
+            columns=["b"],
+            index=pd.MultiIndex.from_tuples(
+                ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = df.groupby("a")["b"].rolling(2).max()
+        expected = Series(
+            [np.nan, np.nan, 2.0, np.nan],
+            index=pd.MultiIndex.from_tuples(
+                ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None]
+            ),
+            name="b",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_subset_with_closed(self):
+        # GH 35549
+        df = pd.DataFrame(
+            {
+                "column1": range(6),
+                "column2": range(6),
+                "group": 3 * ["A", "B"],
+                "date": [pd.Timestamp("2019-01-01")] * 6,
+            }
+        )
+        result = (
+            df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
+        )
+        expected = Series(
+            [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
+            index=pd.MultiIndex.from_tuples(
+                [("A", pd.Timestamp("2019-01-01"))] * 3
+                + [("B", pd.Timestamp("2019-01-01"))] * 3,
+                names=["group", "date"],
+            ),
+            name="column1",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_subset_rolling_subset_with_closed(self):
+        # GH 35549
+        df = pd.DataFrame(
+            {
+                "column1": range(6),
+                "column2": range(6),
+                "group": 3 * ["A", "B"],
+                "date": [pd.Timestamp("2019-01-01")] * 6,
+            }
+        )
+
+        result = (
+            df.groupby("group")[["column1", "date"]]
+            .rolling("1D", on="date", closed="left")["column1"]
+            .sum()
+        )
+        expected = Series(
+            [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
+            index=pd.MultiIndex.from_tuples(
+                [("A", pd.Timestamp("2019-01-01"))] * 3
+                + [("B", pd.Timestamp("2019-01-01"))] * 3,
+                names=["group", "date"],
+            ),
+            name="column1",
+        )
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
index 6135ccba1573d..f81bca7e85156 100644
--- a/pandas/util/_decorators.py
+++ b/pandas/util/_decorators.py
@@ -323,7 +323,8 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]:
         sig = inspect.Signature(params)
 
         # https://github.com/python/typing/issues/598
-        func.__signature__ = sig  # type: ignore
+        # error: "F" has no attribute "__signature__"
+        func.__signature__ = sig  # type: ignore[attr-defined]
         return cast(F, wrapper)
 
     return decorate
@@ -357,8 +358,12 @@ def decorator(decorated: F) -> F:
 
         for docstring in docstrings:
             if hasattr(docstring, "_docstring_components"):
+                # error: Item "str" of "Union[str, Callable[..., Any]]" has no
+                # attribute "_docstring_components"  [union-attr]
+                # error: Item "function" of "Union[str, Callable[..., Any]]"
+                # has no attribute "_docstring_components"  [union-attr]
                 docstring_components.extend(
-                    docstring._docstring_components  # type: ignore
+                    docstring._docstring_components  # type: ignore[union-attr]
                 )
             elif isinstance(docstring, str) or docstring.__doc__:
                 docstring_components.append(docstring)
@@ -373,7 +378,10 @@ def decorator(decorated: F) -> F:
             ]
         )
 
-        decorated._docstring_components = docstring_components  # type: ignore
+        # error: "F" has no attribute "_docstring_components"
+        decorated._docstring_components = (  # type: ignore[attr-defined]
+            docstring_components
+        )
         return decorated
 
     return decorator
diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py
index f413490764124..3a8a1a3144269 100644
--- a/pandas/util/_doctools.py
+++ b/pandas/util/_doctools.py
@@ -53,8 +53,8 @@ def plot(self, left, right, labels=None, vertical: bool = True):
         vertical : bool, default True
             If True, use vertical layout. If False, use horizontal layout.
         """
-        import matplotlib.pyplot as plt
         import matplotlib.gridspec as gridspec
+        import matplotlib.pyplot as plt
 
         if not isinstance(left, list):
             left = [left]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7bf3df176b378..6a87b0a99a4f8 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -11,7 +11,7 @@ cpplint
 flake8<3.8.0
 flake8-comprehensions>=3.1.0
 flake8-rst>=0.6.0,<=0.7.0
-isort==4.3.21
+isort>=5.2.1
 mypy==0.730
 pycodestyle
 gitpython
@@ -73,4 +73,5 @@ cftime
 pyreadstat
 tabulate>=0.8.3
 git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
-git+https://github.com/numpy/numpydoc
\ No newline at end of file
+git+https://github.com/numpy/numpydoc
+pyflakes>=2.2.0
\ No newline at end of file
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
index 5de2a07381ae5..62ec6b9ef07af 100755
--- a/scripts/validate_rst_title_capitalization.py
+++ b/scripts/validate_rst_title_capitalization.py
@@ -138,6 +138,9 @@
     "CategoricalDtype",
     "UTC",
     "Panel",
+    "False",
+    "Styler",
+    "os",
 }
 
 CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}
diff --git a/setup.cfg b/setup.cfg
index ee5725e36d193..e4c0b3dcf37ef 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -122,6 +122,7 @@ check_untyped_defs=True
 strict_equality=True
 warn_redundant_casts = True
 warn_unused_ignores = True
+show_error_codes = True
 
 [mypy-pandas.tests.*]
 check_untyped_defs=False
@@ -174,9 +175,6 @@ check_untyped_defs=False
 [mypy-pandas.core.groupby.ops]
 check_untyped_defs=False
 
-[mypy-pandas.core.indexes.base]
-check_untyped_defs=False
-
 [mypy-pandas.core.indexes.datetimes]
 check_untyped_defs=False
 
@@ -213,9 +211,6 @@ check_untyped_defs=False
 [mypy-pandas.core.window.common]
 check_untyped_defs=False
 
-[mypy-pandas.core.window.ewm]
-check_untyped_defs=False
-
 [mypy-pandas.core.window.expanding]
 check_untyped_defs=False
 
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index be109ea53eb7d..515d23afb93ec 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -42,6 +42,13 @@ datasets into feature matrices for machine learning using reusable
 feature engineering "primitives". Users can contribute their own
 primitives in Python and share them with the rest of the community.
 
+### [Compose](https://github.com/FeatureLabs/compose)
+
+Compose is a machine learning tool for labeling data and prediction engineering.
+It allows you to structure the labeling process by parameterizing
+prediction problems and transforming time-driven relational data into
+target values with cutoff times that can be used for supervised learning.
+
 ## Visualization
 
 ### [Altair](https://altair-viz.github.io/)
@@ -372,3 +379,4 @@ authors to coordinate on the namespace.
   | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` |
   | [pandas_path](https://github.com/drivendataorg/pandas-path/) | `path` | `Index`, `Series` |
   | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` |
+  | [composeml](https://github.com/FeatureLabs/compose) | `slice` | `DataFrame` |
diff --git a/web/pandas/index.html b/web/pandas/index.html
index 83d0f48197033..75c797d6dd93d 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -63,7 +63,7 @@
With the support of:
{% if releases %}

Latest version: {{ releases[0].name }}