diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
index 7dd2e04249492..87a5b7905fc6d 100644
--- a/.github/CODE_OF_CONDUCT.md
+++ b/.github/CODE_OF_CONDUCT.md
@@ -60,4 +60,3 @@ and the [Swift Code of Conduct][swift].
 [homepage]: https://www.contributor-covenant.org
 [version]: https://www.contributor-covenant.org/version/1/3/0/
 [swift]: https://swift.org/community/#code-of-conduct
-
diff --git a/.github/workflows/autoupdate-pre-commit-config.yml b/.github/workflows/autoupdate-pre-commit-config.yml
new file mode 100644
index 0000000000000..42d6ae6606442
--- /dev/null
+++ b/.github/workflows/autoupdate-pre-commit-config.yml
@@ -0,0 +1,33 @@
+name: "Update pre-commit config"
+
+on:
+  schedule:
+    - cron: "0 7 * * 1"  # At 07:00 on each Monday.
+  workflow_dispatch:
+
+jobs:
+  update-pre-commit:
+    if: github.repository_owner == 'pandas-dev'
+    name: Autoupdate pre-commit config
+    runs-on: ubuntu-latest
+    steps:
+      - name: Set up Python
+        uses: actions/setup-python@v2
+      - name: Cache multiple paths
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cache/pre-commit
+            ~/.cache/pip
+          key: pre-commit-autoupdate-${{ runner.os }}-build
+      - name: Update pre-commit config packages
+        uses: technote-space/create-pr-action@v2
+        with:
+          GITHUB_TOKEN: ${{ secrets.ACTION_TRIGGER_TOKEN }}
+          EXECUTE_COMMANDS: |
+            pip install pre-commit
+            pre-commit autoupdate || (exit 0);
+            pre-commit run -a || (exit 0);
+          COMMIT_MESSAGE: "⬆️ UPGRADE: Autoupdate pre-commit config"
+          PR_BRANCH_NAME: "pre-commit-config-update-${PR_ID}"
+          PR_TITLE: "⬆️ UPGRADE: Autoupdate pre-commit config"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index db1fc30111a2d..2848437a76a16 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -4,7 +4,9 @@ on:
   push:
     branches: master
   pull_request:
-    branches: master
+    branches:
+      - master
+      - 1.2.x
 
 env:
   ENV_FILE: environment.yml
@@ -16,7 +18,7 @@ jobs:
 
     steps:
     - name: Setting conda path
-      run: echo "::add-path::${HOME}/miniconda3/bin"
+      run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
 
     - name: Checkout
       uses: actions/checkout@v1
@@ -35,12 +37,6 @@ jobs:
         ci/code_checks.sh lint
       if: always()
 
-    - name: Dependencies consistency
-      run: |
-        source activate pandas-dev
-        ci/code_checks.sh dependencies
-      if: always()
-
     - name: Checks on imported code
       run: |
         source activate pandas-dev
@@ -102,7 +98,7 @@ jobs:
 
     steps:
     - name: Setting conda path
-      run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}"
+      run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
 
     - name: Checkout
       uses: actions/checkout@v1
@@ -123,7 +119,7 @@ jobs:
     # This can be removed when the ipython directive fails when there are errors,
    # including the `tee sphinx.log` in the previous step (https://github.com/ipython/ipython/issues/11547)
     - name: Check ipython directive errors
-      run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log"
+      run: "! grep -B10 \"^<<<-------------------------------------------------------------------------$\" sphinx.log"
 
     - name: Install ssh key
       run: |
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000000000..723347913ac38
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,14 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [master]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-python@v2
+    - uses: pre-commit/action@v2.0.0
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
new file mode 100644
index 0000000000000..2f55a180bc88c
--- /dev/null
+++ b/.github/workflows/stale-pr.yml
@@ -0,0 +1,21 @@
+name: "Stale PRs"
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    - cron: "0 0 * * *"
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v3
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please update or respond to this comment if you're still interested in working on this."
+          skip-stale-pr-message: false
+          stale-pr-label: "Stale"
+          exempt-pr-labels: "Needs Review,Blocked,Needs Discussion"
+          days-before-stale: 30
+          days-before-close: -1
+          remove-stale-when-updated: false
+          debug-only: false
diff --git a/.gitignore b/.gitignore
index 6c3c275c48fb7..1661862a5d066 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@
 *.log
 *.swp
 *.pdb
+*.zip
 .project
 .pydevproject
 .settings
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b7fd797fb7230..717334bfe1299 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,44 +1,168 @@
 repos:
 - repo: https://github.com/python/black
-  rev: 19.10b0
+  rev: 20.8b1
   hooks:
   - id: black
-    language_version: python3
 - repo: https://gitlab.com/pycqa/flake8
-  rev: 3.7.7
+  rev: 3.8.4
   hooks:
   - id: flake8
-    language: python_venv
    additional_dependencies: [flake8-comprehensions>=3.1.0]
   - id: flake8
-    name: flake8-pyx
-    language: python_venv
-    files: \.(pyx|pxd)$
-    types:
-      - file
+    name: flake8 (cython)
+    types: [cython]
     args: [--append-config=flake8/cython.cfg]
   - id: flake8
-    name: flake8-pxd
-    language: python_venv
+    name: flake8 (cython template)
     files: \.pxi\.in$
-    types:
-      - file
+    types: [text]
     args: [--append-config=flake8/cython-template.cfg]
-- repo: https://github.com/pre-commit/mirrors-isort
-  rev: v4.3.21
+- repo: https://github.com/PyCQA/isort
+  rev: 5.6.4
   hooks:
   - id: isort
-    language: python_venv
-    exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$
-- repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v0.730
-  hooks:
-  - id: mypy
-    args:
-    # As long as a some files are excluded from check-untyped-defs
-    # we have to exclude it from the pre-commit hook as the configuration
-    # is based on modules but the hook runs on files.
-    - --no-check-untyped-defs
-    - --follow-imports
-    - skip
-    files: pandas/
+    name: isort (python)
+  - id: isort
+    name: isort (cython)
+    types: [cython]
+- repo: https://github.com/asottile/pyupgrade
+  rev: v2.7.4
+  hooks:
+  - id: pyupgrade
+    args: [--py37-plus]
+- repo: https://github.com/pre-commit/pygrep-hooks
+  rev: v1.7.0
+  hooks:
+  - id: rst-backticks
+  - id: rst-directive-colons
+    types: [text]
+  - id: rst-inline-touching-normal
+    types: [text]
+- repo: local
+  hooks:
+  - id: pip_to_conda
+    name: Generate pip dependency from conda
+    description: This hook checks if the conda environment.yml and requirements-dev.txt are equal
+    language: python
+    entry: python scripts/generate_pip_deps_from_conda.py
+    files: ^(environment.yml|requirements-dev.txt)$
+    pass_filenames: false
+    additional_dependencies: [pyyaml]
+  - id: flake8-rst
+    name: flake8-rst
+    description: Run flake8 on code snippets in docstrings or RST files
+    language: python
+    entry: flake8-rst
+    types: [rst]
+    args: [--filename=*.rst]
+    additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9]
+  - id: non-standard-imports
+    name: Check for non-standard imports
+    language: pygrep
+    entry: |
+      (?x)
+      # Check for imports from pandas.core.common instead of `import pandas.core.common as com`
+      from\ pandas\.core\.common\ import|
+      from\ pandas\.core\ import\ common|
+
+      # Check for imports from collections.abc instead of `from collections import abc`
+      from\ collections\.abc\ import
+
+  - id: non-standard-numpy.random-related-imports
+    name: Check for non-standard numpy.random-related imports excluding pandas/_testing.py
+    language: pygrep
+    exclude: pandas/_testing.py
+    entry: |
+      (?x)
+      # Check for imports from np.random. instead of `from numpy import random` or `from numpy.random import `
+      from\ numpy\ import\ random|
+      from\ numpy.random\ import
+    types: [python]
+  - id: non-standard-imports-in-tests
+    name: Check for non-standard imports in test suite
+    language: pygrep
+    entry: |
+      (?x)
+      # Check for imports from pandas._testing instead of `import pandas._testing as tm`
+      from\ pandas\._testing\ import|
+      from\ pandas\ import\ _testing\ as\ tm|
+
+      # No direct imports from conftest
+      conftest\ import|
+      import\ conftest
+    types: [python]
+    files: ^pandas/tests/
+  - id: incorrect-code-directives
+    name: Check for incorrect code block or IPython directives
+    language: pygrep
+    entry: (\.\. code-block ::|\.\. ipython ::)
+    files: \.(py|pyx|rst)$
+  - id: unwanted-patterns-strings-to-concatenate
+    name: Check for use of not concatenated strings
+    language: python
+    entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate"
+    files: \.(py|pyx|pxd|pxi)$
+  - id: unwanted-patterns-strings-with-wrong-placed-whitespace
+    name: Check for strings with wrong placed spaces
+    language: python
+    entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace"
+    files: \.(py|pyx|pxd|pxi)$
+  - id: unwanted-patterns-private-import-across-module
+    name: Check for import of private attributes across modules
+    language: python
+    entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module"
+    types: [python]
+    exclude: ^(asv_bench|pandas/tests|doc)/
+  - id: unwanted-patterns-private-function-across-module
+    name: Check for use of private functions across modules
+    language: python
+    entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module"
+    types: [python]
+    exclude: ^(asv_bench|pandas/tests|doc)/
+  - id: inconsistent-namespace-usage
+    name: 'Check for inconsistent use of pandas namespace in tests'
+    entry: python scripts/check_for_inconsistent_pandas_namespace.py
+    language: python
+    types: [python]
+    files: ^pandas/tests/
+  - id: FrameOrSeriesUnion
+    name: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias
+    entry: Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]
+    language: pygrep
+    types: [python]
+    exclude: ^pandas/_typing\.py$
+  - id: type-not-class
+    name: Check for use of foo.__class__ instead of type(foo)
+    entry: \.__class__
+    language: pygrep
+    files: \.(py|pyx)$
+  - id: unwanted-typing
+    name: Check for use of comment-based annotation syntax and missing error codes
+    entry: |
+      (?x)
+      \#\ type:\ (?!ignore)|
+      \#\ type:\s?ignore(?!\[)
+    language: pygrep
+    types: [python]
+  - id: no-os-remove
+    name: Check code for instances of os.remove
+    entry: os\.remove
+    language: pygrep
+    types: [python]
+    files: ^pandas/tests/
+    exclude: |
+      (?x)^
+      pandas/tests/io/excel/test_writers\.py|
+      pandas/tests/io/pytables/common\.py|
+      pandas/tests/io/pytables/test_store\.py$
+- repo: https://github.com/asottile/yesqa
+  rev: v1.2.2
+  hooks:
+  - id: yesqa
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v3.3.0
+  hooks:
+  - id: end-of-file-fixer
+    exclude: ^LICENSES/|\.(html|csv|txt|svg|py)$
+  - id: trailing-whitespace
+    exclude: \.(html|svg)$
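An illustrative aside, not part of the patch: the pygrep hooks added above are plain regular expressions run over the source tree. The sketch below copies the `non-standard-imports` entry into Python's `re` module to show what gets flagged versus what passes:

```python
import re

# Verbose-mode pattern copied from the non-standard-imports hook entry above.
PATTERN = re.compile(
    r"""(?x)
    from\ pandas\.core\.common\ import|
    from\ pandas\.core\ import\ common|
    from\ collections\.abc\ import
    """
)

assert PATTERN.search("from pandas.core.common import is_bool_indexer")  # flagged
assert not PATTERN.search("import pandas.core.common as com")  # canonical spelling passes
```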
JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" - - env: - - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)" - - env: - - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)" - - - arch: arm64 - env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" + - JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1" + services: + - mysql + - postgresql - env: - - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" + - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" services: - mysql - postgresql + - arch: arm64 + env: + - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" + - env: # Enabling Deprecations when running tests # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs # See pandas/_testing.py for more details. - - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" + - JOB="3.7, coverage" ENV_FILE="ci/deps/travis-37-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" services: - mysql - postgresql + allow_failures: + # Moved to allowed_failures 2020-09-29 due to timeouts https://github.com/pandas-dev/pandas/issues/36719 - arch: arm64 env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - - dist: bionic - env: - - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" + - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" before_install: @@ -78,12 +78,6 @@ before_install: - uname -a - git --version - ./ci/check_git_tags.sh - # Because travis runs on Google Cloud and has a /etc/boto.cfg, - # it breaks moto import, see: - # https://github.com/spulec/moto/issues/1771 - # https://github.com/boto/boto/issues/3741 - # This overrides travis and tells it to look nowhere. - - export BOTO_CONFIG=/dev/null install: - echo "install start" @@ -95,7 +89,7 @@ install: script: - echo "script start" - echo "$JOB" - - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi + - source activate pandas-dev - ci/run_tests.sh after_script: diff --git a/AUTHORS.md b/AUTHORS.md index f576e333f9448..84fcfe05e3043 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -54,4 +54,3 @@ pandas is distributed under a 3-clause ("Simplified" or "New") BSD license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have BSD-compatible licenses, are included. Their licenses follow the pandas license. - diff --git a/Dockerfile b/Dockerfile index b8aff5d671dcf..de1c564921de9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM continuumio/miniconda3 +FROM quay.io/condaforge/miniforge3 # if you forked pandas, you can pass in your own GitHub username to use your fork # i.e. 
gh_username=myname @@ -15,10 +15,6 @@ RUN apt-get update \ # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed && apt-get -y install git iproute2 procps iproute2 lsb-release \ # - # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill), - # needed to build pandas C extensions - && apt-get -y install build-essential \ - # # cleanup && apt-get autoremove -y \ && apt-get clean -y \ @@ -39,9 +35,14 @@ RUN mkdir "$pandas_home" \ # we just update the base/root one from the 'environment.yml' file instead of creating a new one. # # Set up environment -RUN conda env update -n base -f "$pandas_home/environment.yml" +RUN conda install -y mamba +RUN mamba env update -n base -f "$pandas_home/environment.yml" # Build C extensions and pandas -RUN cd "$pandas_home" \ - && python setup.py build_ext --inplace -j 4 \ +SHELL ["/bin/bash", "-c"] +RUN . /opt/conda/etc/profile.d/conda.sh \ + && conda activate base \ + && cd "$pandas_home" \ + && export \ + && python setup.py build_ext -j 4 \ && python -m pip install -e . diff --git a/Makefile b/Makefile index f26689ab65ba5..2c968234749f5 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ clean_pyc: -find . -name '*.py[co]' -exec rm {} \; build: clean_pyc - python setup.py build_ext --inplace + python setup.py build_ext lint-diff: git diff upstream/master --name-only -- "*.py" | xargs flake8 @@ -25,3 +25,16 @@ doc: cd doc; \ python make.py clean; \ python make.py html + +check: + python3 scripts/validate_unwanted_patterns.py \ + --validation-type="private_function_across_module" \ + --included-file-extensions="py" \ + --excluded-file-paths=pandas/tests,asv_bench/ \ + pandas/ + + python3 scripts/validate_unwanted_patterns.py \ + --validation-type="private_import_across_module" \ + --included-file-extensions="py" \ + --excluded-file-paths=pandas/tests,asv_bench/,doc/ + pandas/ diff --git a/README.md b/README.md index a72e8402e68a0..6d1d890c54093 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ its way towards this goal. Here are just a few of the things that pandas does well: - Easy handling of [**missing data**][missing-data] (represented as - `NaN`) in floating point as well as non-floating point data + `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional objects @@ -60,27 +60,27 @@ Here are just a few of the things that pandas does well: and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, - date shifting and lagging. 
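A hedged sketch of what the new `check` target enforces, not code from this diff: the `private_import_across_module` validation rejects importing an underscore-prefixed name from another module, while importing the module (or a public name) passes. The flagged import below is hypothetical; `is_list_like` is real public API:

```python
# from pandas.core.dtypes.common import _hypothetical_private  # would be flagged
from pandas.core.dtypes import common  # importing the module itself is fine

assert common.is_list_like([1, 2, 3])  # use the public surface instead
```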
diff --git a/README.md b/README.md
index a72e8402e68a0..6d1d890c54093 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ its way towards this goal.
 Here are just a few of the things that pandas does well:
 
   - Easy handling of [**missing data**][missing-data] (represented as
-    `NaN`) in floating point as well as non-floating point data
+    `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data
   - Size mutability: columns can be [**inserted and
     deleted**][insertion-deletion] from DataFrame and higher dimensional
     objects
@@ -60,27 +60,27 @@ Here are just a few of the things that pandas does well:
     and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
   - [**Time series**][timeseries]-specific functionality: date range
     generation and frequency conversion, moving window statistics,
-    date shifting and lagging.
-
-
-   [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data
-   [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#column-selection-addition-deletion
-   [alignment]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html?highlight=alignment#intro-to-data-structures
-   [groupby]: https://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine
-   [conversion]: https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe
-   [slicing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#slicing-ranges
-   [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#advanced-indexing-with-ix
-   [subsetting]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing
-   [merging]: https://pandas.pydata.org/pandas-docs/stable/merging.html#database-style-dataframe-joining-merging
-   [joining]: https://pandas.pydata.org/pandas-docs/stable/merging.html#joining-on-index
-   [reshape]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#reshaping-and-pivot-tables
-   [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/reshaping.html#pivot-tables-and-cross-tabulations
-   [mi]: https://pandas.pydata.org/pandas-docs/stable/indexing.html#hierarchical-indexing-multiindex
-   [flat-files]: https://pandas.pydata.org/pandas-docs/stable/io.html#csv-text-files
-   [excel]: https://pandas.pydata.org/pandas-docs/stable/io.html#excel-files
-   [db]: https://pandas.pydata.org/pandas-docs/stable/io.html#sql-queries
-   [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/io.html#hdf5-pytables
-   [timeseries]: https://pandas.pydata.org/pandas-docs/stable/timeseries.html#time-series-date-functionality
+    date shifting and lagging
+
+
+   [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
+   [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion
+   [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures
+   [groupby]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine
+   [conversion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe
+   [slicing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges
+   [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced
+   [subsetting]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
+   [merging]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging
+   [joining]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#joining-on-index
+   [reshape]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
+   [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
+   [mi]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#hierarchical-indexing-multiindex
+   [flat-files]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#csv-text-files
+   [excel]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#excel-files
+   [db]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#sql-queries
+   [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables
+   [timeseries]: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-series-date-functionality
 
 ## Where to get it
 The source code is currently hosted on GitHub at:
@@ -154,7 +154,7 @@ For usage questions, the best place to go to is [StackOverflow](https://stackoverflow.com/questions/tagged/pandas).
 Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata).
 
 ## Discussion and Development
-Most development discussions take place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
+Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
 
 ## Contributing to pandas
 [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 4583fac85b776..e8e82edabbfa3 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -26,7 +26,7 @@
     // The Pythons you'd like to test against.  If not provided, defaults
     // to the current version of Python used to run `asv`.
     // "pythons": ["2.7", "3.4"],
-    "pythons": ["3.6"],
+    "pythons": ["3.8"],
 
     // The matrix of dependencies to test.  Each key is the name of a
     // package (in PyPI) and the values are version numbers.  An empty
@@ -39,7 +39,7 @@
     // followed by the pip installed packages).
     "matrix": {
         "numpy": [],
-        "Cython": ["0.29.16"],
+        "Cython": ["0.29.21"],
         "matplotlib": [],
         "sqlalchemy": [],
         "scipy": [],
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 65e52e03c43c7..03480ae198345 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -5,6 +5,7 @@
 from pandas._libs import lib
 
 import pandas as pd
+from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
 
 from .pandas_vb_common import tm
 
@@ -174,4 +175,15 @@ def time_argsort(self, N):
         self.array.argsort()
 
 
+class RemoveDuplicates:
+    def setup(self):
+        N = 10 ** 5
+        na = np.arange(int(N / 2))
+        self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
+        self.right = np.concatenate([na, na])
+
+    def time_make_duplicates_of_left_unique_in_right(self):
+        make_duplicates_of_left_unique_in_right(self.left, self.right)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py
index 3ef6ab6209ea7..5a3febdcf75e7 100644
--- a/asv_bench/benchmarks/arithmetic.py
+++ b/asv_bench/benchmarks/arithmetic.py
@@ -125,7 +125,7 @@ def setup(self, op):
         arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8")
         arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4")
         df = pd.concat(
-            [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True,
+            [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True
         )
         # should already be the case, but just to be sure
         df._consolidate_inplace()
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index a0b24342091ec..f3b005b704014 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -1,3 +1,5 @@
+import string
+import sys
 import warnings
 
 import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
         pd.Categorical(self.series)
 
 
+class AsType:
+    def setup(self):
+        N = 10 ** 5
+
+        random_pick = np.random.default_rng().choice
+
+        categories = {
+            "str": list(string.ascii_letters),
+            "int": np.random.randint(2 ** 16, size=154),
+            "float": sys.maxsize * np.random.random((38,)),
+            "timestamp": [
+                pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
+            ],
+        }
+
+        self.df = pd.DataFrame(
+            {col: random_pick(cats, N) for col, cats in categories.items()}
+        )
+
+        for col in ("int", "float", "timestamp"):
+            self.df[col + "_as_str"] = self.df[col].astype(str)
+
+        for col in self.df.columns:
+            self.df[col] = self.df[col].astype("category")
+
+    def astype_str(self):
+        [self.df[col].astype("str") for col in "int float timestamp".split()]
+
+    def astype_int(self):
+        [self.df[col].astype("int") for col in "int_as_str timestamp".split()]
+
+    def astype_float(self):
+        [
+            self.df[col].astype("float")
+            for col in "float_as_str int int_as_str timestamp".split()
+        ]
+
+    def astype_datetime(self):
+        self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
+
+
 class Concat:
     def setup(self):
         N = 10 ** 5
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
index bd17b710b108d..a5ed5c389fee4 100644
--- a/asv_bench/benchmarks/dtypes.py
+++ b/asv_bench/benchmarks/dtypes.py
@@ -1,5 +1,9 @@
+import string
+
 import numpy as np
 
+from pandas import DataFrame
+import pandas._testing as tm
 from pandas.api.types import pandas_dtype
 
 from .pandas_vb_common import (
@@ -62,4 +66,57 @@ def time_infer(self, dtype):
         lib.infer_dtype(self.data_dict[dtype], skipna=False)
 
 
+class SelectDtypes:
+
+    params = [
+        tm.ALL_INT_DTYPES
+        + tm.ALL_EA_INT_DTYPES
+        + tm.FLOAT_DTYPES
+        + tm.COMPLEX_DTYPES
+        + tm.DATETIME64_DTYPES
+        + tm.TIMEDELTA64_DTYPES
+        + tm.BOOL_DTYPES
+    ]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        N, K = 5000, 50
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+
+        def create_df(data):
+            return DataFrame(data, index=self.index, columns=self.columns)
+
+        self.df_int = create_df(np.random.randint(low=100, size=(N, K)))
+        self.df_float = create_df(np.random.randn(N, K))
+        self.df_bool = create_df(np.random.choice([True, False], size=(N, K)))
+        self.df_string = create_df(
+            np.random.choice(list(string.ascii_letters), size=(N, K))
+        )
+
+    def time_select_dtype_int_include(self, dtype):
+        self.df_int.select_dtypes(include=dtype)
+
+    def time_select_dtype_int_exclude(self, dtype):
+        self.df_int.select_dtypes(exclude=dtype)
+
+    def time_select_dtype_float_include(self, dtype):
+        self.df_float.select_dtypes(include=dtype)
+
+    def time_select_dtype_float_exclude(self, dtype):
+        self.df_float.select_dtypes(exclude=dtype)
+
+    def time_select_dtype_bool_include(self, dtype):
+        self.df_bool.select_dtypes(include=dtype)
+
+    def time_select_dtype_bool_exclude(self, dtype):
+        self.df_bool.select_dtypes(exclude=dtype)
+
+    def time_select_dtype_string_include(self, dtype):
+        self.df_string.select_dtypes(include=dtype)
+
+    def time_select_dtype_string_exclude(self, dtype):
+        self.df_string.select_dtypes(exclude=dtype)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
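For reference, the public API behind the `SelectDtypes` benchmark is `DataFrame.select_dtypes`; a minimal sketch (toy data standing in for the benchmark's 5000x50 frames):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"i": np.arange(3, dtype="int64"), "f": np.random.randn(3), "b": [True, False, True]}
)

# Keep or drop columns by dtype.
assert df.select_dtypes(include="int64").columns.tolist() == ["i"]
assert df.select_dtypes(exclude=["bool"]).columns.tolist() == ["i", "f"]
```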
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
index dc6f45f810f3d..e0a2257b0ca1f 100644
--- a/asv_bench/benchmarks/frame_ctor.py
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -6,7 +6,7 @@
 from .pandas_vb_common import tm
 
 try:
-    from pandas.tseries.offsets import Nano, Hour
+    from pandas.tseries.offsets import Hour, Nano
 except ImportError:
     # For compatibility with older versions
     from pandas.core.datetools import *  # noqa
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 44f71b392c0eb..70d90ded84545 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -219,6 +219,46 @@ def time_to_html_mixed(self):
         self.df2.to_html()
 
 
+class ToNumpy:
+    def setup(self):
+        N = 10000
+        M = 10
+        self.df_tall = DataFrame(np.random.randn(N, M))
+        self.df_wide = DataFrame(np.random.randn(M, N))
+        self.df_mixed_tall = self.df_tall.copy()
+        self.df_mixed_tall["foo"] = "bar"
+        self.df_mixed_tall[0] = period_range("2000", periods=N)
+        self.df_mixed_tall[1] = range(N)
+        self.df_mixed_wide = self.df_wide.copy()
+        self.df_mixed_wide["foo"] = "bar"
+        self.df_mixed_wide[0] = period_range("2000", periods=M)
+        self.df_mixed_wide[1] = range(M)
+
+    def time_to_numpy_tall(self):
+        self.df_tall.to_numpy()
+
+    def time_to_numpy_wide(self):
+        self.df_wide.to_numpy()
+
+    def time_to_numpy_mixed_tall(self):
+        self.df_mixed_tall.to_numpy()
+
+    def time_to_numpy_mixed_wide(self):
+        self.df_mixed_wide.to_numpy()
+
+    def time_values_tall(self):
+        self.df_tall.values
+
+    def time_values_wide(self):
+        self.df_wide.values
+
+    def time_values_mixed_tall(self):
+        self.df_mixed_tall.values
+
+    def time_values_mixed_wide(self):
+        self.df_mixed_wide.values
+
+
 class Repr:
     def setup(self):
         nrows = 10000
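A short illustration of the two conversion paths the `ToNumpy` benchmark times (`.to_numpy()` is the recommended spelling, `.values` the legacy attribute); for a mixed frame both coerce to a common object dtype:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.arange(3), "b": pd.period_range("2000", periods=3)})

arr = df.to_numpy()
assert arr.dtype == object          # mixed int/Period columns fall back to object
assert (arr == df.values).all()     # both paths produce the same values
```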
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index e266d871f5bc6..5d9070de92ec7 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -7,14 +7,14 @@
 
 try:
     from pandas import (
-        rolling_median,
+        rolling_kurt,
+        rolling_max,
         rolling_mean,
+        rolling_median,
         rolling_min,
-        rolling_max,
-        rolling_var,
         rolling_skew,
-        rolling_kurt,
         rolling_std,
+        rolling_var,
     )
 
     have_rolling_methods = True
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 5ffda03fad80f..6ce63ff8badca 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -358,6 +358,26 @@ def time_category_size(self):
         self.draws.groupby(self.cats).size()
 
 
+class FillNA:
+    def setup(self):
+        N = 100
+        self.df = DataFrame(
+            {"group": [1] * N + [2] * N, "value": [np.nan, 1.0] * N}
+        ).set_index("group")
+
+    def time_df_ffill(self):
+        self.df.groupby("group").fillna(method="ffill")
+
+    def time_df_bfill(self):
+        self.df.groupby("group").fillna(method="bfill")
+
+    def time_srs_ffill(self):
+        self.df.groupby("group")["value"].fillna(method="ffill")
+
+    def time_srs_bfill(self):
+        self.df.groupby("group")["value"].fillna(method="bfill")
+
+
 class GroupByMethods:
 
     param_names = ["dtype", "method", "application"]
@@ -466,7 +486,7 @@ def setup(self):
         tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
         tmp = np.concatenate((tmp1, tmp2))
         arr = np.repeat(tmp, 10)
-        self.df = DataFrame(dict(a=arr, b=arr))
+        self.df = DataFrame({"a": arr, "b": arr})
 
     def time_sum(self):
         self.df.groupby(["a"])["b"].sum()
@@ -627,33 +647,42 @@ def time_first(self):
 
 
 class TransformEngine:
-    def setup(self):
+
+    param_names = ["parallel"]
+    params = [[True, False]]
+
+    def setup(self, parallel):
         N = 10 ** 3
         data = DataFrame(
             {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
             columns=[0, 1],
         )
+        self.parallel = parallel
         self.grouper = data.groupby(0)
 
-    def time_series_numba(self):
+    def time_series_numba(self, parallel):
         def function(values, index):
             return values * 5
 
-        self.grouper[1].transform(function, engine="numba")
+        self.grouper[1].transform(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_series_cython(self):
+    def time_series_cython(self, parallel):
         def function(values):
             return values * 5
 
         self.grouper[1].transform(function, engine="cython")
 
-    def time_dataframe_numba(self):
+    def time_dataframe_numba(self, parallel):
         def function(values, index):
             return values * 5
 
-        self.grouper.transform(function, engine="numba")
+        self.grouper.transform(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_dataframe_cython(self):
+    def time_dataframe_cython(self, parallel):
         def function(values):
             return values * 5
 
@@ -661,15 +690,20 @@ def function(values):
 
 
 class AggEngine:
-    def setup(self):
+
+    param_names = ["parallel"]
+    params = [[True, False]]
+
+    def setup(self, parallel):
         N = 10 ** 3
         data = DataFrame(
             {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
             columns=[0, 1],
         )
+        self.parallel = parallel
         self.grouper = data.groupby(0)
 
-    def time_series_numba(self):
+    def time_series_numba(self, parallel):
         def function(values, index):
             total = 0
             for i, value in enumerate(values):
@@ -679,9 +713,11 @@ def function(values, index):
                     total += value * 2
             return total
 
-        self.grouper[1].agg(function, engine="numba")
+        self.grouper[1].agg(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_series_cython(self):
+    def time_series_cython(self, parallel):
         def function(values):
             total = 0
             for i, value in enumerate(values):
@@ -693,7 +729,7 @@ def function(values):
 
         self.grouper[1].agg(function, engine="cython")
 
-    def time_dataframe_numba(self):
+    def time_dataframe_numba(self, parallel):
         def function(values, index):
             total = 0
             for i, value in enumerate(values):
@@ -703,9 +739,11 @@ def function(values, index):
                     total += value * 2
             return total
 
-        self.grouper.agg(function, engine="numba")
+        self.grouper.agg(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_dataframe_cython(self):
+    def time_dataframe_cython(self, parallel):
         def function(values):
             total = 0
             for i, value in enumerate(values):
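Illustrative only (not part of the patch): the `engine_kwargs` plumbing these benchmarks now exercise is the public groupby API. `engine="numba"` needs numba installed; `{"parallel": False}` mirrors one benchmark parameter:

```python
import pandas as pd

df = pd.DataFrame({0: ["a", "b"] * 50, 1: range(100)})
grouper = df.groupby(0)

def times_five(values, index):  # numba-engine functions take (values, index)
    return values * 5

result = grouper[1].transform(
    times_five, engine="numba", engine_kwargs={"parallel": False}
)
```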
["index_dtype", "N"] + + def setup(self, index, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) + indices = index(vals) + self.data = pd.Series(np.arange(N), index=indices) + + def time_loc_slice(self, index, N): + # trigger building of mapping + self.data.loc[:800] + + +class NumericSeriesIndexingShuffled: + + params = [ + (pd.Int64Index, pd.UInt64Index, pd.Float64Index), + (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6), + ] + param_names = ["index_dtype", "N"] + + def setup(self, index, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) + np.random.seed(42) + np.random.shuffle(vals) + indices = index(vals) + self.data = pd.Series(np.arange(N), index=indices) + + def time_loc_slice(self, index, N): + # trigger building of mapping + self.data.loc[:800] diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index b242de6a17208..9c05019c70396 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -57,8 +57,8 @@ def time_datetime_difference_disjoint(self): class Range: def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3) - self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10 ** 6, step=3) + self.idx_dec = RangeIndex(start=10 ** 6, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -73,15 +73,23 @@ def time_min_trivial(self): self.idx_inc.min() def time_get_loc_inc(self): - self.idx_inc.get_loc(900000) + self.idx_inc.get_loc(900_000) def time_get_loc_dec(self): - self.idx_dec.get_loc(100000) + self.idx_dec.get_loc(100_000) + + def time_iter_inc(self): + for _ in self.idx_inc: + pass + + def time_iter_dec(self): + for _ in self.idx_dec: + pass class IndexEquals: def setup(self): - idx_large_fast = RangeIndex(100000) + idx_large_fast = RangeIndex(100_000) idx_small_slow = date_range(start="1/1/2012", periods=1) self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) @@ -94,7 +102,7 @@ def time_non_object_equals_multiindex(self): class IndexAppend: def setup(self): - N = 10000 + N = 10_000 self.range_idx = RangeIndex(0, 100) self.int_idx = self.range_idx.astype(int) self.obj_idx = self.int_idx.astype(str) @@ -168,7 +176,7 @@ def time_get_loc_non_unique_sorted(self, dtype): class Float64IndexMethod: # GH 13166 def setup(self): - N = 100000 + N = 100_000 a = np.arange(N) self.ind = Float64Index(a * 4.8000000418824129e-08) @@ -212,7 +220,7 @@ class GC: params = [1, 2, 5] def create_use_drop(self): - idx = Index(list(range(1000 * 1000))) + idx = Index(list(range(1_000_000))) idx._engine def peakmem_gc_instances(self, N): diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 836d3ca8602ec..4fd91c8aafe4b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -191,7 +191,7 @@ def setup(self, index): } index = indexes[index] self.s = Series(np.random.rand(N), index=index) - self.indexer = [True, False, True, True, False] * 20000 + self.indexer = np.random.randint(0, N, size=N) def time_take(self, index): self.s.take(self.indexer) @@ -358,6 +358,14 @@ def time_assign_with_setitem(self): for i in range(100): self.df[i] = np.random.randn(self.N) + def time_assign_list_like_with_setitem(self): + np.random.seed(1234) + self.df[list(range(100))] = np.random.randn(self.N, 100) + + def time_assign_list_of_columns_concat(self): + df = DataFrame(np.random.randn(self.N, 100)) + concat([self.df, df], axis=1) + class 
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index b242de6a17208..9c05019c70396 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -57,8 +57,8 @@ def time_datetime_difference_disjoint(self):
 
 class Range:
     def setup(self):
-        self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)
-        self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3)
+        self.idx_inc = RangeIndex(start=0, stop=10 ** 6, step=3)
+        self.idx_dec = RangeIndex(start=10 ** 6, stop=-1, step=-3)
 
     def time_max(self):
         self.idx_inc.max()
@@ -73,15 +73,23 @@ def time_min_trivial(self):
         self.idx_inc.min()
 
     def time_get_loc_inc(self):
-        self.idx_inc.get_loc(900000)
+        self.idx_inc.get_loc(900_000)
 
     def time_get_loc_dec(self):
-        self.idx_dec.get_loc(100000)
+        self.idx_dec.get_loc(100_000)
+
+    def time_iter_inc(self):
+        for _ in self.idx_inc:
+            pass
+
+    def time_iter_dec(self):
+        for _ in self.idx_dec:
+            pass
 
 
 class IndexEquals:
     def setup(self):
-        idx_large_fast = RangeIndex(100000)
+        idx_large_fast = RangeIndex(100_000)
         idx_small_slow = date_range(start="1/1/2012", periods=1)
 
         self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow])
@@ -94,7 +102,7 @@ def time_non_object_equals_multiindex(self):
 
 class IndexAppend:
     def setup(self):
-        N = 10000
+        N = 10_000
 
         self.range_idx = RangeIndex(0, 100)
         self.int_idx = self.range_idx.astype(int)
         self.obj_idx = self.int_idx.astype(str)
@@ -168,7 +176,7 @@ def time_get_loc_non_unique_sorted(self, dtype):
 class Float64IndexMethod:
     # GH 13166
     def setup(self):
-        N = 100000
+        N = 100_000
         a = np.arange(N)
         self.ind = Float64Index(a * 4.8000000418824129e-08)
 
@@ -212,7 +220,7 @@ class GC:
     params = [1, 2, 5]
 
     def create_use_drop(self):
-        idx = Index(list(range(1000 * 1000)))
+        idx = Index(list(range(1_000_000)))
         idx._engine
 
     def peakmem_gc_instances(self, N):
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 836d3ca8602ec..4fd91c8aafe4b 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -191,7 +191,7 @@ def setup(self, index):
         }
         index = indexes[index]
         self.s = Series(np.random.rand(N), index=index)
-        self.indexer = [True, False, True, True, False] * 20000
+        self.indexer = np.random.randint(0, N, size=N)
 
     def time_take(self, index):
         self.s.take(self.indexer)
@@ -358,6 +358,14 @@ def time_assign_with_setitem(self):
         for i in range(100):
             self.df[i] = np.random.randn(self.N)
 
+    def time_assign_list_like_with_setitem(self):
+        np.random.seed(1234)
+        self.df[list(range(100))] = np.random.randn(self.N, 100)
+
+    def time_assign_list_of_columns_concat(self):
+        df = DataFrame(np.random.randn(self.N, 100))
+        concat([self.df, df], axis=1)
+
 
 class ChainIndexing:
diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py
index ec3eddfff7184..5390056ba36f2 100644
--- a/asv_bench/benchmarks/io/parsers.py
+++ b/asv_bench/benchmarks/io/parsers.py
@@ -2,8 +2,8 @@
 
 try:
     from pandas._libs.tslibs.parsing import (
-        concat_date_cols,
         _does_string_look_like_datetime,
+        concat_date_cols,
     )
 except ImportError:
     # Avoid whole benchmark suite import failure on asv (currently 0.4)
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
index 4ca9a82ae4827..656fe2197bc8a 100644
--- a/asv_bench/benchmarks/io/pickle.py
+++ b/asv_bench/benchmarks/io/pickle.py
@@ -24,5 +24,11 @@ def time_read_pickle(self):
     def time_write_pickle(self):
         self.df.to_pickle(self.fname)
 
+    def peakmem_read_pickle(self):
+        read_pickle(self.fname)
+
+    def peakmem_write_pickle(self):
+        self.df.to_pickle(self.fname)
+
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index 1333b3a0f0560..a572b8a70a680 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort):
     def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
         self.df_shuf.join(self.df_key2, on="key2", sort=sort)
 
+    def time_join_dataframes_cross(self, sort):
+        self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort)
+
 
 class JoinIndex:
     def setup(self):
@@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort):
     def time_merge_dataframe_integer_key(self, sort):
         merge(self.df, self.df2, on="key1", sort=sort)
 
+    def time_merge_dataframes_cross(self, sort):
+        merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)
+
 
 class I8Merge:
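The `how="cross"` option the new join/merge benchmarks exercise is public API added in pandas 1.2: a cartesian product of the two frames, no key columns required. A minimal sketch:

```python
import pandas as pd

left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"b": ["x", "y", "z"]})

out = pd.merge(left, right, how="cross")
assert len(out) == len(left) * len(right)  # 6 rows, one per pair
```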
diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py
index 8ca33db361fa0..34fe4929a752b 100644
--- a/asv_bench/benchmarks/package.py
+++ b/asv_bench/benchmarks/package.py
@@ -4,22 +4,16 @@
 import subprocess
 import sys
 
-from pandas.compat import PY37
-
 
 class TimeImport:
     def time_import(self):
-        if PY37:
-            # on py37+ we the "-X importtime" usage gives us a more precise
-            # measurement of the import time we actually care about,
-            # without the subprocess or interpreter overhead
-            cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"]
-            p = subprocess.run(cmd, stderr=subprocess.PIPE)
-
-            line = p.stderr.splitlines()[-1]
-            field = line.split(b"|")[-2].strip()
-            total = int(field)  # microseconds
-            return total
+        # on py37+ the "-X importtime" usage gives us a more precise
+        # measurement of the import time we actually care about,
+        # without the subprocess or interpreter overhead
+        cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"]
+        p = subprocess.run(cmd, stderr=subprocess.PIPE)
 
-        cmd = [sys.executable, "-c", "import pandas as pd"]
-        subprocess.run(cmd, stderr=subprocess.PIPE)
+        line = p.stderr.splitlines()[-1]
+        field = line.split(b"|")[-2].strip()
+        total = int(field)  # microseconds
+        return total
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
index 23286343d7367..7bd4d639633b3 100644
--- a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -15,7 +15,7 @@
 
 # Compatibility import for the testing module
 try:
-    import pandas._testing as tm  # noqa
+    import pandas._testing as tm
 except ImportError:
     import pandas.util.testing as tm  # noqa
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 21081ee23a773..9cec8a5f7d318 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -103,7 +103,10 @@ def setup(self):
         nidvars = 20
         N = 5000
         self.letters = list("ABCD")
-        yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
+        yrvars = [
+            letter + str(num)
+            for letter, num in product(self.letters, range(1, nyrs + 1))
+        ]
         columns = [str(i) for i in range(nidvars)] + yrvars
         self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
         self.df["id"] = self.df.index
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index f0dd908f81043..5a36cff7908f0 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -76,12 +76,21 @@ class ExpandingMethods:
 
     def setup(self, constructor, dtype, method):
         N = 10 ** 5
+        N_groupby = 100
         arr = (100 * np.random.random(N)).astype(dtype)
         self.expanding = getattr(pd, constructor)(arr).expanding()
+        self.expanding_groupby = (
+            pd.DataFrame({"A": arr[:N_groupby], "B": range(N_groupby)})
+            .groupby("B")
+            .expanding()
+        )
 
     def time_expanding(self, constructor, dtype, method):
         getattr(self.expanding, method)()
 
+    def time_expanding_groupby(self, constructor, dtype, method):
+        getattr(self.expanding_groupby, method)()
+
 
 class EWMMethods:
 
@@ -216,4 +225,31 @@ def time_rolling_offset(self, method):
         getattr(self.groupby_roll_offset, method)()
 
 
+class GroupbyLargeGroups:
+    # https://github.com/pandas-dev/pandas/issues/38038
+    # specific example where the rolling operation on a larger dataframe
+    # is relatively cheap (few but large groups), but creation of
+    # MultiIndex of result can be expensive
+
+    def setup(self):
+        N = 100000
+        self.df = pd.DataFrame({"A": [1, 2] * int(N / 2), "B": np.random.randn(N)})
+
+    def time_rolling_multiindex_creation(self):
+        self.df.groupby("A").rolling(3).mean()
+
+
+class GroupbyEWM:
+
+    params = ["cython", "numba"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        df = pd.DataFrame({"A": range(50), "B": range(50)})
+        self.gb_ewm = df.groupby("A").ewm(com=1.0)
+
+    def time_groupby_mean(self, engine):
+        self.gb_ewm.mean(engine=engine)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
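Groupwise EWM, which the `GroupbyEWM` benchmark times, is likewise new public API in pandas 1.2. The numba engine is optional (requires numba), so the portable spelling uses the default cython engine:

```python
import pandas as pd

df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1.0, 2.0, 3.0, 4.0]})

# Exponentially weighted mean computed per group.
result = df.groupby("A").ewm(com=1.0).mean()
```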
"object"], + [5, 1000], + ["random_hits", "random_misses", "monotone_hits", "monotone_misses"], + ] + param_names = ["dtype", "MaxNumber", "series_type"] + + def setup(self, dtype, MaxNumber, series_type): + N = 10 ** 7 + if series_type == "random_hits": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + if series_type == "random_misses": + np.random.seed(42) + array = np.random.randint(0, MaxNumber, N) + MaxNumber + if series_type == "monotone_hits": + array = np.repeat(np.arange(MaxNumber), N // MaxNumber) + if series_type == "monotone_misses": + array = np.arange(N) + MaxNumber + self.series = Series(array).astype(dtype) + self.values = np.arange(MaxNumber).astype(dtype) + + def time_isin(self, dtypes, MaxNumber, series_type): + self.series.isin(self.values) + + +class IsInLongSeriesValuesDominate: + params = [ + ["int64", "int32", "float64", "float32", "object"], + ["random", "monotone"], + ] + param_names = ["dtype", "series_type"] + + def setup(self, dtype, series_type): + N = 10 ** 7 + if series_type == "random": + np.random.seed(42) + vals = np.random.randint(0, 10 * N, N) + if series_type == "monotone": + vals = np.arange(N) + self.values = vals.astype(dtype) + M = 10 ** 6 + 1 + self.series = Series(np.arange(M)).astype(dtype) + + def time_isin(self, dtypes, series_type): + self.series.isin(self.values) + + class NSort: params = ["first", "last", "all"] diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d7fb2775376c0..7c75ad031e7cd 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -2,11 +2,49 @@ import numpy as np -from pandas import DataFrame, Series +from pandas import Categorical, DataFrame, Series from .pandas_vb_common import tm +class Construction: + + params = ["str", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() + + # GH37371. 
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index b494dbd8a38fa..4ed542b3a28e3 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -3,7 +3,14 @@
 import dateutil
 import numpy as np
 
-from pandas import DataFrame, Series, date_range, period_range, to_datetime
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+    period_range,
+    timedelta_range,
+    to_datetime,
+)
 
 from pandas.tseries.frequencies import infer_freq
 
@@ -121,12 +128,15 @@ def time_convert(self):
 
 class Iteration:
 
-    params = [date_range, period_range]
+    params = [date_range, period_range, timedelta_range]
     param_names = ["time_index"]
 
     def setup(self, time_index):
         N = 10 ** 6
-        self.idx = time_index(start="20140101", freq="T", periods=N)
+        if time_index is timedelta_range:
+            self.idx = time_index(start=0, freq="T", periods=N)
+        else:
+            self.idx = time_index(start="20140101", freq="T", periods=N)
         self.exit = 10000
 
     def time_iter(self, time_index):
@@ -263,6 +273,29 @@ def time_lookup_and_cleanup(self):
         self.ts.index._cleanup()
 
 
+class ToDatetimeFromIntsFloats:
+    def setup(self):
+        self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64")
+        self.ts_sec_float = self.ts_sec.astype("float64")
+
+        self.ts_nanosec = 1_000_000 * self.ts_sec
+        self.ts_nanosec_float = self.ts_nanosec.astype("float64")
+
+    # speed of int64 and float64 paths should be comparable
+
+    def time_nanosec_int64(self):
+        to_datetime(self.ts_nanosec, unit="ns")
+
+    def time_nanosec_float64(self):
+        to_datetime(self.ts_nanosec_float, unit="ns")
+
+    def time_sec_int64(self):
+        to_datetime(self.ts_sec, unit="s")
+
+    def time_sec_float64(self):
+        to_datetime(self.ts_sec_float, unit="s")
+
+
 class ToDatetimeYYYYMMDD:
     def setup(self):
         rng = date_range(start="1/1/2000", periods=10000, freq="D")
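For context: integer and float inputs take different code paths through `to_datetime` for a fixed `unit`, which is exactly what `ToDatetimeFromIntsFloats` compares. The two paths must agree on the result:

```python
import pandas as pd

sec = pd.Series([1521080307, 1521080308], dtype="int64")

ts_int = pd.to_datetime(sec, unit="s")
ts_float = pd.to_datetime(sec.astype("float64"), unit="s")

assert (ts_int == ts_float).all()
```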
@@ -9,7 +9,7 @@ from pandas import offsets try: - import pandas.tseries.holiday # noqa + import pandas.tseries.holiday except ImportError: pass diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 1a2c89b48c665..849e8ec864ac2 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -9,7 +9,12 @@ from pandas.tseries.frequencies import to_offset -from .tslib import _sizes +from .tslib import _sizes, _tzs + +try: + from pandas._libs.tslibs.vectorized import dt64arr_to_periodarr +except ImportError: + from pandas._libs.tslibs.period import dt64arr_to_periodarr class PeriodProperties: @@ -75,26 +80,29 @@ def time_period_constructor(self, freq, is_offset): Period("2012-06-01", freq=freq) +_freq_ints = [ + 1000, + 1011, # Annual - November End + 2000, + 2011, # Quarterly - November End + 3000, + 4000, + 4006, # Weekly - Saturday End + 5000, + 6000, + 7000, + 8000, + 9000, + 10000, + 11000, + 12000, +] + + class TimePeriodArrToDT64Arr: params = [ _sizes, - [ - 1000, - 1011, # Annual - November End - 2000, - 2011, # Quarterly - November End - 3000, - 4000, - 4006, # Weekly - Saturday End - 5000, - 6000, - 7000, - 8000, - 9000, - 10000, - 11000, - 12000, - ], + _freq_ints, ] param_names = ["size", "freq"] @@ -104,3 +112,19 @@ def setup(self, size, freq): def time_periodarray_to_dt64arr(self, size, freq): periodarr_to_dt64arr(self.i8values, freq) + + +class TimeDT64ArrToPeriodArr: + params = [ + _sizes, + _freq_ints, + _tzs, + ] + param_names = ["size", "freq", "tz"] + + def setup(self, size, freq, tz): + arr = np.arange(10, dtype="i8").repeat(size // 10) + self.i8values = arr + + def time_dt64arr_to_periodarr(self, size, freq, tz): + dt64arr_to_periodarr(self.i8values, freq, tz) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e45cafc02cb61..464bad7884362 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,9 +1,11 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - master +- 1.2.x pr: - master +- 1.2.x variables: PYTEST_WORKERS: auto @@ -24,3 +26,28 @@ jobs: parameters: name: Windows vmImage: vs2017-win2016 + +- job: py37_32bit + pool: + vmImage: ubuntu-18.04 + + steps: + - script: | + docker pull quay.io/pypa/manylinux2014_i686 + docker run -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ + /bin/bash -xc "cd pandas && \ + /opt/python/cp37-cp37m/bin/python -m venv ~/virtualenvs/pandas-dev && \ + . ~/virtualenvs/pandas-dev/bin/activate && \ + python -m pip install --no-deps -U pip wheel setuptools && \ + pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis pytest-azurepipelines && \ + python setup.py build_ext -q -j2 && \ + python -m pip install --no-build-isolation -e . 
&& \ + pytest -m 'not slow and not network and not clipboard' pandas --junitxml=test-data.xml" + displayName: 'Run 32-bit manylinux2014 Docker Build / Tests' + + - task: PublishTestResults@2 + condition: succeededOrFailed() + inputs: + testResultsFiles: '**/test-*.xml' + failTaskOnFailedTests: true + testRunTitle: 'Publish test results for Python 3.7-32 bit full Linux' diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index f716974f6add1..8e44db0b4bcd4 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -9,62 +9,63 @@ jobs: strategy: matrix: ${{ if eq(parameters.name, 'macOS') }}: - py36_macos: - ENV_FILE: ci/deps/azure-macos-36.yaml - CONDA_PY: "36" + py37_macos: + ENV_FILE: ci/deps/azure-macos-37.yaml + CONDA_PY: "37" PATTERN: "not slow and not network" ${{ if eq(parameters.name, 'Linux') }}: - py36_minimum_versions: - ENV_FILE: ci/deps/azure-36-minimum_versions.yaml - CONDA_PY: "36" + py37_minimum_versions: + ENV_FILE: ci/deps/azure-37-minimum_versions.yaml + CONDA_PY: "37" PATTERN: "not slow and not network and not clipboard" - py36_locale_slow_old_np: - ENV_FILE: ci/deps/azure-36-locale_slow.yaml - CONDA_PY: "36" - PATTERN: "slow" - # pandas does not use the language (zh_CN), but should support different encodings (utf8) - # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any - LANG: "zh_CN.utf8" - LC_ALL: "zh_CN.utf8" - EXTRA_APT: "language-pack-zh-hans" + py37: + ENV_FILE: ci/deps/azure-37.yaml + CONDA_PY: "37" + PATTERN: "not slow and not network and not clipboard" - py36_slow: - ENV_FILE: ci/deps/azure-36-slow.yaml - CONDA_PY: "36" + py37_locale_slow: + ENV_FILE: ci/deps/azure-37-locale_slow.yaml + CONDA_PY: "37" PATTERN: "slow" - - py36_locale: - ENV_FILE: ci/deps/azure-36-locale.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" LANG: "it_IT.utf8" LC_ALL: "it_IT.utf8" EXTRA_APT: "language-pack-it xsel" - #py36_32bit: - # ENV_FILE: ci/deps/azure-36-32bit.yaml - # CONDA_PY: "36" - # PATTERN: "not slow and not network and not clipboard" - # BITS32: "yes" - - py37_locale: - ENV_FILE: ci/deps/azure-37-locale.yaml + py37_slow: + ENV_FILE: ci/deps/azure-37-slow.yaml CONDA_PY: "37" + PATTERN: "slow" + + py38: + ENV_FILE: ci/deps/azure-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network and not clipboard" + + py38_locale: + ENV_FILE: ci/deps/azure-38-locale.yaml + CONDA_PY: "38" PATTERN: "not slow and not network" + # pandas does not use the language (zh_CN), but should support different encodings (utf8) + # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans xsel" - py37_np_dev: - ENV_FILE: ci/deps/azure-37-numpydev.yaml - CONDA_PY: "37" + py38_np_dev: + ENV_FILE: ci/deps/azure-38-numpydev.yaml + CONDA_PY: "38" PATTERN: "not slow and not network" TEST_ARGS: "-W error" PANDAS_TESTING_MODE: "deprecate" EXTRA_APT: "xsel" + py39: + ENV_FILE: ci/deps/azure-39.yaml + CONDA_PY: "39" + PATTERN: "not slow and not network and not clipboard" + steps: - script: | if [ "$(uname)" == "Linux" ]; then diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 87f1bfd2adb79..e510f4115b25f 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -8,16 +8,16 @@ jobs: vmImage: ${{ parameters.vmImage }} strategy: matrix: - py36_np15: - ENV_FILE: ci/deps/azure-windows-36.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" - - py37_np18: + py37_np16: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" 
PATTERN: "not slow and not network" + py38_np18: + ENV_FILE: ci/deps/azure-windows-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network and not high_memory" + steps: - powershell: | Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" @@ -34,7 +34,7 @@ jobs: - bash: | source activate pandas-dev conda list - python setup.py build_ext -q -i -j 4 + python setup.py build_ext -q -j 4 python -m pip install --no-build-isolation -e . displayName: 'Build' diff --git a/ci/build39.sh b/ci/build39.sh deleted file mode 100755 index f85e1c7def206..0000000000000 --- a/ci/build39.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -e -# Special build for python3.9 until numpy puts its own wheels up - -sudo apt-get install build-essential gcc xvfb -pip install --no-deps -U pip wheel setuptools -pip install python-dateutil pytz pytest pytest-xdist hypothesis -pip install cython --pre # https://github.com/cython/cython/issues/3395 - -git clone https://github.com/numpy/numpy -cd numpy -python setup.py build_ext --inplace -python setup.py install -cd .. -rm -rf numpy - -python setup.py build_ext -inplace -python -m pip install --no-build-isolation -e . - -python -c "import sys; print(sys.version_info)" -python -c "import pandas as pd" -python -c "import hypothesis" diff --git a/ci/check_cache.sh b/ci/check_cache.sh deleted file mode 100755 index b83144fc45ef4..0000000000000 --- a/ci/check_cache.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# currently not used -# script to make sure that cache is clean -# Travis CI now handles this - -if [ "$TRAVIS_PULL_REQUEST" == "false" ] -then - echo "Not a PR: checking for changes in ci/ from last 2 commits" - git diff HEAD~2 --numstat | grep -E "ci/" - ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l) -else - echo "PR: checking for changes in ci/ from last 2 commits" - git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD - git diff PR_HEAD~2 --numstat | grep -E "ci/" - ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) -fi - -CACHE_DIR="$HOME/.cache/" -CCACHE_DIR="$HOME/.ccache/" - -if [ $ci_changes -ne 0 ] -then - echo "Files have changed in ci/ deleting all caches" - rm -rf "$CACHE_DIR" - rm -rf "$CCACHE_DIR" -fi diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7b12de387d648..3eeee61f62a7e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -15,11 +15,10 @@ # $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors -# $ ./ci/code_checks.sh dependencies # check that dependencies are consistent # $ ./ci/code_checks.sh typing # run static type analysis -[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "dependencies" || "$1" == "typing" ]] || \ - { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|dependencies|typing]"; exit 9999; } +[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "typing" ]] || \ + { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|typing]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 @@ -48,38 +47,6 @@ fi ### LINTING ### if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then - echo "black --version" - black --version - - MSG='Checking black formatting' ; echo $MSG - black . 
--check - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # `setup.cfg` contains the list of error codes that are being ignored in flake8 - - echo "flake8 --version" - flake8 --version - - # pandas/_libs/src is C code, so no need to search there. - MSG='Linting .py code' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" . - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Linting .pyx and .pxd code' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" pandas --append-config=flake8/cython.cfg - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Linting .pxi.in' ; echo $MSG - flake8 --format="$FLAKE8_FORMAT" pandas/_libs --append-config=flake8/cython-template.cfg - RET=$(($RET + $?)) ; echo $MSG "DONE" - - echo "flake8-rst --version" - flake8-rst --version - - MSG='Linting code-blocks in .rst documentation' ; echo $MSG - flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT" - RET=$(($RET + $?)) ; echo $MSG "DONE" - # Check that cython casting is of the form `obj` as opposed to ` obj`; # it doesn't make a difference, but we want to be internally consistent. # Note: this grep pattern is (intended to be) equivalent to the python @@ -100,65 +67,11 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of not concatenated strings' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" --format="##[error]{source_path}:{line_number}:{msg}" . - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" . - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for strings with wrong placed spaces' ; echo $MSG - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" --format="##[error]{source_path}:{line_number}:{msg}" . - else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" . 
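For context on the ``strings_to_concatenate`` check removed above: Python implicitly concatenates adjacent string literals, an easy source of accidental bugs in long argument lists. A simplified, standard-library-only sketch of this kind of check (the real logic lives in ``scripts/validate_unwanted_patterns.py``):

.. code-block:: python

   import io
   import tokenize

   def strings_to_concatenate(source: str):
       """Yield the (row, col) of every string literal that is implicitly
       concatenated with the literal before it."""
       tokens = list(tokenize.generate_tokens(io.StringIO(source).readline))
       for prev, curr in zip(tokens, tokens[1:]):
           if prev.type == tokenize.STRING and curr.type == tokenize.STRING:
               yield curr.start

   sample = 'msg = ("implicitly " "concatenated")\n'
   print(list(strings_to_concatenate(sample)))  # [(1, 21)]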
- fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - - echo "isort --version-number" - isort --version-number - - # Imports - Check formatting using isort see setup.cfg for settings - MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" - if [[ "$GITHUB_ACTIONS" == "true" ]]; then - eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) - else - eval $ISORT_CMD - fi - RET=$(($RET + $?)) ; echo $MSG "DONE" - fi ### PATTERNS ### if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then - # Check for imports from pandas.core.common instead of `import pandas.core.common as com` - # Check for imports from collections.abc instead of `from collections import abc` - MSG='Check for non-standard imports' ; echo $MSG - invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from pandas.core import common" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from collections.abc import" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from numpy import nan" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # Checks for test suite - # Check for imports from pandas._testing instead of `import pandas._testing as tm` - invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "from pandas import _testing as tm" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - - # No direct imports from conftest - invgrep -R --include="*.py*" -E "conftest import" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - invgrep -R --include="*.py*" -E "import conftest" pandas/tests - RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of exec' ; echo $MSG invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -171,12 +84,8 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -r -E --include '*.py' "[[:space:]] pytest.raises" pandas/tests/ RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for python2-style file encodings' ; echo $MSG - invgrep -R --include="*.py" --include="*.pyx" -E "# -\*- coding: utf-8 -\*-" pandas scripts - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Check for python2-style super usage' ; echo $MSG - invgrep -R --include="*.py" -E "super\(\w*, (self|cls)\)" pandas + MSG='Check for use of builtin filter function' ; echo $MSG + invgrep -R --include="*.py" -P '(?=1.21 - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - attrs=19.1.0 - - gcc_linux-32 - - gxx_linux-32 - - numpy=1.14.* - - python-dateutil - - pytz=2017.2 - - # see comment above - - pip - - pip: - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml deleted file mode 100644 index d31015fde4741..0000000000000 --- a/ci/deps/azure-36-locale.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: pandas-dev -channels: - - defaults - - conda-forge -dependencies: - - python=3.6.* - - # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 - - pytest-xdist>=1.21 - - pytest-asyncio - - hypothesis>=3.58.0 - - pytest-azurepipelines - - # pandas dependencies - - beautifulsoup4 - - html5lib - - ipython - - jinja2 - - lxml - - matplotlib=3.0.* - - nomkl - - numexpr - - numpy=1.15.* - - openpyxl - # lowest supported version of pyarrow (putting it here instead of in - # 
azure-36-minimum_versions because it needs numpy >= 1.14) - - pyarrow=0.13 - - pytables - - python-dateutil - - pytz - - scipy - - xarray - - xlrd - - xlsxwriter - - xlwt - - moto diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml similarity index 52% rename from ci/deps/azure-36-locale_slow.yaml rename to ci/deps/azure-37-locale_slow.yaml index 23121b985492e..7f658fe62d268 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -3,11 +3,11 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -16,17 +16,15 @@ dependencies: - beautifulsoup4=4.6.0 - bottleneck=1.2.* - lxml - - matplotlib=2.2.2 - - numpy=1.14.* - - openpyxl=2.5.7 + - matplotlib=3.0.0 + - numpy=1.16.* + - openpyxl=2.6.0 - python-dateutil - python-blosc - - pytz=2017.2 + - pytz=2017.3 - scipy - - sqlalchemy=1.1.4 - - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 - - pip - - pip: - - html5lib==1.0b2 + - sqlalchemy=1.2.8 + - xlrd=1.2.0 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 + - html5lib=1.0.1 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml similarity index 56% rename from ci/deps/azure-36-minimum_versions.yaml rename to ci/deps/azure-37-minimum_versions.yaml index 9f66f82720b5b..f184ea87c89fe 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -2,11 +2,11 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.6.1 + - python=3.7.1 # tools - - cython=0.29.16 - - pytest>=5.0.1, <6.0.0rc0 + - cython=0.29.21 + - pytest=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -15,16 +15,17 @@ dependencies: # pandas dependencies - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - - jinja2=2.8 + - jinja2=2.10 - numba=0.46.0 - - numexpr=2.6.2 - - numpy=1.15.4 - - openpyxl=2.5.7 - - pytables=3.4.3 + - numexpr=2.6.8 + - numpy=1.16.5 + - openpyxl=2.6.0 + - pytables=3.5.1 - python-dateutil=2.7.3 - - pytz=2017.2 + - pytz=2017.3 + - pyarrow=0.15 - scipy=1.2 - - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 + - xlrd=1.2.0 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 - html5lib=1.0.1 diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml new file mode 100644 index 0000000000000..50fccf86b6340 --- /dev/null +++ b/ci/deps/azure-37-slow.yaml @@ -0,0 +1,38 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - python=3.7.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - beautifulsoup4 + - fsspec>=0.7.4 + - html5lib + - lxml + - matplotlib + - numexpr + - numpy + - openpyxl + - patsy + - psycopg2 + - pymysql + - pytables + - python-dateutil + - pytz + - s3fs>=0.4.0 + - moto>=1.3.14 + - scipy + - sqlalchemy + - xlrd + - xlsxwriter + - xlwt + - moto + - flask diff --git a/ci/deps/travis-37.yaml b/ci/deps/azure-37.yaml similarity index 77% rename from ci/deps/travis-37.yaml rename to ci/deps/azure-37.yaml index aaf706d61fe5c..82cb6760b6d1e 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/azure-37.yaml @@ -6,10 +6,11 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 + - pytest-azurepipelines # pandas dependencies - botocore>=1.11 @@ 
-20,8 +21,8 @@ dependencies: - pyarrow - pytz - s3fs>=0.4.0 + - moto>=1.3.14 + - flask - tabulate - pyreadstat - pip - - pip: - - moto diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-38-locale.yaml similarity index 74% rename from ci/deps/azure-37-locale.yaml rename to ci/deps/azure-38-locale.yaml index 714e1100b1e1a..f879111a32e67 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -2,23 +2,24 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.7.* + - python=3.8.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-asyncio + - pytest-asyncio>=0.12.0 - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies - beautifulsoup4 + - flask - html5lib - ipython - jinja2 - lxml - - matplotlib + - matplotlib <3.3.0 - moto - nomkl - numexpr @@ -32,7 +33,8 @@ dependencies: - xlrd - xlsxwriter - xlwt - - pyarrow>=0.15 + - moto + - pyarrow=1.0.0 - pip - pip: - pyxlsb diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-38-numpydev.yaml similarity index 80% rename from ci/deps/azure-37-numpydev.yaml rename to ci/deps/azure-38-numpydev.yaml index 451fb5884a4af..f11a3bcb28ab2 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-38-numpydev.yaml @@ -2,19 +2,19 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.7.* + - python=3.8.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines # pandas dependencies - pytz - - pip + - pip=20.2 - pip: - - cython==0.29.16 # GH#34014 + - cython==0.29.21 # GH#34014 - "git+git://github.com/dateutil/dateutil.git" - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" diff --git a/ci/deps/travis-38.yaml b/ci/deps/azure-38.yaml similarity index 78% rename from ci/deps/travis-38.yaml rename to ci/deps/azure-38.yaml index ac39a223cd086..954e9710f79b9 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/azure-38.yaml @@ -6,15 +6,15 @@ dependencies: - python=3.8.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 + - pytest-azurepipelines # pandas dependencies - numpy - python-dateutil - nomkl - pytz - - pip - tabulate==0.8.3 diff --git a/ci/deps/azure-39.yaml b/ci/deps/azure-39.yaml new file mode 100644 index 0000000000000..c4c84e73fa684 --- /dev/null +++ b/ci/deps/azure-39.yaml @@ -0,0 +1,22 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.9.* + + # tools + - cython>=0.29.21 + - pytest>=5.0.1 + - pytest-xdist>=1.21 + - hypothesis>=3.58.0 + - pytest-azurepipelines + + # pandas dependencies + - numpy + - python-dateutil + - pytz + + # optional dependencies + - pytables + - scipy + - pyarrow=1.0 diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-37.yaml similarity index 80% rename from ci/deps/azure-macos-36.yaml rename to ci/deps/azure-macos-37.yaml index 81a27465f9e61..31e0ffca81424 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -2,10 +2,10 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.6.* + - python=3.7.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -19,9 +19,9 @@ dependencies: - matplotlib=2.2.3 - nomkl - numexpr - - numpy=1.15.4 + - numpy=1.16.5 - openpyxl - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pytables - python-dateutil==2.7.3 - pytz @@ -31,6 +31,6 @@ 
dependencies: - xlwt - pip - pip: - - cython>=0.29.16 + - cython>=0.29.21 - pyreadstat - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 34fca631df6c1..16b4bd72683b4 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -6,8 +6,8 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -15,21 +15,22 @@ dependencies: # pandas dependencies - beautifulsoup4 - bottleneck - - fsspec>=0.7.4 + - fsspec>=0.8.0 - gcsfs>=0.6.0 - html5lib - jinja2 - lxml - matplotlib=2.2.* - - moto + - moto>=1.3.14 + - flask - numexpr - - numpy=1.18.* + - numpy=1.16.* - openpyxl - - pyarrow=0.14 + - pyarrow=0.15 - pytables - python-dateutil - pytz - - s3fs>=0.4.0 + - s3fs>=0.4.2 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-38.yaml similarity index 67% rename from ci/deps/azure-windows-36.yaml rename to ci/deps/azure-windows-38.yaml index 4d7e1d821037b..449bbd05991bf 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -3,11 +3,11 @@ channels: - conda-forge - defaults dependencies: - - python=3.6.* + - python=3.8.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -16,16 +16,20 @@ dependencies: - blosc - bottleneck - fastparquet>=0.3.2 - - matplotlib=3.0.2 + - flask + - fsspec>=0.8.0 + - matplotlib=3.1.3 + - moto>=1.3.14 - numba - numexpr - - numpy=1.15.* + - numpy=1.18.* - openpyxl - jinja2 - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pytables - python-dateutil - pytz + - s3fs>=0.4.0 - scipy - xlrd - xlsxwriter diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index f434a03609b26..8df6104f43a50 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -1,13 +1,12 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.7.* # tools - - cython>=0.29.13 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -17,5 +16,6 @@ dependencies: - python-dateutil - pytz - pip + - flask - pip: - moto diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-37-cov.yaml similarity index 65% rename from ci/deps/travis-36-cov.yaml rename to ci/deps/travis-37-cov.yaml index 5f5ea8034cddf..c89b42ef06a2e 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -1,21 +1,19 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - - pytest-cov # this is only needed in the coverage build + - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies - beautifulsoup4 - botocore>=1.11 - - cython>=0.29.16 - dask - fastparquet>=0.3.2 - fsspec>=0.7.4 @@ -23,18 +21,21 @@ dependencies: - geopandas - html5lib - matplotlib - - moto + - moto>=1.3.14 + - flask - nomkl - numexpr - - numpy=1.15.* + - numpy=1.16.* - odfpy - openpyxl - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 - psycopg2 - - pyarrow>=0.13.0 - - pymysql + - pyarrow>=0.15.0 + - pymysql<0.10.0 # temporary pin, GH 36465 - pytables - python-snappy + - python-dateutil - pytz - s3fs>=0.4.0 - scikit-learn @@ -50,5 +51,4 @@ 
dependencies: - brotlipy - coverage - pandas-datareader - - python-dateutil - pyxlsb diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-37-locale.yaml similarity index 54% rename from ci/deps/travis-36-locale.yaml rename to ci/deps/travis-37-locale.yaml index 6bc4aba733ee5..4e442b10482a7 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -3,38 +3,45 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - # pandas dependencies + # required + - numpy + - python-dateutil + - pytz + + # optional - beautifulsoup4 - - blosc=1.14.3 + - blosc=1.15.0 - python-blosc - fastparquet=0.3.2 - html5lib - ipython - jinja2 - - lxml=3.8.0 - - matplotlib=3.0.* - - moto + - lxml=4.3.0 + - matplotlib - nomkl - numexpr - - numpy - openpyxl - - pandas-gbq=0.12.0 - - psycopg2=2.6.2 - - pymysql=0.7.11 - - pytables - - python-dateutil - - pytz + - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 + - pyarrow>=0.17 + - pytables>=3.5.1 - scipy - - sqlalchemy=1.1.4 - - xarray=0.10 + - xarray=0.12.3 - xlrd - xlsxwriter - xlwt + - moto + - flask + + # sql + - psycopg2=2.7 + - pymysql=0.7.11 + - sqlalchemy=1.3.0 diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/travis-38-slow.yaml similarity index 83% rename from ci/deps/azure-36-slow.yaml rename to ci/deps/travis-38-slow.yaml index 0a6d1d13c8549..e4b719006a11e 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/travis-38-slow.yaml @@ -3,11 +3,11 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.8.* # tools - - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - cython>=0.29.21 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 @@ -27,9 +27,11 @@ dependencies: - python-dateutil - pytz - s3fs>=0.4.0 + - moto>=1.3.14 - scipy - sqlalchemy - xlrd - xlsxwriter - xlwt - moto + - flask diff --git a/ci/run_tests.sh b/ci/run_tests.sh index fda2005ce7843..78d24c814840a 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -22,6 +22,12 @@ fi PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile -s --strict --durations=30 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" +if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then + # GH#37455 windows py38 build appears to be running out of memory + # skip collection of window tests + PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/" +fi + echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/ci/setup_env.sh b/ci/setup_env.sh index aa43d8b7dd00a..c36422884f2ec 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,10 +1,5 @@ #!/bin/bash -e -if [ "$JOB" == "3.9-dev" ]; then - /bin/bash ci/build39.sh - exit 0 -fi - # edit the locale file if needed if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" @@ -42,8 +37,7 @@ else fi if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then - sudo apt-get -y install xvfb - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh" + CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-1/Miniforge3-4.8.5-1-Linux-aarch64.sh" else CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" fi @@ -99,8 +93,6 @@ echo "conda list (root environment)" conda list # Clean up any left-over from a previous build -# (note workaround for 
https://github.com/conda/conda/issues/2679: -# `conda env remove` issue) conda remove --all -q -y -n pandas-dev echo @@ -116,6 +108,12 @@ fi echo "activate pandas-dev" source activate pandas-dev +# Explicitly set an environment variable indicating that this is pandas' CI environment. +# +# This allows us to enable things like -Werror that shouldn't be activated in +# downstream CI jobs that may also build pandas from source. +export PANDAS_CI=1 + echo echo "remove any installed pandas package" echo "w/o removing anything else" @@ -139,14 +137,8 @@ conda list pandas # Make sure any error below is reported as such echo "[Build extensions]" -python setup.py build_ext -q -i -j2 - -# TODO: Some of our environments end up with old versions of pip (10.x) -# Adding a new enough version of pip to the requirements explodes the -# solve time. Just using pip to update itself. -# - py35_macos -# - py35_compat -# - py36_32bit +python setup.py build_ext -q -j2 + echo "[Updating pip]" python -m pip install --no-deps -U pip wheel setuptools diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh index fccf8e1e8deff..b5118ad5defc6 100755 --- a/ci/travis_process_gbq_encryption.sh +++ b/ci/travis_process_gbq_encryption.sh @@ -10,4 +10,3 @@ elif [[ -n ${!TRAVIS_IV_ENV} ]]; then export GBQ_PROJECT_ID='pandas-gbq-tests'; echo 'Successfully decrypted gbq credentials' fi - diff --git a/codecov.yml b/codecov.yml index 1644bf315e0ac..6dd1e33a7a671 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,7 +1,7 @@ codecov: branch: master -comment: off +comment: false coverage: status: @@ -11,3 +11,6 @@ coverage: patch: default: target: '50' + +github_checks: + annotations: false diff --git a/doc/data/iris.data b/doc/data/iris.data index c19b9c3688515..026e214e5f754 100644 --- a/doc/data/iris.data +++ b/doc/data/iris.data @@ -148,4 +148,4 @@ SepalLength,SepalWidth,PetalLength,PetalWidth,Name 6.3,2.5,5.0,1.9,Iris-virginica 6.5,3.0,5.2,2.0,Iris-virginica 6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file +5.9,3.0,5.1,1.8,Iris-virginica diff --git a/doc/make.py b/doc/make.py index db729853e5834..40ce9ea3bbcd2 100755 --- a/doc/make.py +++ b/doc/make.py @@ -286,12 +286,12 @@ def main(): joined = ",".join(cmds) argparser = argparse.ArgumentParser( - description="pandas documentation builder", epilog=f"Commands: {joined}", + description="pandas documentation builder", epilog=f"Commands: {joined}" ) joined = ", ".join(cmds) argparser.add_argument( - "command", nargs="?", default="html", help=f"command to run: {joined}", + "command", nargs="?", default="html", help=f"command to run: {joined}" ) argparser.add_argument( "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build" diff --git a/doc/redirects.csv b/doc/redirects.csv index bceb4b5961324..de69d0168835d 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -542,7 +542,9 @@ generated/pandas.DatetimeIndex.date,../reference/api/pandas.DatetimeIndex.date generated/pandas.DatetimeIndex.day,../reference/api/pandas.DatetimeIndex.day generated/pandas.DatetimeIndex.day_name,../reference/api/pandas.DatetimeIndex.day_name generated/pandas.DatetimeIndex.dayofweek,../reference/api/pandas.DatetimeIndex.dayofweek +generated/pandas.DatetimeIndex.day_of_week,../reference/api/pandas.DatetimeIndex.day_of_week generated/pandas.DatetimeIndex.dayofyear,../reference/api/pandas.DatetimeIndex.dayofyear +generated/pandas.DatetimeIndex.day_of_year,../reference/api/pandas.DatetimeIndex.day_of_year 
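The redirect entries being added here point at new snake_case aliases (``day_of_week``, ``day_of_year``) for the long-standing ``dayofweek``/``dayofyear`` accessors. A quick illustration, assuming a pandas build that already ships the aliases:

.. code-block:: python

   import pandas as pd

   ts = pd.Timestamp("2020-12-31")

   # The snake_case aliases return exactly what the older names return.
   assert ts.day_of_week == ts.dayofweek == 3    # Thursday (Monday == 0)
   assert ts.day_of_year == ts.dayofyear == 366  # 2020 was a leap year

   idx = pd.date_range("2020-12-28", periods=4)
   print(list(idx.day_of_week))  # [0, 1, 2, 3]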
generated/pandas.DatetimeIndex.floor,../reference/api/pandas.DatetimeIndex.floor generated/pandas.DatetimeIndex.freq,../reference/api/pandas.DatetimeIndex.freq generated/pandas.DatetimeIndex.freqstr,../reference/api/pandas.DatetimeIndex.freqstr @@ -839,7 +841,9 @@ generated/pandas.option_context,../reference/api/pandas.option_context generated/pandas.Period.asfreq,../reference/api/pandas.Period.asfreq generated/pandas.Period.day,../reference/api/pandas.Period.day generated/pandas.Period.dayofweek,../reference/api/pandas.Period.dayofweek +generated/pandas.Period.day_of_week,../reference/api/pandas.Period.day_of_week generated/pandas.Period.dayofyear,../reference/api/pandas.Period.dayofyear +generated/pandas.Period.day_of_year,../reference/api/pandas.Period.day_of_year generated/pandas.Period.days_in_month,../reference/api/pandas.Period.days_in_month generated/pandas.Period.daysinmonth,../reference/api/pandas.Period.daysinmonth generated/pandas.Period.end_time,../reference/api/pandas.Period.end_time @@ -850,7 +854,9 @@ generated/pandas.Period,../reference/api/pandas.Period generated/pandas.PeriodIndex.asfreq,../reference/api/pandas.PeriodIndex.asfreq generated/pandas.PeriodIndex.day,../reference/api/pandas.PeriodIndex.day generated/pandas.PeriodIndex.dayofweek,../reference/api/pandas.PeriodIndex.dayofweek +generated/pandas.PeriodIndex.day_of_week,../reference/api/pandas.PeriodIndex.day_of_week generated/pandas.PeriodIndex.dayofyear,../reference/api/pandas.PeriodIndex.dayofyear +generated/pandas.PeriodIndex.day_of_year,../reference/api/pandas.PeriodIndex.day_of_year generated/pandas.PeriodIndex.days_in_month,../reference/api/pandas.PeriodIndex.days_in_month generated/pandas.PeriodIndex.daysinmonth,../reference/api/pandas.PeriodIndex.daysinmonth generated/pandas.PeriodIndex.end_time,../reference/api/pandas.PeriodIndex.end_time @@ -993,7 +999,9 @@ generated/pandas.Series.dt.date,../reference/api/pandas.Series.dt.date generated/pandas.Series.dt.day,../reference/api/pandas.Series.dt.day generated/pandas.Series.dt.day_name,../reference/api/pandas.Series.dt.day_name generated/pandas.Series.dt.dayofweek,../reference/api/pandas.Series.dt.dayofweek +generated/pandas.Series.dt.day_of_week,../reference/api/pandas.Series.dt.day_of_week generated/pandas.Series.dt.dayofyear,../reference/api/pandas.Series.dt.dayofyear +generated/pandas.Series.dt.day_of_year,../reference/api/pandas.Series.dt.day_of_year generated/pandas.Series.dt.days,../reference/api/pandas.Series.dt.days generated/pandas.Series.dt.days_in_month,../reference/api/pandas.Series.dt.days_in_month generated/pandas.Series.dt.daysinmonth,../reference/api/pandas.Series.dt.daysinmonth @@ -1326,7 +1334,9 @@ generated/pandas.Timestamp.date,../reference/api/pandas.Timestamp.date generated/pandas.Timestamp.day,../reference/api/pandas.Timestamp.day generated/pandas.Timestamp.day_name,../reference/api/pandas.Timestamp.day_name generated/pandas.Timestamp.dayofweek,../reference/api/pandas.Timestamp.dayofweek +generated/pandas.Timestamp.day_of_week,../reference/api/pandas.Timestamp.day_of_week generated/pandas.Timestamp.dayofyear,../reference/api/pandas.Timestamp.dayofyear +generated/pandas.Timestamp.day_of_year,../reference/api/pandas.Timestamp.day_of_year generated/pandas.Timestamp.days_in_month,../reference/api/pandas.Timestamp.days_in_month generated/pandas.Timestamp.daysinmonth,../reference/api/pandas.Timestamp.daysinmonth generated/pandas.Timestamp.dst,../reference/api/pandas.Timestamp.dst diff --git a/doc/source/conf.py b/doc/source/conf.py index 
ee0d4ca3f2a24..15e7a13ff5b72 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -146,7 +146,7 @@ # built documents. # # The short X.Y version. -import pandas # noqa: E402 isort:skip +import pandas # isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -308,7 +308,7 @@ for method in methods: # ... and each of its public methods - moved_api_pages.append((f"{old}.{method}", f"{new}.{method}",)) + moved_api_pages.append((f"{old}.{method}", f"{new}.{method}")) if pattern is None: html_additional_pages = { @@ -441,14 +441,14 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx # noqa: E402 isort:skip -from sphinx.util import rpartition # noqa: E402 isort:skip -from sphinx.ext.autodoc import ( # noqa: E402 isort:skip +import sphinx # isort:skip +from sphinx.util import rpartition # isort:skip +from sphinx.ext.autodoc import ( # isort:skip AttributeDocumenter, Documenter, MethodDocumenter, ) -from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip +from sphinx.ext.autosummary import Autosummary # isort:skip class AccessorDocumenter(MethodDocumenter): diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst index 11d0c35f92ff5..9477a9ac79dd6 100644 --- a/doc/source/development/code_style.rst +++ b/doc/source/development/code_style.rst @@ -9,11 +9,12 @@ pandas code style guide .. contents:: Table of contents: :local: -*pandas* follows the `PEP8 `_ +pandas follows the `PEP8 `_ standard and uses `Black `_ and `Flake8 `_ to ensure a -consistent code format throughout the project. For details see the -:ref:`contributing guide to pandas`. +consistent code format throughout the project. We encourage you to use +:ref:`pre-commit ` to automatically run ``black``, +``flake8``, ``isort``, and related code checks when you make a git commit. Patterns ======== @@ -172,5 +173,6 @@ Reading from a url .. code-block:: python from pandas.io.common import urlopen - with urlopen('http://www.google.com') as url: + + with urlopen("http://www.google.com") as url: raw_text = url.read() diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b85e9403038ab..86d495ef2b097 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -31,13 +31,13 @@ comment letting others know they are working on an issue. While this is ok, you check each issue individually, and it's not possible to find the unassigned ones. For this reason, we implemented a workaround consisting of adding a comment with the exact -text `take`. When you do it, a GitHub action will automatically assign you the issue +text ``take``. When you do it, a GitHub action will automatically assign you the issue (this will take seconds, and may require refreshing the page to see it). By doing this, it's possible to filter the list of issues and find only the unassigned ones. So, a good way to find an issue to start contributing to pandas is to check the list of `unassigned good first issues `_ -and assign yourself one you like by writing a comment with the exact text `take`. +and assign yourself one you like by writing a comment with the exact text ``take``. If for whatever reason you are not able to continue working with the issue, please try to unassign it, so other people know it's available again. 
You can check the list of @@ -133,7 +133,7 @@ want to clone your fork to your machine:: cd pandas-yourname git remote add upstream https://github.com/pandas-dev/pandas.git -This creates the directory `pandas-yourname` and connects your repository to +This creates the directory ``pandas-yourname`` and connects your repository to the upstream (main project) *pandas* repository. Note that performing a shallow clone (with ``--depth==N``, for some ``N`` greater @@ -146,31 +146,63 @@ Creating a development environment ---------------------------------- To test out code changes, you'll need to build pandas from source, which -requires a C compiler and Python environment. If you're making documentation -changes, you can skip to :ref:`contributing.documentation` but you won't be able -to build the documentation locally before pushing your changes. +requires a C/C++ compiler and Python environment. If you're making documentation +changes, you can skip to :ref:`contributing.documentation` but if you skip +creating the development environment you won't be able to build the documentation +locally before pushing your changes. Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ -Instead of manually setting up a development environment, you can use Docker to -automatically create the environment with just several commands. Pandas provides a `DockerFile` -in the root directory to build a Docker image with a full pandas development environment. +Instead of manually setting up a development environment, you can use `Docker +`_ to automatically create the environment with just several +commands. pandas provides a ``DockerFile`` in the root directory to build a Docker image +with a full pandas development environment. -Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code, -a popular free IDE, using the `.devcontainer.json` file. +**Docker Commands** + +Pass your GitHub username in the ``DockerFile`` to use your own fork:: + + # Build the image pandas-yourname-env + docker build --tag pandas-yourname-env . + # Run a container and bind your local forked repo, pandas-yourname, to the container + docker run -it --rm -v path-to-pandas-yourname:/home/pandas-yourname pandas-yourname-env + +Even easier, you can integrate Docker with the following IDEs: + +**Visual Studio Code** + +You can use the DockerFile to launch a remote session with Visual Studio Code, +a popular free IDE, using the ``.devcontainer.json`` file. See https://code.visualstudio.com/docs/remote/containers for details. +**PyCharm (Professional)** + +Enable Docker support and use the Services tool window to build and manage images as well as +run and interact with containers. +See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Note that you might need to rebuild the C extensions if/when you merge with upstream/master using:: + + python setup.py build_ext -j 4 + .. _contributing.dev_c: Installing a C compiler ~~~~~~~~~~~~~~~~~~~~~~~ -Pandas uses C extensions (mostly written using Cython) to speed up certain +pandas uses C extensions (mostly written using Cython) to speed up certain operations. To install pandas from source, you need to compile these C extensions, which means you need a C compiler. This process depends on which platform you're using. +If you have setup your environment using ``conda``, the packages ``c-compiler`` +and ``cxx-compiler`` will install a fitting compiler for your platform that is +compatible with the remaining conda packages. 
On Windows and macOS, you will +also need to install the SDKs as they have to be distributed separately. +These packages will be automatically installed by using ``pandas``'s +``environment.yml``. + **Windows** You will need `Build Tools for Visual Studio 2017 @@ -180,13 +212,35 @@ You will need `Build Tools for Visual Studio 2017 You DO NOT need to install Visual Studio 2019. You only need "Build Tools for Visual Studio 2019" found by scrolling down to "All downloads" -> "Tools for Visual Studio 2019". + In the installer, select the "C++ build tools" workload. + +You can install the necessary components on the commandline using +`vs_buildtools.exe `_: + +.. code:: + + vs_buildtools.exe --quiet --wait --norestart --nocache ^ + --installPath C:\BuildTools ^ + --add "Microsoft.VisualStudio.Workload.VCTools;includeRecommended" ^ + --add Microsoft.VisualStudio.Component.VC.v141 ^ + --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 ^ + --add Microsoft.VisualStudio.Component.Windows10SDK.17763 -**Mac OS** +To setup the right paths on the commandline, call +``"C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.16 10.0.17763.0``. -Information about compiler installation can be found here: +**macOS** + +To use the ``conda``-based compilers, you will need to install the +Developer Tools using ``xcode-select --install``. Otherwise +information about compiler installation can be found here: https://devguide.python.org/setup/#macos -**Unix** +**Linux** + +For Linux-based ``conda`` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -218,11 +272,10 @@ Let us know if you have any difficulties by opening an issue or reaching out on Creating a Python environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Now that you have a C compiler, create an isolated pandas development -environment: +Now create an isolated pandas development environment: -* Install either `Anaconda `_ or `miniconda - `_ +* Install either `Anaconda `_, `miniconda + `_, or `miniforge `_ * Make sure your conda is up to date (``conda update conda``) * Make sure that you have :ref:`cloned the repository ` * ``cd`` to the pandas source directory @@ -243,7 +296,7 @@ We'll now kick off a three-step process: source activate pandas-dev # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 At this point you should be able to import pandas from your locally built version:: @@ -274,7 +327,7 @@ Creating a Python environment (pip) If you aren't using conda for your development environment, follow these instructions. You'll need to have at least Python 3.6.1 installed on your system. -**Unix**/**Mac OS with virtualenv** +**Unix**/**macOS with virtualenv** .. code-block:: bash @@ -290,10 +343,10 @@ You'll need to have at least Python 3.6.1 installed on your system. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 -**Unix**/**Mac OS with pyenv** +**Unix**/**macOS with pyenv** Consult the docs for setting up pyenv `here `__. @@ -314,7 +367,7 @@ Consult the docs for setting up pyenv `here `__. 
python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 **Windows** @@ -340,7 +393,7 @@ should already exist. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 4 + python setup.py build_ext -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 Creating a branch @@ -417,7 +470,7 @@ Some other important things to know about the docs: contributing_docstring.rst -* The tutorials make heavy use of the `ipython directive +* The tutorials make heavy use of the `IPython directive `_ sphinx extension. This directive lets you put code in the documentation which will be run during the doc build. For example:: @@ -573,7 +626,7 @@ Building master branch documentation When pull requests are merged into the pandas ``master`` branch, the main parts of the documentation are also built by Travis-CI. These docs are then hosted `here -`__, see also +`__, see also the :ref:`Continuous Integration ` section. .. _contributing.code: @@ -609,7 +662,50 @@ do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. -Additional standards are outlined on the :ref:`pandas code style guide ` +In addition to ``./ci/code_checks.sh``, some extra checks are run by +``pre-commit`` - see :ref:`here ` for how to +run them. + +Additional standards are outlined on the :ref:`pandas code style guide `. + +.. _contributing.pre-commit: + +Pre-commit +---------- + +You can run many of these styling checks manually as we have described above. However, +we encourage you to use `pre-commit hooks `_ instead +to automatically run ``black``, ``flake8``, ``isort`` when you make a git commit. This +can be done by installing ``pre-commit``:: + + pip install pre-commit + +and then running:: + + pre-commit install + +from the root of the pandas repository. Now all of the styling checks will be +run each time you commit changes without your needing to run each one manually. +In addition, using ``pre-commit`` will also allow you to more easily +remain up-to-date with our code checks as they change. + +Note that if needed, you can skip these checks with ``git commit --no-verify``. + +If you don't want to use ``pre-commit`` as part of your workflow, you can still use it +to run its checks with:: + + pre-commit run --files + +without needing to have done ``pre-commit install`` beforehand. + +.. note:: + + If you have conflicting installations of ``virtualenv``, then you may get an + error - see `here `_. + + Also, due to a `bug in virtualenv `_, + you may run into issues if you're using conda. To solve this, you can downgrade + ``virtualenv`` to version ``20.0.33``. Optional dependencies --------------------- @@ -683,7 +779,7 @@ Python (PEP8 / black) pandas follows the `PEP8 `_ standard and uses `Black `_ and `Flake8 `_ to ensure a consistent code -format throughout the project. +format throughout the project. We encourage you to use :ref:`pre-commit `. :ref:`Continuous Integration ` will run those tools and report any stylistic errors in your code. Therefore, it is helpful before @@ -695,12 +791,9 @@ submitting code to run the check yourself:: to auto-format your code. Additionally, many editors have plugins that will apply ``black`` as you edit files. 
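Returning to the environment builds above: once ``python setup.py build_ext -j 4`` and the editable install have finished, a quick smoke test confirms that the locally built pandas is the one being imported (a sketch; the exact ``.dev`` version string will vary):

.. code-block:: python

   import pandas as pd

   # A source build from master reports a development version such as
   # "1.2.0.dev0+...", rather than a released version.
   print(pd.__version__)

   # show_versions() also lists the commit hash and resolved dependencies,
   # which is handy when reporting issues against your build.
   pd.show_versions()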
-You should use a ``black`` version >= 19.10b0 as previous versions are not compatible +You should use a ``black`` version 20.8b1 as previous versions are not compatible with the pandas codebase. -If you wish to run these checks automatically, we encourage you to use -:ref:`pre-commits ` instead. - One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this command will catch any stylistic errors in your changes specifically, but be beware it may not catch all of them. For example, if you delete the only @@ -751,9 +844,9 @@ Imports are alphabetically sorted within these sections. As part of :ref:`Continuous Integration ` checks we run:: - isort --recursive --check-only pandas + isort --check-only pandas -to check that imports are correctly formatted as per the `setup.cfg`. +to check that imports are correctly formatted as per the ``setup.cfg``. If you see output like the below in :ref:`Continuous Integration ` checks: @@ -770,8 +863,6 @@ You should run:: to automatically format imports correctly. This will modify your local copy of the files. -The `--recursive` flag can be passed to sort all files in a directory. - Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: git diff upstream/master --name-only -- "*.py" | xargs -r isort @@ -780,29 +871,6 @@ Where similar caveats apply if you are on OSX or Windows. You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `. -.. _contributing.pre-commit: - -Pre-commit -~~~~~~~~~~ - -You can run many of these styling checks manually as we have described above. However, -we encourage you to use `pre-commit hooks `_ instead -to automatically run ``black``, ``flake8``, ``isort`` when you make a git commit. This -can be done by installing ``pre-commit``:: - - pip install pre-commit - -and then running:: - - pre-commit install - -from the root of the pandas repository. Now all of the styling checks will be -run each time you commit changes without your needing to run each one manually. -In addition, using this pre-commit hook will also allow you to more easily -remain up-to-date with our code checks as they change. - -Note that if needed, you can skip these checks with ``git commit --no-verify``. - Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -945,7 +1013,7 @@ For example, quite a few functions in pandas accept a ``dtype`` argument. This c def as_type(dtype: Dtype) -> ...: ... -This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like ``axis``. Development of this module is active so be sure to refer to the source for the most up to date list of available types. Validating type hints ~~~~~~~~~~~~~~~~~~~~~ @@ -1185,7 +1253,7 @@ This test shows off several useful features of Hypothesis, as well as demonstrating a good use-case: checking properties that should hold over a large or complicated domain of inputs. 
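To make the Hypothesis discussion concrete, here is a toy property-based test in the same spirit: rather than hand-picking inputs, a property is asserted over a whole generated domain. This example is illustrative only and not part of the pandas suite:

.. code-block:: python

   from hypothesis import given, strategies as st

   @given(st.lists(st.integers()))
   def test_reverse_roundtrip(xs):
       # Property: reversing a list twice returns the original list,
       # for any list of integers Hypothesis generates.
       assert list(reversed(list(reversed(xs)))) == xs

   test_reverse_roundtrip()  # Hypothesis runs the body over many generated lists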
-To keep the Pandas test suite running quickly, parametrized tests are +To keep the pandas test suite running quickly, parametrized tests are preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. @@ -1268,7 +1336,7 @@ Or with one of the following constructs:: Using `pytest-xdist `_, one can speed up local testing on multicore machines. To use this feature, you will -need to install `pytest-xdist` via:: +need to install ``pytest-xdist`` via:: pip install pytest-xdist @@ -1328,16 +1396,16 @@ environments. If you want to use virtualenv instead, write:: The ``-E virtualenv`` option should be added to all ``asv`` commands that run benchmarks. The default value is defined in ``asv.conf.json``. -Running the full test suite can take up to one hour and use up to 3GB of RAM. -Usually it is sufficient to paste only a subset of the results into the pull -request to show that the committed changes do not cause unexpected performance -regressions. You can run specific benchmarks using the ``-b`` flag, which -takes a regular expression. For example, this will only run tests from a -``pandas/asv_bench/benchmarks/groupby.py`` file:: +Running the full benchmark suite can be an all-day process, depending on your +hardware and its resource utilization. However, usually it is sufficient to paste +only a subset of the results into the pull request to show that the committed changes +do not cause unexpected performance regressions. You can run specific benchmarks +using the ``-b`` flag, which takes a regular expression. For example, this will +only run benchmarks from a ``pandas/asv_bench/benchmarks/groupby.py`` file:: asv continuous -f 1.1 upstream/master HEAD -b ^groupby -If you want to only run a specific group of tests from a file, you can do it +If you want to only run a specific group of benchmarks from a file, you can do it using ``.`` as a separator. For example:: asv continuous -f 1.1 upstream/master HEAD -b groupby.GroupByMethods @@ -1431,7 +1499,7 @@ The following defines how a commit message should be structured. Please referen relevant GitHub issues in your commit message using GH1234 or #1234. Either style is fine, but the former is generally preferred: -* a subject line with `< 80` chars. +* a subject line with ``< 80`` chars. * One blank line. * Optionally, a commit message body. diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 0c780ad5f5847..623d1e8d45565 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -25,25 +25,25 @@ The next example gives an idea of what a docstring looks like: """ Add up two integer numbers. - This function simply wraps the `+` operator, and does not + This function simply wraps the ``+`` operator, and does not do anything interesting, except for illustrating what the docstring of a very simple function looks like. Parameters ---------- num1 : int - First number to add + First number to add. num2 : int - Second number to add + Second number to add. Returns ------- int - The sum of `num1` and `num2` + The sum of ``num1`` and ``num2``. See Also -------- - subtract : Subtract one integer from another + subtract : Subtract one integer from another. Examples -------- @@ -63,14 +63,14 @@ The first conventions every Python docstring should follow are defined in `PEP-257 `_. 
As PEP-257 is quite broad, other more specific standards also exist. In the -case of pandas, the numpy docstring convention is followed. These conventions are +case of pandas, the NumPy docstring convention is followed. These conventions are explained in this document: * `numpydoc docstring guide `_ (which is based in the original `Guide to NumPy/SciPy documentation `_) -numpydoc is a Sphinx extension to support the numpy docstring convention. +numpydoc is a Sphinx extension to support the NumPy docstring convention. The standard uses reStructuredText (reST). reStructuredText is a markup language that allows encoding styles in plain text files. Documentation @@ -126,9 +126,9 @@ backticks. The following are considered inline code: def add_values(arr): """ - Add the values in `arr`. + Add the values in ``arr``. - This is equivalent to Python `sum` of :meth:`pandas.Series.sum`. + This is equivalent to Python ``sum`` of :meth:`pandas.Series.sum`. Some sections are omitted here for simplicity. """ @@ -144,13 +144,13 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature `def func():`. + It has a blank like after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. There is a blank line between the docstring and the first line - of code `foo = 1`. + of code ``foo = 1``. The closing quotes should be in the next line, not in this one.""" @@ -269,11 +269,11 @@ after, and not between the line with the word "Parameters" and the one with the hyphens. After the title, each parameter in the signature must be documented, including -`*args` and `**kwargs`, but not `self`. +``*args`` and ``**kwargs``, but not ``self``. The parameters are defined by their name, followed by a space, a colon, another space, and the type (or types). Note that the space between the name and the -colon is important. Types are not defined for `*args` and `**kwargs`, but must +colon is important. Types are not defined for ``*args`` and ``**kwargs``, but must be defined for all other parameters. After the parameter definition, it is required to have a line with the parameter description, which is indented, and can have multiple lines. The description must start with a capital letter, and @@ -285,13 +285,13 @@ comma at the end of the type. The exact form of the type in this case will be argument means, which can be added after a comma "int, default -1, meaning all cpus". -In cases where the default value is `None`, meaning that the value will not be -used. Instead of "str, default None", it is preferred to write "str, optional". -When `None` is a value being used, we will keep the form "str, default None". -For example, in `df.to_csv(compression=None)`, `None` is not a value being used, +In cases where the default value is ``None``, meaning that the value will not be +used. Instead of ``"str, default None"``, it is preferred to write ``"str, optional"``. +When ``None`` is a value being used, we will keep the form "str, default None". +For example, in ``df.to_csv(compression=None)``, ``None`` is not a value being used, but means that compression is optional, and no compression is being used if not -provided. In this case we will use `str, optional`. Only in cases like -`func(value=None)` and `None` is being used in the same way as `0` or `foo` +provided. In this case we will use ``"str, optional"``. 
Only in cases like +``func(value=None)`` and ``None`` is being used in the same way as ``0`` or ``foo`` would be used, then we will specify "str, int or None, default None". **Good:** @@ -331,13 +331,13 @@ would be used, then we will specify "str, int or None, default None". specified kind. Note the blank line between the parameters title and the first - parameter. Also, note that after the name of the parameter `kind` + parameter. Also, note that after the name of the parameter ``kind`` and before the colon, a space is missing. Also, note that the parameter descriptions do not start with a capital letter, and do not finish with a dot. - Finally, the `**kwargs` parameter is missing. + Finally, the ``**kwargs`` parameter is missing. Parameters ---------- @@ -361,9 +361,9 @@ boolean, etc): * str * bool -For complex types, define the subtypes. For `dict` and `tuple`, as more than +For complex types, define the subtypes. For ``dict`` and ``tuple``, as more than one type is present, we use the brackets to help read the type (curly brackets -for `dict` and normal brackets for `tuple`): +for ``dict`` and normal brackets for ``tuple``): * list of int * dict of {str : int} @@ -401,7 +401,7 @@ DataFrame: * pandas.Categorical * pandas.arrays.SparseArray -If the exact type is not relevant, but must be compatible with a numpy +If the exact type is not relevant, but must be compatible with a NumPy array, array-like can be specified. If Any type that can be iterated is accepted, iterable can be used: @@ -512,8 +512,8 @@ This section is used to let users know about pandas functionality related to the one being documented. In rare cases, if no related methods or functions can be found at all, this section can be skipped. -An obvious example would be the `head()` and `tail()` methods. As `tail()` does -the equivalent as `head()` but at the end of the `Series` or `DataFrame` +An obvious example would be the ``head()`` and ``tail()`` methods. As ``tail()`` does +the equivalent as ``head()`` but at the end of the ``Series`` or ``DataFrame`` instead of at the beginning, it is good to let the users know about it. To give an intuition on what can be considered related, here there are some @@ -608,8 +608,8 @@ Examples in docstrings, besides illustrating the usage of the function or method, must be valid Python code, that returns the given output in a deterministic way, and that can be copied and run by users. -Examples are presented as a session in the Python terminal. `>>>` is used to -present code. `...` is used for code continuing from the previous line. +Examples are presented as a session in the Python terminal. ``>>>`` is used to +present code. ``...`` is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). Comments describing the examples can be added with blank lines before and after them. @@ -664,7 +664,7 @@ A simple example could be: 4 Falcon dtype: object - With the `n` parameter, we can change the number of returned rows: + With the ``n`` parameter, we can change the number of returned rows: >>> s.head(n=3) 0 Ant @@ -742,7 +742,7 @@ positional arguments ``head(3)``. def fillna(self, value): """ - Replace missing values by `value`. + Replace missing values by ``value``. Examples -------- @@ -771,7 +771,7 @@ positional arguments ``head(3)``. def contains(self, pattern, case_sensitive=True, na=numpy.nan): """ - Return whether each value contains `pattern`. 
+       Return whether each value contains ``pattern``.

        In this case, we are illustrating how to use sections, even
        if the example is simple enough and does not require them.
@@ -788,8 +788,8 @@ positional arguments ``head(3)``.

        **Case sensitivity**

-       With `case_sensitive` set to `False` we can match `a` with both
-       `a` and `A`:
+       With ``case_sensitive`` set to ``False`` we can match ``a`` with both
+       ``a`` and ``A``:

        >>> s.contains(pattern='a', case_sensitive=False)
        0     True
@@ -800,7 +800,7 @@ positional arguments ``head(3)``.

        **Missing values**

-       We can fill missing values in the output using the `na` parameter:
+       We can fill missing values in the output using the ``na`` parameter:

        >>> s.contains(pattern='a', na=False)
        0    False
@@ -819,14 +819,14 @@ positional arguments ``head(3)``.
        """
        A sample DataFrame method.

-       Do not import numpy and pandas.
+       Do not import NumPy and pandas.

        Try to use meaningful data, when it makes the example easier
        to understand.

-       Try to avoid positional arguments like in `df.method(1)`. They
+       Try to avoid positional arguments like in ``df.method(1)``. They
        can be all right if previously defined with a meaningful name,
-       like in `present_value(interest_rate)`, but avoid them otherwise.
+       like in ``present_value(interest_rate)``, but avoid them otherwise.

        When presenting the behavior with different parameters, do not place
        all the calls one next to the other. Instead, add a short sentence
@@ -854,7 +854,7 @@ Tips for getting your examples to pass the doctests

Getting the examples to pass the doctests in the validation script can sometimes
be tricky. Here are some points to watch out for:

-* Import all needed libraries (except for pandas and numpy, those are already
+* Import all needed libraries (except for pandas and NumPy, those are already
  imported as ``import pandas as pd`` and ``import numpy as np``) and define
  all variables you use in the example.

@@ -914,7 +914,7 @@ plot will be generated automatically when building the documentation.

    class Series:
        def plot(self):
            """
-           Generate a plot with the `Series` data.
+           Generate a plot with the ``Series`` data.

            Examples
            --------
@@ -998,4 +998,4 @@ mapping function names to docstrings. Wherever possible, we prefer using

See ``pandas.core.generic.NDFrame.fillna`` for an example template, and
``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna``
-for the filled versions.
\ No newline at end of file
+for the filled versions.
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
new file mode 100644
index 0000000000000..358c4036df961
--- /dev/null
+++ b/doc/source/development/debugging_extensions.rst
@@ -0,0 +1,93 @@
+.. _debugging_c_extensions:
+
+{{ header }}
+
+======================
+Debugging C extensions
+======================
+
+Pandas uses select C extensions for high-performance IO operations. In case you need to debug segfaults or general issues with those extensions, the following steps may be helpful.
+
+First, be sure to compile the extensions with the appropriate flags to generate debug symbols and remove optimizations. This can be achieved as follows:
+
+.. code-block:: sh
+
+   python setup.py build_ext --inplace -j4 --with-debugging-symbols
+
+Using a debugger
+================
+
+Assuming you are on a Unix-like operating system, you can use either lldb or gdb to debug. The choice between the two largely depends on your compilation toolchain - typically you would use lldb if using clang and gdb if using gcc.
+For macOS users, please note that on modern systems ``gcc`` is an alias for ``clang``, so if using Xcode you usually opt for lldb. Regardless of which debugger you choose, please refer to your operating system's instructions on how to install it.
+
+After installing a debugger you can create a script that hits the extension module you are looking to debug. For demonstration purposes, let's assume you have a script called ``debug_testing.py`` with the following contents:
+
+.. code-block:: python
+
+   import pandas as pd
+
+   pd.DataFrame([[1, 2]]).to_json()
+
+Place the ``debug_testing.py`` script in the project root and launch a Python process under your debugger. If using lldb:
+
+.. code-block:: sh
+
+   lldb python
+
+If using gdb:
+
+.. code-block:: sh
+
+   gdb python
+
+Before executing our script, let's set a breakpoint in our JSON serializer in its entry function called ``objToJSON``. The lldb syntax would look as follows:
+
+.. code-block:: sh
+
+   breakpoint set --name objToJSON
+
+Similarly for gdb:
+
+.. code-block:: sh
+
+   break objToJSON
+
+.. note::
+
+   You may get a warning that this breakpoint cannot be resolved in lldb. gdb may give a similar warning and prompt you to make the breakpoint on a future library load, which you should say yes to. This should only happen on the very first invocation, as the module you wish to debug has not yet been loaded into memory.
+
+Now go ahead and execute your script:
+
+.. code-block:: sh
+
+   run debug_testing.py
+
+Code execution will halt at the breakpoint defined or at the occurrence of any segfault. LLDB's `GDB to LLDB command map `_ provides a listing of debugger commands that you can execute using either debugger.
+
+Another option to execute the entire test suite under lldb would be to run the following:
+
+.. code-block:: sh
+
+   lldb -- python -m pytest
+
+Or for gdb:
+
+.. code-block:: sh
+
+   gdb --args python -m pytest
+
+Once the process launches, simply type ``run`` and the test suite will begin, stopping at any segmentation fault that may occur.
+
+Checking memory leaks with valgrind
+===================================
+
+You can use `Valgrind `_ to check for and log memory leaks in extensions. For instance, to check for a memory leak in a test from the suite you can run:
+
+.. code-block:: sh
+
+   PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest
+
+Note that code execution under valgrind will take much longer than usual. While you can run valgrind against extensions compiled with any optimization level, it is suggested to turn optimizations off when compiling the extensions to reduce the number of false positives. The ``--with-debugging-symbols`` flag passed during package setup will do this for you automatically.
+
+.. note::
+
+   For best results, you should use a Python installation configured with Valgrind support (``--with-valgrind``).
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index fbd83af3de82e..d701208792a4c 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -71,11 +71,13 @@ descriptor format for these is as follows:

 .. code-block:: python

    index = pd.RangeIndex(0, 10, 2)
-   {'kind': 'range',
-    'name': index.name,
-    'start': index.start,
-    'stop': index.stop,
-    'step': index.step}
+   {
+       "kind": "range",
+       "name": index.name,
+       "start": index.start,
+       "stop": index.stop,
+       "step": index.step,
+   }

Other index types must be serialized as data columns along with the other
DataFrame columns.
The metadata for these is a string indicating the name of
@@ -182,4 +184,4 @@ As an example of fully-formed metadata:

         'creator': {
            'library': 'pyarrow',
            'version': '0.13.0'
-        }}
\ No newline at end of file
+        }}
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index 1e6b2c646fdfd..d4219296f5795 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. The class's

         @staticmethod
         def _validate(obj):
             # verify there is a column latitude and a column longitude
-            if 'latitude' not in obj.columns or 'longitude' not in obj.columns:
+            if "latitude" not in obj.columns or "longitude" not in obj.columns:
                 raise AttributeError("Must have 'latitude' and 'longitude'.")

         @property
@@ -50,8 +50,9 @@ decorate a class, providing the name of attribute to add. The class's

    Now users can access your methods using the ``geo`` namespace:

-   >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
-   ...                    'latitude': np.linspace(0, 20)})
+   >>> ds = pd.DataFrame(
+   ...     {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}
+   ... )
    >>> ds.geo.center
    (5.0, 10.0)
    >>> ds.geo.plot()
@@ -61,7 +62,7 @@ This can be a convenient way to extend pandas objects without subclassing them.
If you write a custom accessor, make a pull request adding it to our
:ref:`ecosystem` page.

-We highly recommend validating the data in your accessor's `__init__`.
+We highly recommend validating the data in your accessor's ``__init__``.
In our ``GeoAccessor``, we validate that the data contains the expected columns,
raising an ``AttributeError`` when the validation fails.
For a ``Series`` accessor, you should validate the ``dtype`` if the accessor
@@ -73,8 +74,6 @@ applies only to certain dtypes.

Extension types
---------------

-.. versionadded:: 0.23.0
-
 .. warning::

    The :class:`pandas.api.extensions.ExtensionDtype` and
    :class:`pandas.api.extensions.ExtensionArray` APIs are new and
@@ -178,6 +177,7 @@ your ``MyExtensionArray`` class, as follows:

    from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin

+
    class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin):
        pass

@@ -219,7 +219,7 @@ and re-boxes it if necessary.

If applicable, we highly recommend that you implement ``__array_ufunc__`` in your
extension array to avoid coercion to an ndarray. See
-`the numpy documentation `__
+`the NumPy documentation `__
for an example.

As part of your implementation, we require that you defer to pandas when a pandas
@@ -273,6 +273,7 @@ included as a column in a pandas DataFrame):

    def __arrow_array__(self, type=None):
        # convert the underlying array values to a pyarrow Array
        import pyarrow
+
        return pyarrow.array(..., type=type)

The ``ExtensionDtype.__from_arrow__`` method then controls the conversion
@@ -349,7 +350,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame

..
code-block:: python class SubclassedSeries(pd.Series): - @property def _constructor(self): return SubclassedSeries @@ -360,7 +360,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame class SubclassedDataFrame(pd.DataFrame): - @property def _constructor(self): return SubclassedDataFrame @@ -379,7 +378,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(to_framed) - >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 @@ -389,7 +388,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(df) - >>> sliced1 = df[['A', 'B']] + >>> sliced1 = df[["A", "B"]] >>> sliced1 A B 0 1 4 @@ -399,7 +398,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df["A"] >>> sliced2 0 1 1 2 @@ -424,11 +423,11 @@ Below is an example to define two original properties, "internal_cache" as a tem class SubclassedDataFrame2(pd.DataFrame): # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] _internal_names_set = set(_internal_names) # normal properties - _metadata = ['added_property'] + _metadata = ["added_property"] @property def _constructor(self): @@ -436,15 +435,15 @@ Below is an example to define two original properties, "internal_cache" as a tem .. code-block:: python - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 1 2 5 8 2 3 6 9 - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' + >>> df.internal_cache = "cached" + >>> df.added_property = "property" >>> df.internal_cache cached @@ -452,11 +451,11 @@ Below is an example to define two original properties, "internal_cache" as a tem property # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache + >>> df[["A", "B"]].internal_cache AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property + >>> df[["A", "B"]].added_property property .. _extending.plotting-backends: @@ -470,7 +469,7 @@ one based on Matplotlib. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: @@ -501,4 +500,4 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at -https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. \ No newline at end of file +https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. 
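As a rough sketch of what such a backend involves (the module and package names here are hypothetical, and only the required top-level ``plot`` hook is shown), a third-party package exposes a ``plot`` function and advertises it under the ``pandas_plotting_backends`` entry point group:

.. code-block:: python

    # my_backend/__init__.py -- a hypothetical backend module
    def plot(data, kind=None, **kwargs):
        # pandas hands over the Series/DataFrame and the requested plot kind;
        # the backend returns whatever its plotting library produces
        raise NotImplementedError(f"kind={kind!r} not implemented in this sketch")

.. code-block:: python

    # setup.py -- a hypothetical registration so pandas can discover the backend
    from setuptools import setup

    setup(
        name="my-backend",
        version="0.1",
        packages=["my_backend"],
        entry_points={"pandas_plotting_backends": ["my_backend = my_backend"]},
    )

Once installed, ``pd.set_option("plotting.backend", "my_backend")`` would route ``.plot`` calls to the sketched ``plot`` function.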
diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index f8a6bb6deb52d..abe2fc1409bfb 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -16,6 +16,8 @@ Development
    code_style
    maintaining
    internals
+   test_writing
+   debugging_extensions
    extending
    developer
    policies
diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst
index 8f1c3d5d818c2..cec385dd087db 100644
--- a/doc/source/development/internals.rst
+++ b/doc/source/development/internals.rst
@@ -68,8 +68,9 @@ integer **codes** (until version 0.24 named *labels*), and the level **names**:

 .. ipython:: python

-   index = pd.MultiIndex.from_product([range(3), ['one', 'two']],
-                                      names=['first', 'second'])
+   index = pd.MultiIndex.from_product(
+       [range(3), ["one", "two"]], names=["first", "second"]
+   )
    index
    index.levels
    index.codes
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index 9f9e9dc2631f3..2a21704c27005 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -132,17 +132,24 @@ respond or self-close their issue if it's determined that the behavior is not a
or the feature is out of scope. Sometimes reporters just go away though, and
we'll close the issue after the conversation has died.

+.. _maintaining.reviewing:
+
Reviewing pull requests
-----------------------

Anybody can review a pull request: regular contributors, triagers, or core-team
-members. Here are some guidelines to check.
+members. But only core-team members can merge pull requests when they're ready.
+
+Here are some things to check when reviewing a pull request.

-* Tests should be in a sensible location.
+* Tests should be in a sensible location: in the same file as closely related tests.
* New public APIs should be included somewhere in ``doc/source/reference/``.
* New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring.
* User-facing changes should have a whatsnew in the appropriate file.
* Regression tests should reference the original GitHub issue number like ``# GH-1234``.
+* The pull request should be labeled and assigned the appropriate milestone (the next patch release
+  for regression fixes and small bug fixes, the next minor milestone otherwise).
+* Changes should comply with our :ref:`policies.version`.

Cleaning up old issues
----------------------
@@ -189,5 +196,34 @@ being helpful on the issue tracker.

The current list of core-team members is at
https://github.com/pandas-dev/pandas-governance/blob/master/people.md

+
+.. _maintaining.merging:
+
+Merging pull requests
+---------------------
+
+Only core team members can merge pull requests. We have a few guidelines.
+
+1. You should typically not self-merge your own pull requests. Exceptions include
+   things like small changes to fix CI (e.g. pinning a package version).
+2. You should not merge pull requests that have an active discussion, or pull
+   requests that have any ``-1`` votes from a core maintainer. pandas operates
+   by consensus.
+3. For larger changes, it's good to have a +1 from at least two core team members.
+
+In addition to the items listed in :ref:`maintaining.closing`, you should verify
+that the pull request is assigned the correct milestone.
+
+Pull requests merged with a patch-release milestone will typically be backported
+by our bot. Verify that the bot noticed the merge (it will typically leave a
+comment within a minute).
If a manual backport is needed please do that, and remove +the "Needs backport" label once you've done it manually. If you forget to assign +a milestone before tagging, you can request the bot to backport it with: + +.. code-block:: console + + @Meeseeksdev backport + + .. _governance documents: https://github.com/pandas-dev/pandas-governance -.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization \ No newline at end of file +.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 1031bbfc46457..f8e6bda2085d8 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -16,7 +16,7 @@ deprecations, API compatibility, and version numbering. A pandas release number is made up of ``MAJOR.MINOR.PATCH``. -API breaking changes should only occur in **major** releases. Theses changes +API breaking changes should only occur in **major** releases. These changes will be documented, with clear guidance on what is changing, why it's changing, and how to migrate existing code to the new behavior. @@ -35,7 +35,7 @@ We will not introduce new deprecations in patch releases. Deprecations will only be enforced in **major** releases. For example, if a behavior is deprecated in pandas 1.2.0, it will continue to work, with a warning, for all releases in the 1.x series. The behavior will change and the -deprecation removed in the next next major release (2.0.0). +deprecation removed in the next major release (2.0.0). .. note:: @@ -52,6 +52,6 @@ Python support ~~~~~~~~~~~~~~ pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in -pandas **major** releases. +pandas **major** or **minor** releases. .. _SemVer: https://semver.org diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index d331491d02883..8223edcf6f63a 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -53,6 +53,32 @@ need to implement certain operations expected by pandas users (for example the algorithm used in, ``Series.str.upper``). That work may be done outside of pandas. +Consistent missing value handling +--------------------------------- + +Currently, pandas handles missing data differently for different data types. We +use different types to indicate that a value is missing (``np.nan`` for +floating-point data, ``np.nan`` or ``None`` for object-dtype data -- typically +strings or booleans -- with missing values, and ``pd.NaT`` for datetimelike +data). Integer data cannot store missing data or are cast to float. In addition, +pandas 1.0 introduced a new missing value sentinel, ``pd.NA``, which is being +used for the experimental nullable integer, boolean, and string data types. + +These different missing values have different behaviors in user-facing +operations. Specifically, we introduced different semantics for the nullable +data types for certain operations (e.g. propagating in comparison operations +instead of comparing as False). + +Long term, we want to introduce consistent missing data handling for all data +types. This includes consistent behavior in all operations (indexing, arithmetic +operations, comparisons, etc.). We want to eventually make the new semantics the +default. 
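As an illustrative sketch of the differing semantics (not itself part of the
roadmap), compare a float column using ``np.nan`` with a nullable-integer
column using ``pd.NA``:

.. code-block:: python

    import numpy as np
    import pandas as pd

    # np.nan: a comparison with a missing value evaluates to False
    pd.Series([1.0, np.nan]) == 1.0
    # 0     True
    # 1    False
    # dtype: bool

    # pd.NA: the missing value propagates through the comparison
    pd.Series([1, None], dtype="Int64") == 1
    # 0    True
    # 1    <NA>
    # dtype: boolean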
+ +This has been discussed at +`github #28095 `__ (and +linked issues), and described in more detail in this +`design doc `__. + Apache Arrow interoperability ----------------------------- @@ -115,20 +141,6 @@ ways for users to apply their own Numba-jitted functions where pandas accepts us and in groupby and window contexts). This will improve the performance of user-defined-functions in these operations by staying within compiled code. - -Documentation improvements --------------------------- - -We'd like to improve the content, structure, and presentation of the pandas documentation. -Some specific goals include - -* Overhaul the HTML theme with a modern, responsive design (:issue:`15556`) -* Improve the "Getting Started" documentation, designing and writing learning paths - for users different backgrounds (e.g. brand new to programming, familiar with - other languages like R, already familiar with Python). -* Improve the overall organization of the documentation and specific subsections - of the documentation to make navigation and finding content easier. - Performance monitoring ---------------------- @@ -177,3 +189,20 @@ should be notified of the proposal. When there's agreement that an implementation would be welcome, the roadmap should be updated to include the summary and a link to the discussion issue. + +Completed items +--------------- + +This section records now completed items from the pandas roadmap. + +Documentation improvements +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We improved the pandas documentation + +* The pandas community worked with others to build the `pydata-sphinx-theme`_, + which is now used for https://pandas.pydata.org/docs/ (:issue:`15556`). +* :ref:`getting_started` contains a number of resources intended for new + pandas users coming from a variety of backgrounds (:issue:`26831`). + +.. _pydata-sphinx-theme: https://github.com/pandas-dev/pydata-sphinx-theme diff --git a/doc/source/development/test_writing.rst b/doc/source/development/test_writing.rst new file mode 100644 index 0000000000000..d9e24bb76eed8 --- /dev/null +++ b/doc/source/development/test_writing.rst @@ -0,0 +1,174 @@ +.. _test_organization: + +Test organization +================= +Ideally, there should be one, and only one, obvious place for a test to reside. +Until we reach that ideal, these are some rules of thumb for where a test should +be located. + +1. Does your test depend only on code in ``pd._libs.tslibs``? + This test likely belongs in one of: + + - tests.tslibs + + .. note:: + + No file in ``tests.tslibs`` should import from any pandas modules + outside of ``pd._libs.tslibs`` + + - tests.scalar + - tests.tseries.offsets + +2. Does your test depend only on code in pd._libs? + This test likely belongs in one of: + + - tests.libs + - tests.groupby.test_libgroupby + +3. Is your test for an arithmetic or comparison method? + This test likely belongs in one of: + + - tests.arithmetic + + .. note:: + + These are intended for tests that can be shared to test the behavior + of DataFrame/Series/Index/ExtensionArray using the ``box_with_array`` + fixture. + + - tests.frame.test_arithmetic + - tests.series.test_arithmetic + +4. Is your test for a reduction method (min, max, sum, prod, ...)? + This test likely belongs in one of: + + - tests.reductions + + .. note:: + + These are intended for tests that can be shared to test the behavior + of DataFrame/Series/Index/ExtensionArray. + + - tests.frame.test_reductions + - tests.series.test_reductions + - tests.test_nanops + +5. 
Is your test for an indexing method?
+   This is the most difficult case for deciding where a test belongs, because
+   there are many of these tests, and many of them test more than one method
+   (e.g. both ``Series.__getitem__`` and ``Series.loc.__getitem__``)
+
+   A) Is the test specifically testing an Index method (e.g. ``Index.get_loc``,
+      ``Index.get_indexer``)?
+      This test likely belongs in one of:
+
+      - tests.indexes.test_indexing
+      - tests.indexes.fooindex.test_indexing
+
+      Within those files there should be a method-specific test class, e.g.
+      ``TestGetLoc``.
+
+      In most cases, neither ``Series`` nor ``DataFrame`` objects should be
+      needed in these tests.
+
+   B) Is the test for a Series or DataFrame indexing method *other* than
+      ``__getitem__`` or ``__setitem__``, e.g. ``xs``, ``where``, ``take``,
+      ``mask``, ``lookup``, or ``insert``?
+      This test likely belongs in one of:
+
+      - tests.frame.indexing.test_methodname
+      - tests.series.indexing.test_methodname
+
+   C) Is the test for any of ``loc``, ``iloc``, ``at``, or ``iat``?
+      This test likely belongs in one of:
+
+      - tests.indexing.test_loc
+      - tests.indexing.test_iloc
+      - tests.indexing.test_at
+      - tests.indexing.test_iat
+
+      Within the appropriate file, test classes correspond to either types of
+      indexers (e.g. ``TestLocBooleanMask``) or major use cases
+      (e.g. ``TestLocSetitemWithExpansion``).
+
+      See the note in section D) about tests that test multiple indexing methods.
+
+   D) Is the test for ``Series.__getitem__``, ``Series.__setitem__``,
+      ``DataFrame.__getitem__``, or ``DataFrame.__setitem__``?
+      This test likely belongs in one of:
+
+      - tests.series.test_getitem
+      - tests.series.test_setitem
+      - tests.frame.test_getitem
+      - tests.frame.test_setitem
+
+      In many cases such a test may test multiple similar methods, e.g.
+
+      .. code-block:: python
+
+         import pandas as pd
+         import pandas._testing as tm
+
+         def test_getitem_listlike_of_ints():
+             ser = pd.Series(range(5))
+
+             result = ser[[3, 4]]
+             expected = pd.Series([3, 4], index=[3, 4])
+             tm.assert_series_equal(result, expected)
+
+             result = ser.loc[[3, 4]]
+             tm.assert_series_equal(result, expected)
+
+      In cases like this, the test location should be based on the *underlying*
+      method being tested, or, in the case of a test for a bugfix, the location
+      of the actual bug. So in this example, we know that ``Series.__getitem__``
+      calls ``Series.loc.__getitem__``, so this is *really* a test for
+      ``loc.__getitem__``, and it belongs in ``tests.indexing.test_loc``.
+
+6. Is your test for a DataFrame or Series method?
+
+   A) Is the method a plotting method?
+      This test likely belongs in one of:
+
+      - tests.plotting
+
+   B) Is the method an IO method?
+      This test likely belongs in one of:
+
+      - tests.io
+
+   C) Otherwise
+      This test likely belongs in one of:
+
+      - tests.series.methods.test_mymethod
+      - tests.frame.methods.test_mymethod
+
+      .. note::
+
+         If a test can be shared between DataFrame/Series using the
+         ``frame_or_series`` fixture, by convention it goes in the
+         ``tests.frame`` file.
+
+      - tests.generic.methods.test_mymethod
+
+      .. note::
+
+         The generic/methods/ directory is only for methods with tests
+         that are fully parametrized over Series/DataFrame.
+
+7. Is your test for an Index method, not depending on Series/DataFrame?
+ This test likely belongs in one of: + + - tests.indexes + +8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``, + ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``, + ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? + This test likely belongs in one of: + + - tests.arrays + +9) Is your test for *all* ExtensionArray subclasses (the "EA Interface")? + This test likely belongs in one of: + + - tests.extension diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index b02d4abd3ddf8..e88875a9f679c 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -80,6 +80,11 @@ ML pipeline. Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. +`Compose `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. + .. _ecosystem.visualization: Visualization @@ -93,7 +98,7 @@ With Altair, you can spend more time understanding your data and its meaning. Altair's API is simple, friendly and consistent and built on top of the powerful Vega-Lite JSON specification. This elegant simplicity produces beautiful and effective visualizations with a -minimal amount of code. Altair works with Pandas DataFrames. +minimal amount of code. Altair works with pandas DataFrames. `Bokeh `__ @@ -105,7 +110,7 @@ graphics in the style of Protovis/D3, while delivering high-performance interact large data to thin clients. `Pandas-Bokeh `__ provides a high level API -for Bokeh that can be loaded as a native Pandas plotting backend via +for Bokeh that can be loaded as a native pandas plotting backend via .. code:: python @@ -165,12 +170,24 @@ invoked with the following command .. code:: python - import dtale; dtale.show(df) + import dtale + + dtale.show(df) -D-Tale integrates seamlessly with jupyter notebooks, python terminals, kaggle +D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle & Google Colab. Here are some demos of the `grid `__ and `chart-builder `__. +`hvplot `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews `__. +It can be loaded as a native pandas plotting backend via + +.. code:: python + + pd.set_option("plotting.backend", "hvplot") + .. _ecosystem.ide: IDE @@ -180,7 +197,7 @@ IDE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IPython is an interactive command shell and distributed computing -environment. IPython tab completion works with Pandas methods and also +environment. IPython tab completion works with pandas methods and also attributes like DataFrame columns. `Jupyter Notebook / Jupyter Lab `__ @@ -194,7 +211,7 @@ Jupyter notebooks can be converted to a number of open standard output formats Python) through 'Download As' in the web interface and ``jupyter convert`` in a shell. 
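pandas objects plug into this notebook machinery through Jupyter's rich-display protocol, described next. As a minimal sketch of that protocol (the class here is hypothetical, not part of pandas):

.. code-block:: python

    class Angle:
        """Toy object opting into Jupyter's rich HTML display."""

        def __init__(self, degrees):
            self.degrees = degrees

        def _repr_html_(self):
            # when defined, Jupyter calls this to render the object as HTML
            return f"<b>{self.degrees}&deg;</b>"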
-Pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods +pandas DataFrames implement ``_repr_html_`` and ``_repr_latex`` methods which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or may not be @@ -222,8 +239,8 @@ Its `Variable Explorer `__ allows users to view, manipulate and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. -Pandas objects can also be renamed, duplicated, new columns added, -copyed/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. +pandas objects can also be renamed, duplicated, new columns added, +copied/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. @@ -269,13 +286,13 @@ The following data feeds are available: `Quandl/Python `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Quandl API for Python wraps the Quandl REST API to return -Pandas DataFrames with timeseries indexes. +pandas DataFrames with timeseries indexes. `Pydatastream `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyDatastream is a Python interface to the `Refinitiv Datastream (DWS) `__ -REST API to return indexed Pandas DataFrames with financial data. +REST API to return indexed pandas DataFrames with financial data. This package requires valid credentials for this API (non free). `pandaSDMX `__ @@ -298,6 +315,13 @@ HTTP API, and also provides several convenient methods for parsing and analyzing fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that you can obtain for free on the FRED website. +`dataframe_sql `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``dataframe_sql`` is a Python package that translates SQL syntax directly into +operations on pandas DataFrames. This is useful when migrating from a database to +using pandas or for users more comfortable with SQL looking for a way to interface +with pandas. + .. _ecosystem.domain: @@ -343,7 +367,7 @@ Out-of-core ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Blaze provides a standard API for doing computations with various -in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables, +in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables, PySpark. `Dask `__ @@ -362,6 +386,23 @@ Dask-ML enables parallel and distributed machine learning using Dask alongside e Koalas provides a familiar pandas DataFrame interface on top of Apache Spark. It enables users to leverage multi-cores on one machine or a cluster of machines to speed up or scale their DataFrame code. +`Modin `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``modin.pandas`` DataFrame is a parallel and distributed drop-in replacement +for pandas. This means that you can use Modin with existing pandas code or write +new code with the existing pandas API. Modin can leverage your entire machine or +cluster to speed up and scale your pandas workloads, including traditionally +time-consuming tasks like ingesting data (``read_csv``, ``read_excel``, +``read_parquet``, etc.). + +.. code:: python + + # import pandas as pd + import modin.pandas as pd + + df = pd.read_csv("big.csv") # use all your cores! 
+ `Odo `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -386,21 +427,11 @@ If also displays progress bars. # df.apply(func) df.parallel_apply(func) -`Ray `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Pandas on Ray is an early stage DataFrame library that wraps Pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous Pandas notebooks while experiencing a considerable speedup from Pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use Pandas on Ray just like you would Pandas. - -.. code:: python - - # import pandas as pd - import ray.dataframe as pd - `Vaex `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to Pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). +Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a Python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). * vaex.from_pandas * vaex.to_pandas_df @@ -410,7 +441,7 @@ Increasingly, packages are being built on top of pandas to address specific need Extension data types -------------------- -Pandas provides an interface for defining +pandas provides an interface for defining :ref:`extension types ` to extend NumPy's type system. The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. @@ -421,10 +452,15 @@ found in NumPy or pandas, which work well with pandas' data containers. Cyberpandas provides an extension type for storing arrays of IP Addresses. These arrays can be stored inside pandas' Series and DataFrame. +`Pandas-Genomics`_ +~~~~~~~~~~~~~~~~~~ + +Pandas-Genomics provides extension types and extension arrays for working with genomics data + `Pint-Pandas`_ ~~~~~~~~~~~~~~ -`Pint-Pandas ` provides an extension type for +``Pint-Pandas `` provides an extension type for storing numeric arrays with units. These arrays can be stored inside pandas' Series and DataFrame. Operations between Series and DataFrame columns which use pint's extension array are then units aware. @@ -445,11 +481,14 @@ Library Accessor Classes Description `pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. 
`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. `pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. +`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. =============== ========== ========================= =============================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest .. _pdvega: https://altair-viz.github.io/pdvega/ .. _Altair: https://altair-viz.github.io/ +.. _pandas-genomics: https://pandas-genomics.readthedocs.io/en/latest/ .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas +.. _composeml: https://github.com/FeatureLabs/compose diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index e1a4cfe49b7d1..864081002086b 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -5,11 +5,11 @@ Comparison with R / R libraries ******************************* -Since ``pandas`` aims to provide a lot of the data manipulation and analysis +Since pandas aims to provide a lot of the data manipulation and analysis functionality that people use `R `__ for, this page was started to provide a more detailed look at the `R language `__ and its many third -party libraries as they relate to ``pandas``. In comparisons with R and CRAN +party libraries as they relate to pandas. In comparisons with R and CRAN libraries, we care about the following things: * **Functionality / flexibility**: what can/cannot be done with each tool @@ -21,7 +21,7 @@ libraries, we care about the following things: This page is also here to offer a bit of a translation guide for users of these R packages. -For transfer of ``DataFrame`` objects from ``pandas`` to R, one option is to +For transfer of ``DataFrame`` objects from pandas to R, one option is to use HDF5 files, see :ref:`io.external_compatibility` for an example. @@ -118,20 +118,20 @@ or by integer location df <- data.frame(matrix(rnorm(1000), ncol=100)) df[, c(1:10, 25:30, 40, 50:100)] -Selecting multiple columns by name in ``pandas`` is straightforward +Selecting multiple columns by name in pandas is straightforward .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) - df[['a', 'c']] - df.loc[:, ['a', 'c']] + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) + df[["a", "c"]] + df.loc[:, ["a", "c"]] Selecting multiple noncontiguous columns by integer location can be achieved with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. .. ipython:: python - named = list('abcdefg') + named = list("abcdefg") n = 30 columns = named + np.arange(len(named), n).tolist() df = pd.DataFrame(np.random.randn(n, n), columns=columns) @@ -160,14 +160,29 @@ function. .. 
ipython:: python df = pd.DataFrame( - {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, - np.nan]}) + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) - g = df.groupby(['by1', 'by2']) - g[['v1', 'v2']].mean() + g = df.groupby(["by1", "by2"]) + g[["v1", "v2"]].mean() For more details and examples see :ref:`the groupby documentation `. @@ -220,7 +235,7 @@ since the subclass sizes are possibly irregular. Using a data.frame called tapply(baseball$batting.average, baseball.example$team, max) -In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: +In pandas we may use :meth:`~pandas.pivot_table` method to handle this: .. ipython:: python @@ -228,11 +243,14 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import string baseball = pd.DataFrame( - {'team': ["team %d" % (x + 1) for x in range(5)] * 5, - 'player': random.sample(list(string.ascii_lowercase), 25), - 'batting avg': np.random.uniform(.200, .400, 25)}) + { + "team": ["team %d" % (x + 1) for x in range(5)] * 5, + "player": random.sample(list(string.ascii_lowercase), 25), + "batting avg": np.random.uniform(0.200, 0.400, 25), + } + ) - baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) + baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max) For more details and examples see :ref:`the reshaping documentation `. @@ -250,16 +268,16 @@ column's values are less than another column's values: subset(df, a <= b) df[df$a <= df$b,] # note the comma -In ``pandas``, there are a few ways to perform subsetting. You can use +In pandas, there are a few ways to perform subsetting. You can use :meth:`~pandas.DataFrame.query` or pass an expression as if it were an index/slice as well as standard boolean indexing: .. ipython:: python - df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) - df.query('a <= b') - df[df['a'] <= df['b']] - df.loc[df['a'] <= df['b']] + df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.query("a <= b") + df[df["a"] <= df["b"]] + df.loc[df["a"] <= df["b"]] For more details and examples see :ref:`the query documentation `. @@ -277,14 +295,14 @@ An expression using a data.frame called ``df`` in R with the columns ``a`` and with(df, a + b) df$a + df$b # same as the previous expression -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.eval` method, would be: .. ipython:: python - df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) - df.eval('a + b') - df['a'] + df['b'] # same as the previous expression + df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval("a + b") + df["a"] + df["b"] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. 
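One way to gauge this on your own data (an illustrative sketch; the speedup
depends on the frame size and on whether ``numexpr`` is installed) is to time
both forms in IPython:

.. code-block:: ipython

    In [1]: df = pd.DataFrame({"a": np.random.randn(10 ** 6), "b": np.random.randn(10 ** 6)})

    In [2]: %timeit df.eval("a + b")

    In [3]: %timeit df["a"] + df["b"]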
For more details and examples see :ref:`the eval @@ -329,19 +347,23 @@ summarize ``x`` by ``month``: mean = round(mean(x), 2), sd = round(sd(x), 2)) -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.groupby` method, would be: .. ipython:: python - df = pd.DataFrame({'x': np.random.uniform(1., 168., 120), - 'y': np.random.uniform(7., 334., 120), - 'z': np.random.uniform(1.7, 20.7, 120), - 'month': [5, 6, 7, 8] * 30, - 'week': np.random.randint(1, 4, 120)}) + df = pd.DataFrame( + { + "x": np.random.uniform(1.0, 168.0, 120), + "y": np.random.uniform(7.0, 334.0, 120), + "z": np.random.uniform(1.7, 20.7, 120), + "month": [5, 6, 7, 8] * 30, + "week": np.random.randint(1, 4, 120), + } + ) - grouped = df.groupby(['month', 'week']) - grouped['x'].agg([np.mean, np.std]) + grouped = df.groupby(["month", "week"]) + grouped["x"].agg([np.mean, np.std]) For more details and examples see :ref:`the groupby documentation @@ -410,13 +432,17 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) - pd.melt(cheese, id_vars=['first', 'last']) - cheese.set_index(['first', 'last']).stack() # alternative way + pd.melt(cheese, id_vars=["first", "last"]) + cheese.set_index(["first", "last"]).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. @@ -444,15 +470,24 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({'x': np.random.uniform(1., 168., 12), - 'y': np.random.uniform(7., 334., 12), - 'z': np.random.uniform(1.7, 20.7, 12), - 'month': [5, 6, 7] * 4, - 'week': [1, 2] * 6}) + df = pd.DataFrame( + { + "x": np.random.uniform(1.0, 168.0, 12), + "y": np.random.uniform(7.0, 334.0, 12), + "z": np.random.uniform(1.7, 20.7, 12), + "month": [5, 6, 7] * 4, + "week": [1, 2] * 6, + } + ) - mdf = pd.melt(df, id_vars=['month', 'week']) - pd.pivot_table(mdf, values='value', index=['variable', 'week'], - columns=['month'], aggfunc=np.mean) + mdf = pd.melt(df, id_vars=["month", "week"]) + pd.pivot_table( + mdf, + values="value", + index=["variable", "week"], + columns=["month"], + aggfunc=np.mean, + ) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to aggregate information based on ``Animal`` and ``FeedType``: @@ -475,21 +510,29 @@ using :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({ - 'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', - 'Animal2', 'Animal3'], - 'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'], - 'Amount': [10, 7, 4, 2, 5, 6, 2], - }) + df = pd.DataFrame( + { + "Animal": [ + "Animal1", + "Animal2", + "Animal3", + "Animal2", + "Animal1", + "Animal2", + "Animal3", + ], + "FeedType": ["A", "B", "A", "A", "B", "B", "A"], + "Amount": [10, 7, 4, 2, 5, 6, 2], + } + ) - df.pivot_table(values='Amount', index='Animal', columns='FeedType', - aggfunc='sum') + df.pivot_table(values="Amount", index="Animal", columns="FeedType", aggfunc="sum") The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: .. 
ipython:: python - df.groupby(['Animal', 'FeedType'])['Amount'].sum() + df.groupby(["Animal", "FeedType"])["Amount"].sum() For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 85c6ea2c31969..ae9f1caebd556 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -106,7 +106,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) + df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) df @@ -130,8 +130,10 @@ The pandas method is :func:`read_csv`, which works similarly. .. ipython:: python - url = ('https://raw.github.com/pandas-dev/' - 'pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.github.com/pandas-dev/" + "pandas/master/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) tips.head() @@ -142,10 +144,10 @@ and did not have column names, the pandas command would be: .. code-block:: python - tips = pd.read_csv('tips.csv', sep='\t', header=None) + tips = pd.read_csv("tips.csv", sep="\t", header=None) # alternatively, read_table is an alias to read_csv with tab delimiter - tips = pd.read_table('tips.csv', header=None) + tips = pd.read_table("tips.csv", header=None) In addition to text/csv, pandas supports a variety of other data formats such as Excel, HDF5, and SQL databases. These are all read via a ``pd.read_*`` @@ -166,7 +168,7 @@ and other data formats follow a similar api. .. code-block:: python - tips.to_csv('tips2.csv') + tips.to_csv("tips2.csv") Data operations @@ -192,14 +194,14 @@ New columns can be assigned in the same way. .. ipython:: python - tips['total_bill'] = tips['total_bill'] - 2 - tips['new_bill'] = tips['total_bill'] / 2.0 + tips["total_bill"] = tips["total_bill"] - 2 + tips["new_bill"] = tips["total_bill"] / 2.0 tips.head() .. ipython:: python :suppress: - tips = tips.drop('new_bill', axis=1) + tips = tips.drop("new_bill", axis=1) Filtering ~~~~~~~~~ @@ -226,7 +228,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin .. ipython:: python - tips[tips['total_bill'] > 10].head() + tips[tips["total_bill"] > 10].head() If/then logic ~~~~~~~~~~~~~ @@ -248,13 +250,13 @@ the ``where`` method from ``numpy``. .. ipython:: python - tips['bucket'] = np.where(tips['total_bill'] < 10, 'low', 'high') + tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") tips.head() .. ipython:: python :suppress: - tips = tips.drop('bucket', axis=1) + tips = tips.drop("bucket", axis=1) Date functionality ~~~~~~~~~~~~~~~~~~ @@ -284,22 +286,26 @@ see the :ref:`timeseries documentation` for more details. .. 
ipython:: python - tips['date1'] = pd.Timestamp('2013-01-15') - tips['date2'] = pd.Timestamp('2015-02-15') - tips['date1_year'] = tips['date1'].dt.year - tips['date2_month'] = tips['date2'].dt.month - tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = ( - tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M')) + tips["date1"] = pd.Timestamp("2013-01-15") + tips["date2"] = pd.Timestamp("2015-02-15") + tips["date1_year"] = tips["date1"].dt.year + tips["date2_month"] = tips["date2"].dt.month + tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() + tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ + "date1" + ].dt.to_period("M") - tips[['date1', 'date2', 'date1_year', 'date2_month', - 'date1_next', 'months_between']].head() + tips[ + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] + ].head() .. ipython:: python :suppress: - tips = tips.drop(['date1', 'date2', 'date1_year', - 'date2_month', 'date1_next', 'months_between'], axis=1) + tips = tips.drop( + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], + axis=1, + ) Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -329,13 +335,13 @@ The same operations are expressed in pandas below. .. ipython:: python # keep - tips[['sex', 'total_bill', 'tip']].head() + tips[["sex", "total_bill", "tip"]].head() # drop - tips.drop('sex', axis=1).head() + tips.drop("sex", axis=1).head() # rename - tips.rename(columns={'total_bill': 'total_bill_2'}).head() + tips.rename(columns={"total_bill": "total_bill_2"}).head() Sorting by values @@ -354,7 +360,7 @@ takes a list of columns to sort by. .. ipython:: python - tips = tips.sort_values(['sex', 'total_bill']) + tips = tips.sort_values(["sex", "total_bill"]) tips.head() @@ -383,8 +389,8 @@ trailing blanks. .. ipython:: python - tips['time'].str.len().head() - tips['time'].str.rstrip().str.len().head() + tips["time"].str.len().head() + tips["time"].str.rstrip().str.len().head() Find @@ -410,7 +416,7 @@ the function will return -1 if it fails to find the substring. .. ipython:: python - tips['sex'].str.find("ale").head() + tips["sex"].str.find("ale").head() Substring @@ -432,7 +438,7 @@ indexes are zero-based. .. ipython:: python - tips['sex'].str[0:1].head() + tips["sex"].str[0:1].head() Scan @@ -460,9 +466,9 @@ approaches, but this just shows a simple approach. .. ipython:: python - firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) - firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[0] + firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) + firstlast["First_Name"] = firstlast["String"].str.split(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[0] firstlast @@ -491,10 +497,10 @@ The equivalent Python functions are ``upper``, ``lower``, and ``title``. .. 
ipython:: python - firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) - firstlast['string_up'] = firstlast['String'].str.upper() - firstlast['string_low'] = firstlast['String'].str.lower() - firstlast['string_prop'] = firstlast['String'].str.title() + firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) + firstlast["string_up"] = firstlast["String"].str.upper() + firstlast["string_low"] = firstlast["String"].str.lower() + firstlast["string_prop"] = firstlast["String"].str.title() firstlast Merging @@ -504,11 +510,9 @@ The following tables will be used in the merge examples .. ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) df1 - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) df2 In SAS, data must be explicitly sorted before merging. Different @@ -542,16 +546,16 @@ types are accomplished via the ``how`` keyword. .. ipython:: python - inner_join = df1.merge(df2, on=['key'], how='inner') + inner_join = df1.merge(df2, on=["key"], how="inner") inner_join - left_join = df1.merge(df2, on=['key'], how='left') + left_join = df1.merge(df2, on=["key"], how="left") left_join - right_join = df1.merge(df2, on=['key'], how='right') + right_join = df1.merge(df2, on=["key"], how="right") right_join - outer_join = df1.merge(df2, on=['key'], how='outer') + outer_join = df1.merge(df2, on=["key"], how="outer") outer_join @@ -566,8 +570,8 @@ operations, and is ignored by default for aggregations. .. ipython:: python outer_join - outer_join['value_x'] + outer_join['value_y'] - outer_join['value_x'].sum() + outer_join["value_x"] + outer_join["value_y"] + outer_join["value_x"].sum() One difference is that missing data cannot be compared to its sentinel value. For example, in SAS you could do this to filter missing values. @@ -589,8 +593,8 @@ should be used for comparisons. .. ipython:: python - outer_join[pd.isna(outer_join['value_x'])] - outer_join[pd.notna(outer_join['value_x'])] + outer_join[pd.isna(outer_join["value_x"])] + outer_join[pd.notna(outer_join["value_x"])] pandas also provides a variety of methods to work with missing data - some of which would be challenging to express in SAS. For example, there are methods to @@ -601,8 +605,8 @@ value, like the mean, or forward filling from previous rows. See the .. ipython:: python outer_join.dropna() - outer_join.fillna(method='ffill') - outer_join['value_x'].fillna(outer_join['value_x'].mean()) + outer_join.fillna(method="ffill") + outer_join["value_x"].fillna(outer_join["value_x"].mean()) GroupBy @@ -629,7 +633,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() + tips_summed = tips.groupby(["sex", "smoker"])[["total_bill", "tip"]].sum() tips_summed.head() @@ -666,8 +670,8 @@ operation. .. ipython:: python - gb = tips.groupby('smoker')['total_bill'] - tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') + gb = tips.groupby("smoker")["total_bill"] + tips["adj_total_bill"] = tips["total_bill"] - gb.transform("mean") tips.head() @@ -695,7 +699,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex', 'smoker']).first() + tips.groupby(["sex", "smoker"]).first() Other considerations @@ -729,16 +733,16 @@ the XPORT or SAS7BDAT binary format. .. 
code-block:: python - df = pd.read_sas('transport-file.xpt') - df = pd.read_sas('binary-file.sas7bdat') + df = pd.read_sas("transport-file.xpt") + df = pd.read_sas("binary-file.sas7bdat") You can also specify the file format directly. By default, pandas will try to infer the file format based on its extension. .. code-block:: python - df = pd.read_sas('transport-file.xpt', format='xport') - df = pd.read_sas('binary-file.sas7bdat', format='sas7bdat') + df = pd.read_sas("transport-file.xpt", format="xport") + df = pd.read_sas("binary-file.sas7bdat", format="sas7bdat") XPORT is a relatively limited format and the parsing of it is not as optimized as some of the other pandas readers. An alternative way @@ -752,4 +756,4 @@ to interop data between SAS and pandas is to serialize to csv. Wall time: 14.6 s In [9]: %time df = pd.read_csv('big.csv') - Wall time: 4.86 s \ No newline at end of file + Wall time: 4.86 s diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index aa7218c3e4fad..6848d8df2e46b 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -19,13 +19,15 @@ As is customary, we import pandas and NumPy as follows: import numpy as np Most of the examples will utilize the ``tips`` dataset found within pandas tests. We'll read -the data into a DataFrame called `tips` and assume we have a database table of the same name and +the data into a DataFrame called ``tips`` and assume we have a database table of the same name and structure. .. ipython:: python - url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.github.com/pandas-dev" + "/pandas/master/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) tips.head() @@ -44,7 +46,7 @@ With pandas, column selection is done by passing a list of column names to your .. ipython:: python - tips[['total_bill', 'tip', 'smoker', 'time']].head(5) + tips[["total_bill", "tip", "smoker", "time"]].head(5) Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). @@ -61,7 +63,7 @@ With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to a .. ipython:: python - tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + tips.assign(tip_rate=tips["tip"] / tips["total_bill"]).head(5) WHERE ----- @@ -79,14 +81,14 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin .. ipython:: python - tips[tips['time'] == 'Dinner'].head(5) + tips[tips["time"] == "Dinner"].head(5) The above statement is simply passing a ``Series`` of True/False objects to the DataFrame, returning all rows with True. .. ipython:: python - is_dinner = tips['time'] == 'Dinner' + is_dinner = tips["time"] == "Dinner" is_dinner.value_counts() tips[is_dinner].head(5) @@ -103,7 +105,7 @@ Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame usi .. ipython:: python # tips of more than $5.00 at Dinner meals - tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] + tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)] .. code-block:: sql @@ -115,15 +117,16 @@ Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame usi .. 
ipython:: python # tips by parties of at least 5 diners OR bill total was more than $45 - tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)] + tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)] NULL checking is done using the :meth:`~pandas.Series.notna` and :meth:`~pandas.Series.isna` methods. .. ipython:: python - frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'], - 'col2': ['F', np.NaN, 'G', 'H', 'I']}) + frame = pd.DataFrame( + {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]} + ) frame Assume we have a table of the same structure as our DataFrame above. We can see only the records @@ -137,7 +140,7 @@ where ``col2`` IS NULL with the following query: .. ipython:: python - frame[frame['col2'].isna()] + frame[frame["col2"].isna()] Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series.notna`. @@ -149,7 +152,7 @@ Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series. .. ipython:: python - frame[frame['col1'].notna()] + frame[frame["col1"].notna()] GROUP BY @@ -177,7 +180,7 @@ The pandas equivalent would be: .. ipython:: python - tips.groupby('sex').size() + tips.groupby("sex").size() Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not :meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because @@ -186,14 +189,14 @@ the number of ``not null`` records within each. .. ipython:: python - tips.groupby('sex').count() + tips.groupby("sex").count() Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method to an individual column: .. ipython:: python - tips.groupby('sex')['total_bill'].count() + tips.groupby("sex")["total_bill"].count() Multiple functions can also be applied at once. For instance, say we'd like to see how tip amount differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary @@ -213,7 +216,7 @@ to your grouped DataFrame, indicating which functions to apply to specific colum .. ipython:: python - tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) + tips.groupby("day").agg({"tip": np.mean, "day": np.size}) Grouping by more than one column is done by passing a list of columns to the :meth:`~pandas.DataFrame.groupby` method. @@ -237,7 +240,7 @@ Grouping by more than one column is done by passing a list of columns to the .. ipython:: python - tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) + tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]}) .. _compare_with_sql.join: @@ -250,10 +253,8 @@ columns to join on (column names or indices). .. ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) Assume we have two database tables of the same name and structure as our DataFrames. @@ -271,15 +272,15 @@ INNER JOIN .. ipython:: python # merge performs an INNER JOIN by default - pd.merge(df1, df2, on='key') + pd.merge(df1, df2, on="key") :meth:`~pandas.merge` also offers parameters for cases when you'd like to join one DataFrame's column with another DataFrame's index. .. 
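SQL's HAVING clause, which filters on a per-group property, has no single pandas keyword; one idiomatic translation is ``groupby(...).filter``. A minimal sketch (the threshold of 60 rows is an arbitrary illustration):

.. code-block:: python

    # Keep rows whose "day" group has more than 60 rows,
    # akin to filtering groups with HAVING COUNT(*) > 60
    tips.groupby("day").filter(lambda group: len(group) > 60)

Joining a column against an index looks like this:

..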
ipython:: python - indexed_df2 = df2.set_index('key') - pd.merge(df1, indexed_df2, left_on='key', right_index=True) + indexed_df2 = df2.set_index("key") + pd.merge(df1, indexed_df2, left_on="key", right_index=True) LEFT OUTER JOIN ~~~~~~~~~~~~~~~ @@ -294,7 +295,7 @@ LEFT OUTER JOIN .. ipython:: python # show all records from df1 - pd.merge(df1, df2, on='key', how='left') + pd.merge(df1, df2, on="key", how="left") RIGHT JOIN ~~~~~~~~~~ @@ -309,7 +310,7 @@ RIGHT JOIN .. ipython:: python # show all records from df2 - pd.merge(df1, df2, on='key', how='right') + pd.merge(df1, df2, on="key", how="right") FULL JOIN ~~~~~~~~~ @@ -327,7 +328,7 @@ joined columns find a match. As of writing, FULL JOINs are not supported in all .. ipython:: python # show all records from both frames - pd.merge(df1, df2, on='key', how='outer') + pd.merge(df1, df2, on="key", how="outer") UNION @@ -336,10 +337,12 @@ UNION ALL can be performed using :meth:`~pandas.concat`. .. ipython:: python - df1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], - 'rank': range(1, 4)}) - df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], - 'rank': [1, 4, 5]}) + df1 = pd.DataFrame( + {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)} + ) + df2 = pd.DataFrame( + {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]} + ) .. code-block:: sql @@ -403,7 +406,7 @@ Top n rows with offset .. ipython:: python - tips.nlargest(10 + 5, columns='tip').tail(10) + tips.nlargest(10 + 5, columns="tip").tail(10) Top n rows per group ~~~~~~~~~~~~~~~~~~~~ @@ -423,20 +426,30 @@ Top n rows per group .. ipython:: python - (tips.assign(rn=tips.sort_values(['total_bill'], ascending=False) - .groupby(['day']) - .cumcount() + 1) - .query('rn < 3') - .sort_values(['day', 'rn'])) + ( + tips.assign( + rn=tips.sort_values(["total_bill"], ascending=False) + .groupby(["day"]) + .cumcount() + + 1 + ) + .query("rn < 3") + .sort_values(["day", "rn"]) + ) -the same using `rank(method='first')` function +the same using ``rank(method='first')`` function .. ipython:: python - (tips.assign(rnk=tips.groupby(['day'])['total_bill'] - .rank(method='first', ascending=False)) - .query('rnk < 3') - .sort_values(['day', 'rnk'])) + ( + tips.assign( + rnk=tips.groupby(["day"])["total_bill"].rank( + method="first", ascending=False + ) + ) + .query("rnk < 3") + .sort_values(["day", "rnk"]) + ) .. code-block:: sql @@ -453,16 +466,17 @@ the same using `rank(method='first')` function Let's find tips with (rank < 3) per gender group for (tips < 2). Notice that when using ``rank(method='min')`` function -`rnk_min` remains the same for the same `tip` +``rnk_min`` remains the same for the same ``tip`` (as Oracle's RANK() function) .. ipython:: python - (tips[tips['tip'] < 2] - .assign(rnk_min=tips.groupby(['sex'])['tip'] - .rank(method='min')) - .query('rnk_min < 3') - .sort_values(['sex', 'rnk_min'])) + ( + tips[tips["tip"] < 2] + .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min")) + .query("rnk_min < 3") + .sort_values(["sex", "rnk_min"]) + ) UPDATE @@ -476,7 +490,7 @@ UPDATE .. ipython:: python - tips.loc[tips['tip'] < 2, 'tip'] *= 2 + tips.loc[tips["tip"] < 2, "tip"] *= 2 DELETE ------ @@ -490,4 +504,4 @@ In pandas we select the rows that should remain, instead of deleting them .. 
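A multi-branch UPDATE (SQL's ``CASE WHEN``) maps naturally onto :func:`numpy.select`; a sketch, assuming ``numpy`` is imported as ``np`` as at the top of this page, with bucket edges chosen only for illustration:

.. code-block:: python

    # CASE WHEN tip < 2 THEN 'low' WHEN tip < 4 THEN 'mid' ELSE 'high' END
    conditions = [tips["tip"] < 2, tips["tip"] < 4]
    tips["tip_band"] = np.select(conditions, ["low", "mid"], default="high")

For DELETE, selecting the rows that remain looks like this:

..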
ipython:: python - tips = tips.loc[tips['tip'] <= 9] + tips = tips.loc[tips["tip"] <= 9] diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 06f9e45466243..014506cc18327 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -103,7 +103,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) + df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) df @@ -127,8 +127,10 @@ the data set if presented with a url. .. ipython:: python - url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.github.com/pandas-dev" + "/pandas/master/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) tips.head() @@ -139,16 +141,16 @@ the pandas command would be: .. code-block:: python - tips = pd.read_csv('tips.csv', sep='\t', header=None) + tips = pd.read_csv("tips.csv", sep="\t", header=None) # alternatively, read_table is an alias to read_csv with tab delimiter - tips = pd.read_table('tips.csv', header=None) + tips = pd.read_table("tips.csv", header=None) -Pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. +pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. .. code-block:: python - df = pd.read_stata('data.dta') + df = pd.read_stata("data.dta") In addition to text/csv and Stata files, pandas supports a variety of other data formats such as Excel, SAS, HDF5, Parquet, and SQL databases. These are all read via a ``pd.read_*`` @@ -168,13 +170,13 @@ Similarly in pandas, the opposite of ``read_csv`` is :meth:`DataFrame.to_csv`. .. code-block:: python - tips.to_csv('tips2.csv') + tips.to_csv("tips2.csv") -Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. +pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. .. code-block:: python - tips.to_stata('tips2.dta') + tips.to_stata("tips2.dta") Data operations @@ -200,11 +202,11 @@ drops a column from the ``DataFrame``. .. ipython:: python - tips['total_bill'] = tips['total_bill'] - 2 - tips['new_bill'] = tips['total_bill'] / 2 + tips["total_bill"] = tips["total_bill"] - 2 + tips["new_bill"] = tips["total_bill"] / 2 tips.head() - tips = tips.drop('new_bill', axis=1) + tips = tips.drop("new_bill", axis=1) Filtering ~~~~~~~~~ @@ -220,7 +222,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin .. ipython:: python - tips[tips['total_bill'] > 10].head() + tips[tips["total_bill"] > 10].head() If/then logic ~~~~~~~~~~~~~ @@ -237,13 +239,13 @@ the ``where`` method from ``numpy``. .. ipython:: python - tips['bucket'] = np.where(tips['total_bill'] < 10, 'low', 'high') + tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") tips.head() .. ipython:: python :suppress: - tips = tips.drop('bucket', axis=1) + tips = tips.drop("bucket", axis=1) Date functionality ~~~~~~~~~~~~~~~~~~ @@ -273,22 +275,26 @@ see the :ref:`timeseries documentation` for more details. .. 
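Where Stata would use ``recode`` or ``egen cut`` to bin a continuous variable, one pandas counterpart is :func:`pandas.cut`; a minimal sketch (the bin edges are arbitrary):

.. code-block:: python

    # Bin total_bill into labeled ranges
    tips["bill_band"] = pd.cut(
        tips["total_bill"], bins=[0, 10, 20, 100], labels=["low", "mid", "high"]
    )

Date columns are handled with the ``dt`` accessor:

..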
ipython:: python - tips['date1'] = pd.Timestamp('2013-01-15') - tips['date2'] = pd.Timestamp('2015-02-15') - tips['date1_year'] = tips['date1'].dt.year - tips['date2_month'] = tips['date2'].dt.month - tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) + tips["date1"] = pd.Timestamp("2013-01-15") + tips["date2"] = pd.Timestamp("2015-02-15") + tips["date1_year"] = tips["date1"].dt.year + tips["date2_month"] = tips["date2"].dt.month + tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() + tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ + "date1" + ].dt.to_period("M") - tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', - 'months_between']].head() + tips[ + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] + ].head() .. ipython:: python :suppress: - tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', - 'date1_next', 'months_between'], axis=1) + tips = tips.drop( + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], + axis=1, + ) Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -310,13 +316,13 @@ to a variable. .. ipython:: python # keep - tips[['sex', 'total_bill', 'tip']].head() + tips[["sex", "total_bill", "tip"]].head() # drop - tips.drop('sex', axis=1).head() + tips.drop("sex", axis=1).head() # rename - tips.rename(columns={'total_bill': 'total_bill_2'}).head() + tips.rename(columns={"total_bill": "total_bill_2"}).head() Sorting by values @@ -333,7 +339,7 @@ takes a list of columns to sort by. .. ipython:: python - tips = tips.sort_values(['sex', 'total_bill']) + tips = tips.sort_values(["sex", "total_bill"]) tips.head() @@ -357,8 +363,8 @@ Use ``len`` and ``rstrip`` to exclude trailing blanks. .. ipython:: python - tips['time'].str.len().head() - tips['time'].str.rstrip().str.len().head() + tips["time"].str.len().head() + tips["time"].str.rstrip().str.len().head() Finding position of substring @@ -380,7 +386,7 @@ the function will return -1 if it fails to find the substring. .. ipython:: python - tips['sex'].str.find("ale").head() + tips["sex"].str.find("ale").head() Extracting substring by position @@ -398,7 +404,7 @@ indexes are zero-based. .. ipython:: python - tips['sex'].str[0:1].head() + tips["sex"].str[0:1].head() Extracting nth word @@ -425,9 +431,9 @@ approaches, but this just shows a simple approach. .. ipython:: python - firstlast = pd.DataFrame({'string': ['John Smith', 'Jane Cook']}) - firstlast['First_Name'] = firstlast['string'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['string'].str.rsplit(" ", expand=True)[0] + firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]}) + firstlast["First_Name"] = firstlast["string"].str.split(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["string"].str.rsplit(" ", expand=True)[1] firstlast @@ -455,10 +461,10 @@ The equivalent Python functions are ``upper``, ``lower``, and ``title``. ..
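Regular-expression capture groups offer a more explicit alternative to splitting; a minimal sketch with :meth:`Series.str.extract` (the two-word pattern is an assumption about the names above):

.. code-block:: python

    # Named groups become columns of the result
    firstlast["string"].str.extract(r"(?P<First_Name>\w+) (?P<Last_Name>\w+)")

Changing case is equally direct:

..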
ipython:: python - firstlast = pd.DataFrame({'string': ['John Smith', 'Jane Cook']}) - firstlast['upper'] = firstlast['string'].str.upper() - firstlast['lower'] = firstlast['string'].str.lower() - firstlast['title'] = firstlast['string'].str.title() + firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]}) + firstlast["upper"] = firstlast["string"].str.upper() + firstlast["lower"] = firstlast["string"].str.lower() + firstlast["title"] = firstlast["string"].str.title() firstlast Merging @@ -468,11 +474,9 @@ The following tables will be used in the merge examples .. ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) df1 - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) df2 In Stata, to perform a merge, one data set must be in memory @@ -534,16 +538,16 @@ types are accomplished via the ``how`` keyword. .. ipython:: python - inner_join = df1.merge(df2, on=['key'], how='inner') + inner_join = df1.merge(df2, on=["key"], how="inner") inner_join - left_join = df1.merge(df2, on=['key'], how='left') + left_join = df1.merge(df2, on=["key"], how="left") left_join - right_join = df1.merge(df2, on=['key'], how='right') + right_join = df1.merge(df2, on=["key"], how="right") right_join - outer_join = df1.merge(df2, on=['key'], how='outer') + outer_join = df1.merge(df2, on=["key"], how="outer") outer_join @@ -558,8 +562,8 @@ operations, and is ignored by default for aggregations. .. ipython:: python outer_join - outer_join['value_x'] + outer_join['value_y'] - outer_join['value_x'].sum() + outer_join["value_x"] + outer_join["value_y"] + outer_join["value_x"].sum() One difference is that missing data cannot be compared to its sentinel value. For example, in Stata you could do this to filter missing values. @@ -576,10 +580,10 @@ should be used for comparisons. .. ipython:: python - outer_join[pd.isna(outer_join['value_x'])] - outer_join[pd.notna(outer_join['value_x'])] + outer_join[pd.isna(outer_join["value_x"])] + outer_join[pd.notna(outer_join["value_x"])] -Pandas also provides a variety of methods to work with missing data -- some of +pandas also provides a variety of methods to work with missing data -- some of which would be challenging to express in Stata. For example, there are methods to drop all rows with any missing values, replacing missing values with a specified value, like the mean, or forward filling from previous rows. See the @@ -591,10 +595,10 @@ value, like the mean, or forward filling from previous rows. See the outer_join.dropna() # Fill forwards - outer_join.fillna(method='ffill') + outer_join.fillna(method="ffill") # Impute missing values with the mean - outer_join['value_x'].fillna(outer_join['value_x'].mean()) + outer_join["value_x"].fillna(outer_join["value_x"].mean()) GroupBy @@ -617,7 +621,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() + tips_summed = tips.groupby(["sex", "smoker"])[["total_bill", "tip"]].sum() tips_summed.head() @@ -640,8 +644,8 @@ operation. .. 
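Stata users may also miss the ``_merge`` variable that Stata's ``merge`` creates; pandas reproduces that bookkeeping with the ``indicator`` argument. A minimal sketch using the ``df1`` and ``df2`` frames from the merge section:

.. code-block:: python

    # Adds a "_merge" column taking the values left_only, right_only or both
    df1.merge(df2, on=["key"], how="outer", indicator=True)

The group-wise transformation itself looks like this:

..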
ipython:: python - gb = tips.groupby('smoker')['total_bill'] - tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') + gb = tips.groupby("smoker")["total_bill"] + tips["adj_total_bill"] = tips["total_bill"] - gb.transform("mean") tips.head() @@ -661,7 +665,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex', 'smoker']).first() + tips.groupby(["sex", "smoker"]).first() Other considerations @@ -670,7 +674,7 @@ Other considerations Disk vs memory ~~~~~~~~~~~~~~ -Pandas and Stata both operate exclusively in memory. This means that the size of +pandas and Stata both operate exclusively in memory. This means that the size of data able to be loaded in pandas is limited by your machine's memory. If out of core processing is needed, one possibility is the `dask.dataframe `_ diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index eb7ee000a9a86..6f6eeada0cfed 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -533,7 +533,7 @@ pandas has great support for time series and has an extensive set of tools for w
-Data sets do not only contain numerical data. pandas provides a wide range of functions to cleaning textual data and extract useful information from it. +Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it. .. raw:: html diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b79a9cd872c47..c823ad01f10bf 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.6.1 and above, 3.7, and 3.8. +Officially Python 3.7.1 and above, 3.8, and 3.9. Installing pandas ----------------- @@ -28,20 +28,20 @@ Installing pandas Installing with Anaconda ~~~~~~~~~~~~~~~~~~~~~~~~ -Installing pandas and the rest of the `NumPy `__ and -`SciPy `__ stack can be a little +Installing pandas and the rest of the `NumPy `__ and +`SciPy `__ stack can be a little difficult for inexperienced users. The simplest way to install not only pandas, but Python and the most popular -packages that make up the `SciPy `__ stack -(`IPython `__, `NumPy `__, +packages that make up the `SciPy `__ stack +(`IPython `__, `NumPy `__, `Matplotlib `__, ...) is with `Anaconda `__, a cross-platform -(Linux, Mac OS X, Windows) Python distribution for data analytics and +(Linux, macOS, Windows) Python distribution for data analytics and scientific computing. After running the installer, the user will have access to pandas and the -rest of the `SciPy `__ stack without needing to install +rest of the `SciPy `__ stack without needing to install anything else, and without needing to wait for any software to be compiled. Installation instructions for `Anaconda `__ @@ -179,12 +179,12 @@ In Linux/Mac you can run ``which python`` on your terminal and it will tell you using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended. It is highly recommended to use ``conda``, for quick installation and for package and dependency updates. -You can find simple installation instructions for pandas in this document: `installation instructions `. +You can find simple installation instructions for pandas in this document: ``installation instructions ``. Installing from source ~~~~~~~~~~~~~~~~~~~~~~ -See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment. +See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a pandas development environment. Running the test suite ---------------------- @@ -220,9 +220,9 @@ Dependencies Package Minimum supported version ================================================================ ========================== `setuptools `__ 24.2.0 -`NumPy `__ 1.15.4 +`NumPy `__ 1.16.5 `python-dateutil `__ 2.7.3 -`pytz `__ 2017.2 +`pytz `__ 2017.3 ================================================================ ========================== .. _install.recommended_dependencies: @@ -232,7 +232,7 @@ Recommended dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. 
- If installed, must be Version 2.6.2 or higher. + If installed, must be Version 2.6.8 or higher. * `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, @@ -249,7 +249,7 @@ Recommended dependencies Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ -Pandas has many optional dependencies that are only used for specific methods. +pandas has many optional dependencies that are only used for specific methods. For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while :meth:`DataFrame.to_markdown` requires the ``tabulate`` package. If the optional dependency is not installed, pandas will raise an ``ImportError`` when @@ -259,36 +259,35 @@ the method requiring that dependency is called. Dependency Minimum Version Notes ========================= ================== ============================================================= BeautifulSoup4 4.6.0 HTML parser for read_html (see :ref:`note `) -Jinja2 Conditional formatting with DataFrame.style +Jinja2 2.10 Conditional formatting with DataFrame.style PyQt4 Clipboard I/O PyQt5 Clipboard I/O -PyTables 3.4.3 HDF5-based reading / writing -SQLAlchemy 1.1.4 SQL support for databases other than sqlite -SciPy 0.19.0 Miscellaneous statistical functions -XLsxWriter 0.9.8 Excel writing -blosc Compression for HDF5 +PyTables 3.5.1 HDF5-based reading / writing +SQLAlchemy 1.2.8 SQL support for databases other than sqlite +SciPy 1.2.0 Miscellaneous statistical functions +xlsxwriter 1.0.2 Excel writing +blosc 1.15.0 Compression for HDF5 fsspec 0.7.4 Handling files aside from local and HTTP fastparquet 0.3.2 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access -html5lib HTML parser for read_html (see :ref:`note `) -lxml 3.8.0 HTML parser for read_html (see :ref:`note `) -matplotlib 2.2.2 Visualization +html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) +lxml 4.3.0 HTML parser for read_html (see :ref:`note `) +matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 2.5.7 Reading / writing for xlsx files +openpyxl 2.6.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access -psycopg2 PostgreSQL engine for sqlalchemy -pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing +psycopg2 2.7 PostgreSQL engine for sqlalchemy +pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading -pytables 3.4.3 HDF5 reading / writing pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) -xarray 0.8.2 pandas-like API for N-dimensional data +xarray 0.12.3 pandas-like API for N-dimensional data xclip Clipboard I/O on linux -xlrd 1.1.0 Excel reading -xlwt 1.2.0 Excel writing +xlrd 1.2.0 Excel reading +xlwt 1.3.0 Excel writing xsel Clipboard I/O on linux zlib Compression for HDF5 ========================= ================== ============================================================= @@ -301,8 +300,6 @@ Optional dependencies for parsing HTML One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: -..
versionchanged:: 0.23.0 - * `BeautifulSoup4`_ and `html5lib`_ * `BeautifulSoup4`_ and `lxml`_ * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index dc9bec2284aab..e8e0fef271a74 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -41,12 +41,16 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno .. ipython:: python - df = pd.DataFrame({ - "Name": ["Braund, Mr. Owen Harris", - "Allen, Mr. William Henry", - "Bonnell, Miss. Elizabeth"], - "Age": [22, 35, 58], - "Sex": ["male", "male", "female"]} + df = pd.DataFrame( + { + "Name": [ + "Braund, Mr. Owen Harris", + "Allen, Mr. William Henry", + "Bonnell, Miss. Elizabeth", + ], + "Age": [22, 35, 58], + "Sex": ["male", "male", "female"], + } ) df diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index c6c6bfefc4303..c9b6a12904311 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -138,7 +138,7 @@ My colleague requested the Titanic data as a spreadsheet. .. ipython:: python - titanic.to_excel('titanic.xlsx', sheet_name='passengers', index=False) + titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False) Whereas ``read_*`` functions are used to read data to pandas, the ``to_*`` methods are used to store data. The :meth:`~DataFrame.to_excel` method stores @@ -156,7 +156,7 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data .. ipython:: python - titanic = pd.read_excel('titanic.xlsx', sheet_name='passengers') + titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers") .. ipython:: python @@ -166,7 +166,8 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data :suppress: import os - os.remove('titanic.xlsx') + + os.remove("titanic.xlsx") .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst index 8476fee5e1eee..a718c39620ce5 100644 --- a/doc/source/getting_started/intro_tutorials/03_subset_data.rst +++ b/doc/source/getting_started/intro_tutorials/03_subset_data.rst @@ -27,14 +27,14 @@ This tutorial uses the Titanic data set, stored as CSV. The data consists of the following data columns: - PassengerId: Id of every passenger. -- Survived: This feature have value 0 and 1. 0 for not survived and 1 +- Survived: This feature has value 0 and 1. 0 for not survived and 1 for survived. - Pclass: There are 3 classes: Class 1, Class 2 and Class 3. - Name: Name of passenger. - Sex: Gender of passenger. - Age: Age of passenger. -- SibSp: Indication that passenger have siblings and spouse. -- Parch: Whether a passenger is alone or have family. +- SibSp: Indication that passengers have siblings and spouses. +- Parch: Whether a passenger is alone or has a family. - Ticket: Ticket number of passenger. - Fare: Indicating the fare. - Cabin: The cabin of passenger. @@ -199,7 +199,7 @@ selection brackets ``[]``. Only rows for which the value is ``True`` will be selected. We know from before that the original Titanic ``DataFrame`` consists of -891 rows. Let’s have a look at the amount of rows which satisfy the +891 rows. 
Let’s have a look at the number of rows which satisfy the condition by checking the ``shape`` attribute of the resulting ``DataFrame`` ``above_35``: @@ -398,7 +398,7 @@ See the user guide section on :ref:`different choices for indexing To user guide -A full overview about indexing is provided in the user guide pages on :ref:`indexing and selecting data `. +A full overview of indexing is provided in the user guide pages on :ref:`indexing and selecting data `. .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index f3d99ee56359a..b7a566a35084d 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -40,8 +40,7 @@ in respectively Paris, Antwerp and London. .. ipython:: python - air_quality = pd.read_csv("data/air_quality_no2.csv", - index_col=0, parse_dates=True) + air_quality = pd.read_csv("data/air_quality_no2.csv", index_col=0, parse_dates=True) air_quality.head() .. note:: @@ -112,9 +111,7 @@ I want to visually compare the :math:`NO_2` values measured in London versus Par .. ipython:: python @savefig 04_airqual_scatter.png - air_quality.plot.scatter(x="station_london", - y="station_paris", - alpha=0.5) + air_quality.plot.scatter(x="station_london", y="station_paris", alpha=0.5) .. raw:: html @@ -127,12 +124,15 @@ standard Python to get an overview of the available plot methods: .. ipython:: python - [method_name for method_name in dir(air_quality.plot) - if not method_name.startswith("_")] + [ + method_name + for method_name in dir(air_quality.plot) + if not method_name.startswith("_") + ] .. note:: - In many development environments as well as ipython and - jupyter notebook, use the TAB button to get an overview of the available + In many development environments as well as IPython and + Jupyter Notebook, use the TAB button to get an overview of the available methods, for example ``air_quality.plot.`` + TAB. One of the options is :meth:`DataFrame.plot.box`, which refers to a @@ -167,7 +167,7 @@ I want each of the columns in a separate subplot. @savefig 04_airqual_area_subplot.png axs = air_quality.plot.area(figsize=(12, 4), subplots=True) -Separate subplots for each of the data columns is supported by the ``subplots`` argument +Separate subplots for each of the data columns are supported by the ``subplots`` argument of the ``plot`` functions. The builtin options available in each of the pandas plot functions are worth exploring. @@ -196,24 +196,25 @@ I want to further customize, extend or save the resulting plot. .. ipython:: python - fig, axs = plt.subplots(figsize=(12, 4)); - air_quality.plot.area(ax=axs); + fig, axs = plt.subplots(figsize=(12, 4)) + air_quality.plot.area(ax=axs) @savefig 04_airqual_customized.png - axs.set_ylabel("NO$_2$ concentration"); + axs.set_ylabel("NO$_2$ concentration") fig.savefig("no2_concentrations.png") .. ipython:: python :suppress: import os - os.remove('no2_concentrations.png') + + os.remove("no2_concentrations.png") .. raw:: html -Each of the plot objects created by pandas are a +Each of the plot objects created by pandas is a `matplotlib `__ object. As Matplotlib provides plenty of options to customize plots, making the link between pandas and Matplotlib explicit brings all the power of matplotlib to the plot.
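Because the returned object is a regular ``Axes``, any matplotlib call can be chained onto a pandas plot. A minimal sketch, assuming ``matplotlib.pyplot`` is imported as ``plt`` and the ``air_quality`` frame from this tutorial (the title, legend position and file name are illustrative):

.. code-block:: python

    import matplotlib.pyplot as plt

    fig, axs = plt.subplots(figsize=(12, 4))
    air_quality.plot.line(ax=axs)           # draw into the prepared Axes
    axs.set_title("NO$_2$ concentrations")  # any matplotlib method applies
    axs.legend(loc="upper right")
    fig.savefig("no2_lines.png", dpi=150)   # control resolution when saving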
diff --git a/doc/source/getting_started/intro_tutorials/05_add_columns.rst b/doc/source/getting_started/intro_tutorials/05_add_columns.rst index d4f6a8d6bb4a2..a99c2c49585c5 100644 --- a/doc/source/getting_started/intro_tutorials/05_add_columns.rst +++ b/doc/source/getting_started/intro_tutorials/05_add_columns.rst @@ -39,8 +39,7 @@ in respectively Paris, Antwerp and London. .. ipython:: python - air_quality = pd.read_csv("data/air_quality_no2.csv", - index_col=0, parse_dates=True) + air_quality = pd.read_csv("data/air_quality_no2.csv", index_col=0, parse_dates=True) air_quality.head() .. raw:: html @@ -95,8 +94,9 @@ I want to check the ratio of the values in Paris versus Antwerp and save the res .. ipython:: python - air_quality["ratio_paris_antwerp"] = \ + air_quality["ratio_paris_antwerp"] = ( air_quality["station_paris"] / air_quality["station_antwerp"] + ) air_quality.head() The calculation is again element-wise, so the ``/`` is applied *for the @@ -122,9 +122,12 @@ I want to rename the data columns to the corresponding station identifiers used .. ipython:: python air_quality_renamed = air_quality.rename( - columns={"station_antwerp": "BETR801", - "station_paris": "FR04014", - "station_london": "London Westminster"}) + columns={ + "station_antwerp": "BETR801", + "station_paris": "FR04014", + "station_london": "London Westminster", + } + ) .. ipython:: python diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index c7363b94146ac..6ce98ba5dbd1b 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -122,8 +122,12 @@ aggregating statistics for given columns can be defined using the .. ipython:: python - titanic.agg({'Age': ['min', 'max', 'median', 'skew'], - 'Fare': ['min', 'max', 'median', 'mean']}) + titanic.agg( + { + "Age": ["min", "max", "median", "skew"], + "Fare": ["min", "max", "median", "mean"], + } + ) .. raw:: html @@ -197,7 +201,7 @@ on the grouped data as well: :align: center .. note:: - The `Pclass` column contains numerical data but actually + The ``Pclass`` column contains numerical data but actually represents 3 categories (or factors) with respectively the labels ‘1’, ‘2’ and ‘3’. Calculating statistics on these does not make much sense. Therefore, pandas provides a ``Categorical`` data type to handle this diff --git a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst index c16fec6aaba9f..20c36133330c4 100644 --- a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst +++ b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst @@ -101,8 +101,9 @@ measurement. .. ipython:: python - air_quality = pd.read_csv("data/air_quality_long.csv", - index_col="date.utc", parse_dates=True) + air_quality = pd.read_csv( + "data/air_quality_long.csv", index_col="date.utc", parse_dates=True + ) air_quality.head() .. raw:: html @@ -247,8 +248,9 @@ I want the mean concentrations for :math:`NO_2` and :math:`PM_{2.5}` in each of .. ipython:: python - air_quality.pivot_table(values="value", index="location", - columns="parameter", aggfunc="mean") + air_quality.pivot_table( + values="value", index="location", columns="parameter", aggfunc="mean" + ) In the case of :meth:`~DataFrame.pivot`, the data is only rearranged. 
When multiple values need to be aggregated (in this specific case, the values on @@ -266,9 +268,13 @@ the ``margins`` parameter to ``True``: .. ipython:: python - air_quality.pivot_table(values="value", index="location", - columns="parameter", aggfunc="mean", - margins=True) + air_quality.pivot_table( + values="value", + index="location", + columns="parameter", + aggfunc="mean", + margins=True, + ) .. raw:: html @@ -345,12 +351,12 @@ The :func:`pandas.melt` method can be defined in more detail: .. ipython:: python - no_2 = no2_pivoted.melt(id_vars="date.utc", - value_vars=["BETR801", - "FR04014", - "London Westminster"], - value_name="NO_2", - var_name="id_location") + no_2 = no2_pivoted.melt( + id_vars="date.utc", + value_vars=["BETR801", "FR04014", "London Westminster"], + value_name="NO_2", + var_name="id_location", + ) no_2.head() The result is the same, but defined in more detail: diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 600a75b156ac4..be4c284912db4 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -123,9 +123,9 @@ concatenated tables to verify the operation: .. ipython:: python - print('Shape of the `air_quality_pm25` table: ', air_quality_pm25.shape) - print('Shape of the `air_quality_no2` table: ', air_quality_no2.shape) - print('Shape of the resulting `air_quality` table: ', air_quality.shape) + print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape) + print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape) + print('Shape of the resulting ``air_quality`` table: ', air_quality.shape) Hence, the resulting table has 3178 = 1110 + 2068 rows. @@ -155,8 +157,7 @@ index. For example: .. ipython:: python - air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], - keys=["PM25", "NO2"]) + air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"]) .. ipython:: python @@ -233,8 +232,7 @@ Add the station coordinates, provided by the stations metadata table, to the cor .. ipython:: python - air_quality = pd.merge(air_quality, stations_coord, - how='left', on='location') + air_quality = pd.merge(air_quality, stations_coord, how="left", on="location") air_quality.head() Using the :meth:`~pandas.merge` function, for each of the rows in the diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 19351e0e3bc75..598d3514baa15 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -204,10 +204,9 @@ Plot the typical :math:`NO_2` pattern during the day of our time series of all s ..
ipython:: python fig, axs = plt.subplots(figsize=(12, 4)) - air_quality.groupby( - air_quality["datetime"].dt.hour)["value"].mean().plot(kind='bar', - rot=0, - ax=axs) + air_quality.groupby(air_quality["datetime"].dt.hour)["value"].mean().plot( + kind='bar', rot=0, ax=axs + ) plt.xlabel("Hour of the day"); # custom x label using matplotlib @savefig 09_bar_chart.png plt.ylabel("$NO_2 (µg/m^3)$"); diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 93ad35fb1960b..b8db7de5b7b10 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -66,15 +66,15 @@ How to manipulate textual data?
  • -Make all name characters lowercase +Make all name characters lowercase. .. ipython:: python titanic["Name"].str.lower() To make each of the strings in the ``Name`` column lowercase, select the ``Name`` column -(see :ref:`tutorial on selection of data <10min_tut_03_subset>`), add the ``str`` accessor and -apply the ``lower`` method. As such, each of the strings is converted element wise. +(see the :ref:`tutorial on selection of data <10min_tut_03_subset>`), add the ``str`` accessor and +apply the ``lower`` method. As such, each of the strings is converted element-wise. .. raw:: html @@ -86,7 +86,7 @@ having a ``dt`` accessor, a number of specialized string methods are available when using the ``str`` accessor. These methods have in general matching names with the equivalent built-in string methods for single elements, but are applied -element-wise (remember :ref:`element wise calculations <10min_tut_05_columns>`?) +element-wise (remember :ref:`element-wise calculations <10min_tut_05_columns>`?) on each of the values of the columns. .. raw:: html @@ -94,7 +94,7 @@ on each of the values of the columns.
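A tiny illustration of this correspondence (the series below is a throwaway example):

.. code-block:: python

    import pandas as pd

    s = pd.Series(["Braund", "Allen", "Bonnell"])
    s.str.startswith("B")      # element-wise, returns a boolean Series
    "Braund".startswith("B")   # the single-element built-in counterpart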
    • -Create a new column ``Surname`` that contains the surname of the Passengers by extracting the part before the comma. +Create a new column ``Surname`` that contains the surname of the passengers by extracting the part before the comma. .. ipython:: python @@ -135,7 +135,7 @@ More information on extracting parts of strings is available in the user guide s
      • -Extract the passenger data about the Countesses on board of the Titanic. +Extract the passenger data about the countesses on board the Titanic. .. ipython:: python @@ -145,15 +145,15 @@ Extract the passenger data about the Countesses on board of the Titanic. titanic[titanic["Name"].str.contains("Countess")] -(*Interested in her story? See *\ `Wikipedia `__\ *!*) +(*Interested in her story? See* `Wikipedia `__\ *!*) The string method :meth:`Series.str.contains` checks for each of the values in the column ``Name`` if the string contains the word ``Countess`` and returns -for each of the values ``True`` (``Countess`` is part of the name) of +for each of the values ``True`` (``Countess`` is part of the name) or ``False`` (``Countess`` is not part of the name). This output can be used to subselect the data using conditional (boolean) indexing introduced in the :ref:`subsetting of data tutorial <10min_tut_03_subset>`. As there was -only one Countess on the Titanic, we get one row as a result. +only one countess on the Titanic, we get one row as a result. .. raw:: html @@ -220,12 +220,11 @@ we can do a selection using the ``loc`` operator, introduced in the
        • -In the "Sex" column, replace values of "male" by "M" and values of "female" by "F" +In the "Sex" column, replace values of "male" by "M" and values of "female" by "F". .. ipython:: python - titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", - "female": "F"}) + titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"}) titanic["Sex_short"] Whereas :meth:`~Series.replace` is not a string method, it provides a convenient way @@ -257,7 +256,7 @@ a ``dictionary`` to define the mapping ``{from : to}``.
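:meth:`Series.map` accepts the same dictionary style; one difference worth noting is that values absent from the mapping become ``NaN`` with ``map``, while ``replace`` leaves them untouched. A minimal sketch:

.. code-block:: python

    # Unmatched values (none here) would become NaN with map
    titanic["Sex"].map({"male": "M", "female": "F"})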

          REMEMBER

          - String methods are available using the ``str`` accessor. -- String methods work element wise and can be used for conditional +- String methods work element-wise and can be used for conditional indexing. - The ``replace`` method is a convenient method to convert values according to a given dictionary. diff --git a/doc/source/getting_started/intro_tutorials/index.rst b/doc/source/getting_started/intro_tutorials/index.rst index 28e7610866461..c67e18043c175 100644 --- a/doc/source/getting_started/intro_tutorials/index.rst +++ b/doc/source/getting_started/intro_tutorials/index.rst @@ -19,4 +19,3 @@ Getting started tutorials 08_combine_dataframes 09_timeseries 10_text_data - diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index d8a40c5406dee..3d8108d78ac89 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -6,12 +6,12 @@ Package overview **************** -**pandas** is a `Python `__ package providing fast, +pandas is a `Python `__ package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the -fundamental high-level building block for doing practical, **real world** data +fundamental high-level building block for doing practical, **real-world** data analysis in Python. Additionally, it has the broader goal of becoming **the -most powerful and flexible open source data analysis / manipulation tool +most powerful and flexible open source data analysis/manipulation tool available in any language**. It is already well on its way toward this goal. pandas is well suited for many different kinds of data: @@ -21,7 +21,7 @@ pandas is well suited for many different kinds of data: - Ordered and unordered (not necessarily fixed-frequency) time series data. - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels - - Any other form of observational / statistical data sets. The data actually + - Any other form of observational / statistical data sets. The data need not be labeled at all to be placed into a pandas data structure The two primary data structures of pandas, :class:`Series` (1-dimensional) @@ -40,7 +40,7 @@ Here are just a few of the things that pandas does well: higher dimensional objects - Automatic and explicit **data alignment**: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and - let `Series`, `DataFrame`, etc. automatically align the data for you in + let ``Series``, ``DataFrame``, etc. automatically align the data for you in computations - Powerful, flexible **group by** functionality to perform split-apply-combine operations on data sets, for both aggregating and @@ -57,7 +57,7 @@ Here are just a few of the things that pandas does well: Excel files, databases, and saving / loading data from the ultrafast **HDF5 format** - **Time series**-specific functionality: date range generation and frequency - conversion, moving window statistics, date shifting and lagging. + conversion, moving window statistics, date shifting, and lagging. Many of these principles are here to address the shortcomings frequently experienced using other languages / scientific research environments. For data @@ -101,12 +101,12 @@ fashion. Also, we would like sensible default behaviors for the common API functions which take into account the typical orientation of time series and -cross-sectional data sets. 
When using ndarrays to store 2- and 3-dimensional +cross-sectional data sets. When using the N-dimensional array (ndarrays) to store 2- and 3-dimensional data, a burden is placed on the user to consider the orientation of the data set when writing functions; axes are considered more or less equivalent (except when C- or Fortran-contiguousness matters for performance). In pandas, the axes are intended to lend more semantic meaning to the data; i.e., for a particular -data set there is likely to be a "right" way to orient the data. The goal, +data set, there is likely to be a "right" way to orient the data. The goal, then, is to reduce the amount of mental effort required to code up data transformations in downstream functions. @@ -148,8 +148,8 @@ pandas possible. Thanks to `all of our contributors `. pandas is a `NumFOCUS `__ sponsored project. -This will help ensure the success of development of pandas as a world-class open-source -project, and makes it possible to `donate `__ to the project. +This will help ensure the success of the development of pandas as a world-class open-source +project and makes it possible to `donate `__ to the project. Project governance ------------------ @@ -174,4 +174,3 @@ License ------- .. literalinclude:: ../../../LICENSE - diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4c2d0621c6103..b8940d2efed2f 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -94,4 +94,4 @@ Various tutorials * `Intro to pandas data structures, by Greg Reda `_ * `Pandas and Python: Top 10, by Manish Amde `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ -* `A concise tutorial with real life examples `_ +* `A concise tutorial with real life examples `_ diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 4aba8f709fba0..c6deb4b7ea383 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -17,7 +17,7 @@ pandas documentation `Source Repository `__ | `Issues & Ideas `__ | `Q&A Support `__ | -`Mailing List `__ +`Mailing List `__ :mod:`pandas` is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the `Python `__ diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 1725c415fa020..43e2509469488 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -16,7 +16,7 @@ For some data types, pandas extends NumPy's type system. String aliases for thes can be found at :ref:`basics.dtypes`. =================== ========================= ================== ============================= -Kind of Data Pandas Data Type Scalar Array +Kind of Data pandas Data Type Scalar Array =================== ========================= ================== ============================= TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime` Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta` @@ -29,7 +29,7 @@ Strings :class:`StringDtype` :class:`str` :ref:`api.array Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= -Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). +pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). 
The top-level :meth:`array` method can be used to create a new array, which may be stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFrame`. @@ -43,7 +43,7 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra Datetime data ------------- -NumPy cannot natively represent timezone-aware datetimes. Pandas supports this +NumPy cannot natively represent timezone-aware datetimes. pandas supports this with the :class:`arrays.DatetimeArray` extension array, which can hold timezone-naive or timezone-aware values. @@ -63,7 +63,9 @@ Properties Timestamp.asm8 Timestamp.day Timestamp.dayofweek + Timestamp.day_of_week Timestamp.dayofyear + Timestamp.day_of_year Timestamp.days_in_month Timestamp.daysinmonth Timestamp.fold @@ -162,7 +164,7 @@ If the data are tz-aware, then every value in the array must have the same timez Timedelta data -------------- -NumPy can natively represent timedeltas. Pandas provides :class:`Timedelta` +NumPy can natively represent timedeltas. pandas provides :class:`Timedelta` for symmetry with :class:`Timestamp`. .. autosummary:: :toctree: api/ @@ -217,7 +219,7 @@ A collection of timedeltas may be stored in a :class:`TimedeltaArray`. Timespan data ------------- -Pandas represents spans of times as :class:`Period` objects. +pandas represents spans of times as :class:`Period` objects. Period ------ @@ -233,7 +235,9 @@ Properties Period.day Period.dayofweek + Period.day_of_week Period.dayofyear + Period.day_of_year Period.days_in_month Period.daysinmonth Period.end_time @@ -352,7 +356,7 @@ Nullable integer ---------------- :class:`numpy.ndarray` cannot natively represent integer-data with missing values. -Pandas provides this through :class:`arrays.IntegerArray`. +pandas provides this through :class:`arrays.IntegerArray`. .. autosummary:: :toctree: api/ @@ -378,7 +382,7 @@ Pandas provides this through :class:`arrays.IntegerArray`. Categorical data ---------------- -Pandas defines a custom data type for representing data that can take only a +pandas defines a custom data type for representing data that can take only a limited, fixed set of values. The dtype of a ``Categorical`` can be described by a :class:`pandas.api.types.CategoricalDtype`. diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e3dfb552651a0..9a1ebc8d670dc 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -37,6 +37,7 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty + DataFrame.set_flags Conversion ~~~~~~~~~~ @@ -276,6 +277,21 @@ Time Series-related DataFrame.tz_convert DataFrame.tz_localize +.. _api.frame.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`DataFrame.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags + + .. _api.frame.metadata: Metadata @@ -343,6 +359,7 @@ Sparse-dtype specific methods and attributes are provided under the ..
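A short sketch of the new ``Flags`` machinery (as of this release, ``allows_duplicate_labels`` is the only flag; ``attrs`` stores the dataset properties mentioned above):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df = df.set_flags(allows_duplicate_labels=False)  # returns a new object
    df.flags.allows_duplicate_labels                  # -> False
    df.attrs["source"] = "sensor 7"                   # free-form metadata

The sparse accessor's contents are listed next:

..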
autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.sparse.from_spmatrix DataFrame.sparse.to_coo diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index c1759110b94ad..37fe980dbf68c 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -37,6 +37,7 @@ Exceptions and warnings errors.AccessorRegistrationWarning errors.DtypeWarning + errors.DuplicateLabelError errors.EmptyDataError errors.InvalidIndexError errors.MergeError @@ -121,4 +122,3 @@ Bug report function :toctree: api/ show_versions - diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 76cb53559f334..ccf130d03418c 100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -128,6 +128,7 @@ The following methods are available only for ``SeriesGroupBy`` objects. .. autosummary:: :toctree: api/ + SeriesGroupBy.hist SeriesGroupBy.nlargest SeriesGroupBy.nsmallest SeriesGroupBy.nunique diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst index 9d5649c37e92f..f7c5eaf242b34 100644 --- a/doc/source/reference/index.rst +++ b/doc/source/reference/index.rst @@ -30,7 +30,6 @@ public functions related to data types in pandas. series frame arrays - panel indexing offset_frequency window diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst index ba12c19763605..d3f9413dae565 100644 --- a/doc/source/reference/indexing.rst +++ b/doc/source/reference/indexing.rst @@ -345,9 +345,11 @@ Time/date components DatetimeIndex.time DatetimeIndex.timetz DatetimeIndex.dayofyear + DatetimeIndex.day_of_year DatetimeIndex.weekofyear DatetimeIndex.week DatetimeIndex.dayofweek + DatetimeIndex.day_of_week DatetimeIndex.weekday DatetimeIndex.quarter DatetimeIndex.tz @@ -461,7 +463,9 @@ Properties PeriodIndex.day PeriodIndex.dayofweek + PeriodIndex.day_of_week PeriodIndex.dayofyear + PeriodIndex.day_of_year PeriodIndex.days_in_month PeriodIndex.daysinmonth PeriodIndex.end_time diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index 1b63253cde2c5..e6271a7806706 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -33,6 +33,7 @@ Methods :toctree: api/ DateOffset.apply + DateOffset.apply_index DateOffset.copy DateOffset.isAnchored DateOffset.onOffset @@ -117,6 +118,7 @@ Methods :toctree: api/ BusinessHour.apply + BusinessHour.apply_index BusinessHour.copy BusinessHour.isAnchored BusinessHour.onOffset @@ -201,6 +203,7 @@ Methods :toctree: api/ CustomBusinessHour.apply + CustomBusinessHour.apply_index CustomBusinessHour.copy CustomBusinessHour.isAnchored CustomBusinessHour.onOffset @@ -401,6 +404,7 @@ Methods :toctree: api/ CustomBusinessMonthEnd.apply + CustomBusinessMonthEnd.apply_index CustomBusinessMonthEnd.copy CustomBusinessMonthEnd.isAnchored CustomBusinessMonthEnd.onOffset @@ -447,6 +451,7 @@ Methods :toctree: api/ CustomBusinessMonthBegin.apply + CustomBusinessMonthBegin.apply_index CustomBusinessMonthBegin.copy CustomBusinessMonthBegin.isAnchored CustomBusinessMonthBegin.onOffset @@ -586,6 +591,7 @@ Methods :toctree: api/ WeekOfMonth.apply + WeekOfMonth.apply_index WeekOfMonth.copy WeekOfMonth.isAnchored WeekOfMonth.onOffset @@ -622,6 +628,7 @@ Methods :toctree: api/ LastWeekOfMonth.apply + LastWeekOfMonth.apply_index LastWeekOfMonth.copy LastWeekOfMonth.isAnchored 
LastWeekOfMonth.onOffset @@ -938,6 +945,7 @@ Methods :toctree: api/ FY5253.apply + FY5253.apply_index FY5253.copy FY5253.get_rule_code_suffix FY5253.get_year_end @@ -977,6 +985,7 @@ Methods :toctree: api/ FY5253Quarter.apply + FY5253Quarter.apply_index FY5253Quarter.copy FY5253Quarter.get_rule_code_suffix FY5253Quarter.get_weeks @@ -1013,6 +1022,7 @@ Methods :toctree: api/ Easter.apply + Easter.apply_index Easter.copy Easter.isAnchored Easter.onOffset @@ -1053,6 +1063,7 @@ Methods Tick.is_on_offset Tick.__call__ Tick.apply + Tick.apply_index Day --- @@ -1087,6 +1098,7 @@ Methods Day.is_on_offset Day.__call__ Day.apply + Day.apply_index Hour ---- @@ -1121,6 +1133,7 @@ Methods Hour.is_on_offset Hour.__call__ Hour.apply + Hour.apply_index Minute ------ @@ -1155,6 +1168,7 @@ Methods Minute.is_on_offset Minute.__call__ Minute.apply + Minute.apply_index Second ------ @@ -1189,6 +1203,7 @@ Methods Second.is_on_offset Second.__call__ Second.apply + Second.apply_index Milli ----- @@ -1223,6 +1238,7 @@ Methods Milli.is_on_offset Milli.__call__ Milli.apply + Milli.apply_index Micro ----- @@ -1257,6 +1273,7 @@ Methods Micro.is_on_offset Micro.__call__ Micro.apply + Micro.apply_index Nano ---- @@ -1291,6 +1308,7 @@ Methods Nano.is_on_offset Nano.__call__ Nano.apply + Nano.apply_index .. _api.frequencies: diff --git a/doc/source/reference/panel.rst b/doc/source/reference/panel.rst deleted file mode 100644 index 94bfe87fe39f0..0000000000000 --- a/doc/source/reference/panel.rst +++ /dev/null @@ -1,10 +0,0 @@ -{{ header }} - -.. _api.panel: - -===== -Panel -===== -.. currentmodule:: pandas - -`Panel` was removed in 0.25.0. For prior documentation, see the `0.24 documentation `_ diff --git a/doc/source/reference/plotting.rst b/doc/source/reference/plotting.rst index 95657dfa5fde5..632b39a1fa858 100644 --- a/doc/source/reference/plotting.rst +++ b/doc/source/reference/plotting.rst @@ -7,7 +7,7 @@ Plotting ======== .. currentmodule:: pandas.plotting -The following functions are contained in the `pandas.plotting` module. +The following functions are contained in the ``pandas.plotting`` module. .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3b595ba5ab206..cc2937695e80f 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -22,10 +22,6 @@ Attributes :toctree: api/ Series.index - -.. autosummary:: - :toctree: api/ - Series.array Series.values Series.dtype @@ -39,6 +35,8 @@ Attributes Series.empty Series.dtypes Series.name + Series.flags + Series.set_flags Conversion ---------- @@ -254,7 +252,6 @@ Combining / comparing / joining / merging Series.append Series.compare - Series.replace Series.update Time Series-related @@ -278,7 +275,7 @@ Time Series-related Accessors --------- -Pandas provides dtype-specific methods under various accessors. +pandas provides dtype-specific methods under various accessors. These are separate namespaces within :class:`Series` that only apply to specific data types. @@ -321,8 +318,10 @@ Datetime properties Series.dt.week Series.dt.weekofyear Series.dt.dayofweek + Series.dt.day_of_week Series.dt.weekday Series.dt.dayofyear + Series.dt.day_of_year Series.dt.quarter Series.dt.is_month_start Series.dt.is_month_end @@ -522,10 +521,24 @@ Sparse-dtype specific methods and attributes are provided under the .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.sparse.from_coo Series.sparse.to_coo +.. 
_api.series.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`Series.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags .. _api.series.metadata: diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 24a47336b0522..e80dc1b57ff80 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -36,6 +36,7 @@ Style application Styler.where Styler.format Styler.set_precision + Styler.set_td_classes Styler.set_table_styles Styler.set_table_attributes Styler.set_caption diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index d7e6405a3732b..a255b3ae8081e 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -10,8 +10,10 @@ Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.roll Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc. ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc. -Standard moving window functions -------------------------------- +.. _api.functions_rolling: + +Rolling window functions +------------------------ .. currentmodule:: pandas.core.window.rolling .. autosummary:: @@ -32,6 +34,17 @@ Standard moving window functions Rolling.apply Rolling.aggregate Rolling.quantile + Rolling.sem + +.. _api.functions_window: + +Weighted window functions +------------------------- +.. currentmodule:: pandas.core.window.rolling + +.. autosummary:: + :toctree: api/ + Window.mean Window.sum Window.var @@ -39,8 +52,8 @@ Standard moving window functions .. _api.functions_expanding: -Standard expanding window functions ----------------------------------- +Expanding window functions +-------------------------- .. currentmodule:: pandas.core.window.expanding .. autosummary:: @@ -61,9 +74,12 @@ Standard expanding window functions Expanding.apply Expanding.aggregate Expanding.quantile + Expanding.sem -Exponentially-weighted moving window functions ----------------------------------------------- +.. _api.functions_ewm: + +Exponentially-weighted window functions +--------------------------------------- .. currentmodule:: pandas.core.window.ewm .. autosummary:: @@ -75,6 +91,8 @@ Exponentially-weighted moving window functions ExponentialMovingWindow.corr ExponentialMovingWindow.cov +.. _api.indexers_window: + Window indexer -------------- .. currentmodule:: pandas @@ -86,3 +104,4 @@ Base class for defining custom window boundaries. api.indexers.BaseIndexer api.indexers.FixedForwardWindowIndexer + api.indexers.VariableOffsetWindowIndexer diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 93c50fff40305..cf548ba5d1133 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -34,21 +34,25 @@ and labeled columns: .. ipython:: python - dates = pd.date_range('20130101', periods=6) + dates = pd.date_range("20130101", periods=6) dates - df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD")) df Creating a :class:`DataFrame` by passing a dict of objects that can be converted to series-like. ..
ipython:: python - df2 = pd.DataFrame({'A': 1., - 'B': pd.Timestamp('20130102'), - 'C': pd.Series(1, index=list(range(4)), dtype='float32'), - 'D': np.array([3] * 4, dtype='int32'), - 'E': pd.Categorical(["test", "train", "test", "train"]), - 'F': 'foo'}) + df2 = pd.DataFrame( + { + "A": 1.0, + "B": pd.Timestamp("20130102"), + "C": pd.Series(1, index=list(range(4)), dtype="float32"), + "D": np.array([3] * 4, dtype="int32"), + "E": pd.Categorical(["test", "train", "test", "train"]), + "F": "foo", + } + ) df2 The columns of the resulting :class:`DataFrame` have different @@ -152,7 +156,7 @@ Sorting by values: .. ipython:: python - df.sort_values(by='B') + df.sort_values(by="B") Selection --------- @@ -174,14 +178,14 @@ equivalent to ``df.A``: .. ipython:: python - df['A'] + df["A"] Selecting via ``[]``, which slices the rows. .. ipython:: python df[0:3] - df['20130102':'20130104'] + df["20130102":"20130104"] Selection by label ~~~~~~~~~~~~~~~~~~ @@ -198,31 +202,31 @@ Selecting on a multi-axis by label: .. ipython:: python - df.loc[:, ['A', 'B']] + df.loc[:, ["A", "B"]] Showing label slicing, both endpoints are *included*: .. ipython:: python - df.loc['20130102':'20130104', ['A', 'B']] + df.loc["20130102":"20130104", ["A", "B"]] Reduction in the dimensions of the returned object: .. ipython:: python - df.loc['20130102', ['A', 'B']] + df.loc["20130102", ["A", "B"]] For getting a scalar value: .. ipython:: python - df.loc[dates[0], 'A'] + df.loc[dates[0], "A"] For getting fast access to a scalar (equivalent to the prior method): .. ipython:: python - df.at[dates[0], 'A'] + df.at[dates[0], "A"] Selection by position ~~~~~~~~~~~~~~~~~~~~~ @@ -235,13 +239,13 @@ Select via the position of the passed integers: df.iloc[3] -By integer slices, acting similar to numpy/python: +By integer slices, acting similarly to NumPy/Python: .. ipython:: python df.iloc[3:5, 0:2] -By lists of integer position locations, similar to the numpy/python style: +By lists of integer position locations, similar to the NumPy/Python style: .. ipython:: python @@ -278,7 +282,7 @@ Using a single column's values to select data. .. ipython:: python - df[df['A'] > 0] + df[df["A"] > 0] Selecting values from a DataFrame where a boolean condition is met. @@ -291,9 +295,9 @@ Using the :func:`~Series.isin` method for filtering: .. ipython:: python df2 = df.copy() - df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three'] + df2["E"] = ["one", "one", "two", "three", "four", "three"] df2 - df2[df2['E'].isin(['two', 'four'])] + df2[df2["E"].isin(["two", "four"])] Setting ~~~~~~~ @@ -303,15 +307,15 @@ by the indexes. .. ipython:: python - s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6)) + s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6)) s1 - df['F'] = s1 + df["F"] = s1 Setting values by label: .. ipython:: python - df.at[dates[0], 'A'] = 0 + df.at[dates[0], "A"] = 0 Setting values by position: @@ -323,7 +327,7 @@ Setting by assigning with a NumPy array: .. ipython:: python - df.loc[:, 'D'] = np.array([5] * len(df)) + df.loc[:, "D"] = np.array([5] * len(df)) The result of the prior setting operations. @@ -352,15 +356,15 @@ returns a copy of the data. .. ipython:: python - df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) - df1.loc[dates[0]:dates[1], 'E'] = 1 + df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"]) + df1.loc[dates[0] : dates[1], "E"] = 1 df1 To drop any rows that have missing data. ..
ipython:: python - df1.dropna(how='any') + df1.dropna(how="any") Filling missing data. @@ -404,7 +408,7 @@ In addition, pandas automatically broadcasts along the specified dimension. s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2) s - df.sub(s, axis='index') + df.sub(s, axis="index") Apply @@ -431,16 +435,16 @@ See more at :ref:`Histogramming and Discretization `. String Methods ~~~~~~~~~~~~~~ -Series is equipped with a set of string processing methods in the `str` +Series is equipped with a set of string processing methods in the ``str`` attribute that make it easy to operate on each element of the array, as in the -code snippet below. Note that pattern-matching in `str` generally uses `regular +code snippet below. Note that pattern-matching in ``str`` generally uses `regular expressions `__ by default (and in some cases always uses them). See more at :ref:`Vectorized String Methods `. .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]) s.str.lower() Merge @@ -482,21 +486,21 @@ SQL style merges. See the :ref:`Database style joining ` section. .. ipython:: python - left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) - right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]}) + right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]}) left right - pd.merge(left, right, on='key') + pd.merge(left, right, on="key") Another example that can be given is: .. ipython:: python - left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]}) - right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]}) + left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]}) + right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]}) left right - pd.merge(left, right, on='key') + pd.merge(left, right, on="key") Grouping -------- @@ -512,12 +516,14 @@ See the :ref:`Grouping section `. .. ipython:: python - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df Grouping and then applying the :meth:`~pandas.core.groupby.GroupBy.sum` function to the resulting @@ -525,14 +531,14 @@ groups. .. ipython:: python - df.groupby('A').sum() + df.groupby("A").sum() Grouping by multiple columns forms a hierarchical index, and again we can apply the :meth:`~pandas.core.groupby.GroupBy.sum` function. .. ipython:: python - df.groupby(['A', 'B']).sum() + df.groupby(["A", "B"]).sum() Reshaping --------- @@ -545,12 +551,16 @@ Stack .. 
ipython:: python - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', - 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', - 'one', 'two', 'one', 'two']])) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"]) df2 = df[:4] df2 @@ -578,18 +588,22 @@ See the section on :ref:`Pivot Tables `. .. ipython:: python - df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3, - 'B': ['A', 'B', 'C'] * 4, - 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, - 'D': np.random.randn(12), - 'E': np.random.randn(12)}) + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 3, + "B": ["A", "B", "C"] * 4, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2, + "D": np.random.randn(12), + "E": np.random.randn(12), + } + ) df We can produce pivot tables from this data very easily: .. ipython:: python - pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) + pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) Time series @@ -602,31 +616,31 @@ financial applications. See the :ref:`Time Series section `. .. ipython:: python - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - ts.resample('5Min').sum() + ts.resample("5Min").sum() Time zone representation: .. ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D') + rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D") ts = pd.Series(np.random.randn(len(rng)), rng) ts - ts_utc = ts.tz_localize('UTC') + ts_utc = ts.tz_localize("UTC") ts_utc Converting to another time zone: .. ipython:: python - ts_utc.tz_convert('US/Eastern') + ts_utc.tz_convert("US/Eastern") Converting between time span representations: .. ipython:: python - rng = pd.date_range('1/1/2012', periods=5, freq='M') + rng = pd.date_range("1/1/2012", periods=5, freq="M") ts = pd.Series(np.random.randn(len(rng)), index=rng) ts ps = ts.to_period() @@ -640,9 +654,9 @@ the quarter end: .. ipython:: python - prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') + prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV") ts = pd.Series(np.random.randn(len(prng)), prng) - ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9 ts.head() Categoricals @@ -653,8 +667,11 @@ pandas can include categorical data in a :class:`DataFrame`. For full docs, see .. ipython:: python - df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) + df = pd.DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + + Convert the raw grades to a categorical data type. @@ -674,8 +691,9 @@ Reorder the categories and simultaneously add the missing categories (methods un .. ipython:: python - df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", - "good", "very good"]) + df["grade"] = df["grade"].cat.set_categories( + ["very bad", "bad", "medium", "good", "very good"] + ) df["grade"] Sorting is per order in the categories, not lexical order. 
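The category-order sorting that this hunk documents can be seen in a tiny sketch; the series and category labels below are illustrative only and are not part of the patch:

.. code-block:: python

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    # Sorting follows the declared category order, not lexical order:
    # "very bad" sorts first even though it is alphabetically last.
    s = pd.Series(["good", "very bad", "bad"]).astype(
        CategoricalDtype(["very bad", "bad", "medium", "good", "very good"], ordered=True)
    )
    s.sort_values()  # very bad, bad, good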
@@ -701,12 +719,12 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") .. ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -717,8 +735,10 @@ of the columns with labels: .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"] + ) + df = df.cumsum() plt.figure() @@ -736,19 +756,20 @@ CSV .. ipython:: python - df.to_csv('foo.csv') + df.to_csv("foo.csv") :ref:`Reading from a csv file. ` .. ipython:: python - pd.read_csv('foo.csv') + pd.read_csv("foo.csv") .. ipython:: python :suppress: import os - os.remove('foo.csv') + + os.remove("foo.csv") HDF5 ~~~~ @@ -759,18 +780,18 @@ Writing to a HDF5 Store. .. ipython:: python - df.to_hdf('foo.h5', 'df') + df.to_hdf("foo.h5", "df") Reading from a HDF5 Store. .. ipython:: python - pd.read_hdf('foo.h5', 'df') + pd.read_hdf("foo.h5", "df") .. ipython:: python :suppress: - os.remove('foo.h5') + os.remove("foo.h5") Excel ~~~~~ @@ -781,18 +802,18 @@ Writing to an excel file. .. ipython:: python - df.to_excel('foo.xlsx', sheet_name='Sheet1') + df.to_excel("foo.xlsx", sheet_name="Sheet1") Reading from an excel file. .. ipython:: python - pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel("foo.xlsx", "Sheet1", index_col=None, na_values=["NA"]) .. ipython:: python :suppress: - os.remove('foo.xlsx') + os.remove("foo.xlsx") Gotchas ------- diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index a0331dd632583..2cd48ac7adb0e 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -62,12 +62,14 @@ demonstrate different ways to initialize MultiIndexes. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) tuples - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) index s = pd.Series(np.random.randn(8), index=index) @@ -78,8 +80,8 @@ to use the :meth:`MultiIndex.from_product` method: .. ipython:: python - iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - pd.MultiIndex.from_product(iterables, names=['first', 'second']) + iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]] + pd.MultiIndex.from_product(iterables, names=["first", "second"]) You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using the method :meth:`MultiIndex.from_frame`. This is a complementary method to @@ -89,9 +91,10 @@ the method :meth:`MultiIndex.from_frame`. This is a complementary method to .. 
ipython:: python - df = pd.DataFrame([['bar', 'one'], ['bar', 'two'], - ['foo', 'one'], ['foo', 'two']], - columns=['first', 'second']) + df = pd.DataFrame( + [["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]], + columns=["first", "second"], + ) pd.MultiIndex.from_frame(df) As a convenience, you can pass a list of arrays directly into ``Series`` or @@ -99,8 +102,10 @@ As a convenience, you can pass a list of arrays directly into ``Series`` or .. ipython:: python - arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), - np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] + arrays = [ + np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]), + np.array(["one", "two", "one", "two", "one", "two", "one", "two"]), + ] s = pd.Series(np.random.randn(8), index=arrays) s df = pd.DataFrame(np.random.randn(8, 4), index=arrays) @@ -119,7 +124,7 @@ of the index is up to you: .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index) + df = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index) df pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) @@ -129,7 +134,7 @@ bit easier on the eyes. Note that how the index is displayed can be controlled u .. ipython:: python - with pd.option_context('display.multi_sparse', False): + with pd.option_context("display.multi_sparse", False): df It's worth keeping in mind that there's nothing preventing you from using @@ -157,7 +162,7 @@ location at a particular level: .. ipython:: python index.get_level_values(0) - index.get_level_values('second') + index.get_level_values("second") Basic indexing on axis with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -169,10 +174,10 @@ completely analogous way to selecting a column in a regular DataFrame: .. ipython:: python - df['bar'] - df['bar', 'one'] - df['bar']['one'] - s['qux'] + df["bar"] + df["bar", "one"] + df["bar"]["one"] + s["qux"] See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. @@ -190,7 +195,7 @@ For example:   df.columns.levels # original MultiIndex - df[['foo','qux']].columns.levels # sliced + df[["foo","qux"]].columns.levels # sliced This is done to avoid a recomputation of the levels in order to make slicing highly performant. If you want to see only the used levels, you can use the @@ -198,17 +203,17 @@ highly performant. If you want to see only the used levels, you can use the .. ipython:: python - df[['foo', 'qux']].columns.to_numpy() + df[["foo", "qux"]].columns.to_numpy() # for a specific level - df[['foo', 'qux']].columns.get_level_values(0) + df[["foo", "qux"]].columns.get_level_values(0) To reconstruct the ``MultiIndex`` with only the used levels, the :meth:`~MultiIndex.remove_unused_levels` method may be used. .. ipython:: python - new_mi = df[['foo', 'qux']].columns.remove_unused_levels() + new_mi = df[["foo", "qux"]].columns.remove_unused_levels() new_mi.levels Data alignment and using ``reindex`` @@ -229,7 +234,7 @@ called with another ``MultiIndex``, or even a list or array of tuples: .. ipython:: python s.reindex(index[:3]) - s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) + s.reindex([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")]) .. _advanced.advanced_hierarchical: @@ -244,7 +249,7 @@ keys take the form of tuples. 
For example, the following works as you would expe df = df.T df - df.loc[('bar', 'two')] + df.loc[("bar", "two")] Note that ``df.loc['bar', 'two']`` would also work in this example, but this shorthand notation can lead to ambiguity in general. @@ -254,7 +259,7 @@ like this: .. ipython:: python - df.loc[('bar', 'two'), 'A'] + df.loc[("bar", "two"), "A"] You don't have to specify all levels of the ``MultiIndex`` by passing only the first elements of the tuple. For example, you can use "partial" indexing to @@ -262,7 +267,7 @@ get all elements with ``bar`` in the first level as follows: .. ipython:: python - df.loc['bar'] + df.loc["bar"] This is a shortcut for the slightly more verbose notation ``df.loc[('bar',),]`` (equivalent to ``df.loc['bar',]`` in this example). @@ -271,20 +276,20 @@ to ``df.loc['bar',]`` in this example). .. ipython:: python - df.loc['baz':'foo'] + df.loc["baz":"foo"] You can slice with a 'range' of values, by providing a slice of tuples. .. ipython:: python - df.loc[('baz', 'two'):('qux', 'one')] - df.loc[('baz', 'two'):'foo'] + df.loc[("baz", "two"):("qux", "one")] + df.loc[("baz", "two"):"foo"] Passing a list of labels or tuples works similar to reindexing: .. ipython:: python - df.loc[[('bar', 'two'), ('qux', 'one')]] + df.loc[[("bar", "two"), ("qux", "one")]] .. note:: @@ -298,8 +303,10 @@ whereas a tuple of lists refer to several values within a level: .. ipython:: python - s = pd.Series([1, 2, 3, 4, 5, 6], - index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]])) + s = pd.Series( + [1, 2, 3, 4, 5, 6], + index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]), + ) s.loc[[("A", "c"), ("B", "d")]] # list of tuples s.loc[(["A", "B"], ["c", "d"])] # tuple of lists @@ -329,37 +336,44 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1', 'A3'), ...), :] # noqa: E999 + df.loc[(slice("A1", "A3"), ...), :] # noqa: E999   You should **not** do this:   .. code-block:: python - df.loc[(slice('A1', 'A3'), ...)] # noqa: E999 + df.loc[(slice("A1", "A3"), ...)] # noqa: E999 .. ipython:: python def mklbl(prefix, n): return ["%s%s" % (prefix, i) for i in range(n)] - miindex = pd.MultiIndex.from_product([mklbl('A', 4), - mklbl('B', 2), - mklbl('C', 4), - mklbl('D', 2)]) - micolumns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - dfmi = pd.DataFrame(np.arange(len(miindex) * len(micolumns)) - .reshape((len(miindex), len(micolumns))), - index=miindex, - columns=micolumns).sort_index().sort_index(axis=1) + + miindex = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + micolumns = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + dfmi = ( + pd.DataFrame( + np.arange(len(miindex) * len(micolumns)).reshape( + (len(miindex), len(micolumns)) + ), + index=miindex, + columns=micolumns, + ) + .sort_index() + .sort_index(axis=1) + ) dfmi Basic MultiIndex slicing using slices, lists, and labels. .. ipython:: python - dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] You can use :class:`pandas.IndexSlice` to facilitate a more natural syntax @@ -368,36 +382,36 @@ using ``:``, rather than using ``slice(None)``. .. 
ipython:: python idx = pd.IndexSlice - dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] + dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] It is possible to perform quite complicated selections using this method on multiple axes at the same time. .. ipython:: python - dfmi.loc['A1', (slice(None), 'foo')] - dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] + dfmi.loc["A1", (slice(None), "foo")] + dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] Using a boolean indexer you can provide selection related to the *values*. .. ipython:: python - mask = dfmi[('a', 'foo')] > 200 - dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']] + mask = dfmi[("a", "foo")] > 200 + dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]] You can also specify the ``axis`` argument to ``.loc`` to interpret the passed slicers on a single axis. .. ipython:: python - dfmi.loc(axis=0)[:, :, ['C1', 'C3']] + dfmi.loc(axis=0)[:, :, ["C1", "C3"]] Furthermore, you can *set* the values using the following methods. .. ipython:: python df2 = dfmi.copy() - df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + df2.loc(axis=0)[:, :, ["C1", "C3"]] = -10 df2 You can use a right-hand-side of an alignable object as well. @@ -405,7 +419,7 @@ You can use a right-hand-side of an alignable object as well. .. ipython:: python df2 = dfmi.copy() - df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000 + df2.loc[idx[:, :, ["C1", "C3"]], :] = df2 * 1000 df2 .. _advanced.xs: @@ -419,12 +433,12 @@ selecting data at a particular level of a ``MultiIndex`` easier. .. ipython:: python df - df.xs('one', level='second') + df.xs("one", level="second") .. ipython:: python # using the slicers - df.loc[(slice(None), 'one'), :] + df.loc[(slice(None), "one"), :] You can also select on the columns with ``xs``, by providing the axis argument. @@ -432,36 +446,36 @@ providing the axis argument. .. ipython:: python df = df.T - df.xs('one', level='second', axis=1) + df.xs("one", level="second", axis=1) .. ipython:: python # using the slicers - df.loc[:, (slice(None), 'one')] + df.loc[:, (slice(None), "one")] ``xs`` also allows selection with multiple keys. .. ipython:: python - df.xs(('one', 'bar'), level=('second', 'first'), axis=1) + df.xs(("one", "bar"), level=("second", "first"), axis=1) .. ipython:: python # using the slicers - df.loc[:, ('bar', 'one')] + df.loc[:, ("bar", "one")] You can pass ``drop_level=False`` to ``xs`` to retain the level that was selected. .. ipython:: python - df.xs('one', level='second', axis=1, drop_level=False) + df.xs("one", level="second", axis=1, drop_level=False) Compare the above with the result using ``drop_level=True`` (the default value). .. ipython:: python - df.xs('one', level='second', axis=1, drop_level=True) + df.xs("one", level="second", axis=1, drop_level=True) .. ipython:: python :suppress: @@ -479,8 +493,9 @@ values across a level. For instance: .. ipython:: python - midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']], - codes=[[1, 1, 0, 0], [1, 0, 1, 0]]) + midx = pd.MultiIndex( + levels=[["zero", "one"], ["x", "y"]], codes=[[1, 1, 0, 0], [1, 0, 1, 0]] + ) df = pd.DataFrame(np.random.randn(4, 2), index=midx) df df2 = df.mean(level=0) @@ -543,7 +558,7 @@ used to move the values from the ``MultiIndex`` to a column. .. 
ipython:: python - df.rename_axis(index=['abc', 'def']) + df.rename_axis(index=["abc", "def"]) Note that the columns of a ``DataFrame`` are an index, so that using ``rename_axis`` with the ``columns`` argument will change the name of that @@ -561,7 +576,7 @@ When working with an ``Index`` object directly, rather than via a ``DataFrame``, .. ipython:: python - mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi = pd.MultiIndex.from_product([[1, 2], ["a", "b"]], names=["x", "y"]) mi.names mi2 = mi.rename("new name", level=0) @@ -586,6 +601,7 @@ they need to be sorted. As with any index, you can use :meth:`~DataFrame.sort_in .. ipython:: python import random + random.shuffle(tuples) s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s @@ -600,9 +616,9 @@ are named. .. ipython:: python - s.index.set_names(['L1', 'L2'], inplace=True) - s.sort_index(level='L1') - s.sort_index(level='L2') + s.index.set_names(["L1", "L2"], inplace=True) + s.sort_index(level="L1") + s.sort_index(level="L2") On higher dimensional objects, you can sort any of the other axes by level if they have a ``MultiIndex``: @@ -617,10 +633,10 @@ return a copy of the data rather than a view: .. ipython:: python - dfm = pd.DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) - dfm = dfm.set_index(['jim', 'joe']) + dfm = pd.DataFrame( + {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} + ) + dfm = dfm.set_index(["jim", "joe"]) dfm .. code-block:: ipython @@ -661,7 +677,7 @@ And now selection works as expected. .. ipython:: python - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(0, "y"):(1, "z")] Take methods ------------ @@ -754,18 +770,18 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python from pandas.api.types import CategoricalDtype - df = pd.DataFrame({'A': np.arange(6), - 'B': list('aabbca')}) - df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) + + df = pd.DataFrame({"A": np.arange(6), "B": list("aabbca")}) + df["B"] = df["B"].astype(CategoricalDtype(list("cab"))) df df.dtypes - df['B'].cat.categories + df["B"].cat.categories Setting the index will create a ``CategoricalIndex``. .. ipython:: python - df2 = df.set_index('B') + df2 = df.set_index("B") df2.index Indexing with ``__getitem__/.iloc/.loc`` works similarly to an ``Index`` with duplicates. @@ -773,13 +789,13 @@ The indexers **must** be in the category or the operation will raise a ``KeyErro .. ipython:: python - df2.loc['a'] + df2.loc["a"] The ``CategoricalIndex`` is **preserved** after indexing: .. ipython:: python - df2.loc['a'].index + df2.loc["a"].index Sorting the index will sort by the order of the categories (recall that we created the index with ``CategoricalDtype(list('cab'))``, so the sorted @@ -804,17 +820,18 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({'A': np.arange(3), - 'B': pd.Series(list('abc')).astype('category')}) - df3 = df3.set_index('B') + df3 = pd.DataFrame( + {"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")} + ) + df3 = df3.set_index("B") df3 .. 
ipython:: python - df3.reindex(['a', 'e']) - df3.reindex(['a', 'e']).index - df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) - df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index + df3.reindex(["a", "e"]) + df3.reindex(["a", "e"]).index + df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))) + df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))).index .. warning:: @@ -823,16 +840,14 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df4 = pd.DataFrame({'A': np.arange(2), - 'B': list('ba')}) - df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) - df4 = df4.set_index('B') + df4 = pd.DataFrame({"A": np.arange(2), "B": list("ba")}) + df4["B"] = df4["B"].astype(CategoricalDtype(list("ab"))) + df4 = df4.set_index("B") df4.index - df5 = pd.DataFrame({'A': np.arange(2), - 'B': list('bc')}) - df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) - df5 = df5.set_index('B') + df5 = pd.DataFrame({"A": np.arange(2), "B": list("bc")}) + df5["B"] = df5["B"].astype(CategoricalDtype(list("bc"))) + df5 = df5.set_index("B") df5.index .. code-block:: ipython @@ -916,12 +931,18 @@ example, be millisecond offsets. .. ipython:: python - dfir = pd.concat([pd.DataFrame(np.random.randn(5, 2), - index=np.arange(5) * 250.0, - columns=list('AB')), - pd.DataFrame(np.random.randn(6, 2), - index=np.arange(4, 10) * 250.1, - columns=list('AB'))]) + dfir = pd.concat( + [ + pd.DataFrame( + np.random.randn(5, 2), index=np.arange(5) * 250.0, columns=list("AB") + ), + pd.DataFrame( + np.random.randn(6, 2), + index=np.arange(4, 10) * 250.1, + columns=list("AB"), + ), + ] + ) dfir Selection operations then will always work on a value basis, for all selection operators. @@ -929,7 +950,7 @@ Selection operations then will always work on a value basis, for all selection o .. ipython:: python dfir[0:1000.4] - dfir.loc[0:1001, 'A'] + dfir.loc[0:1001, "A"] dfir.loc[1000.4] You could retrieve the first 1 second (1000 ms) of data as such: @@ -963,8 +984,9 @@ An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3, 4]}, - index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])) + df = pd.DataFrame( + {"A": [1, 2, 3, 4]}, index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]) + ) df Label based indexing via ``.loc`` along the edges of an interval works as you would expect, @@ -1041,9 +1063,9 @@ datetime-like intervals: pd.interval_range(start=0, end=5) - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4) + pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4) - pd.interval_range(end=pd.Timedelta('3 days'), periods=3) + pd.interval_range(end=pd.Timedelta("3 days"), periods=3) The ``freq`` parameter can used to specify non-default frequencies, and can utilize a variety of :ref:`frequency aliases ` with datetime-like intervals: @@ -1052,20 +1074,18 @@ of :ref:`frequency aliases ` with datetime-like inter pd.interval_range(start=0, periods=5, freq=1.5) - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W') + pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W") - pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H') + pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H") Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals are closed on. Intervals are closed on the right side by default. .. 
ipython:: python - pd.interval_range(start=0, end=4, closed='both') - - pd.interval_range(start=0, end=4, closed='neither') + pd.interval_range(start=0, end=4, closed="both") -.. versionadded:: 0.23.0 + pd.interval_range(start=0, end=4, closed="neither") Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements @@ -1075,8 +1095,7 @@ in the resulting ``IntervalIndex``: pd.interval_range(start=0, end=6, periods=4) - pd.interval_range(pd.Timestamp('2018-01-01'), - pd.Timestamp('2018-02-28'), periods=3) + pd.interval_range(pd.Timestamp("2018-01-01"), pd.Timestamp("2018-02-28"), periods=3) Miscellaneous indexing FAQ -------------------------- @@ -1114,7 +1133,7 @@ normal Python ``list``. Monotonicity of an index can be tested with the :meth:`~ .. ipython:: python - df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=['data'], data=list(range(5))) + df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=["data"], data=list(range(5))) df.index.is_monotonic_increasing # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4: @@ -1128,8 +1147,7 @@ On the other hand, if the index is not monotonic, then both slice bounds must be .. ipython:: python - df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], - columns=['data'], data=list(range(6))) + df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], columns=["data"], data=list(range(6))) df.index.is_monotonic_increasing # OK because 2 and 4 are in the index @@ -1151,7 +1169,7 @@ the :meth:`~Index.is_unique` attribute. .. ipython:: python - weakly_monotonic = pd.Index(['a', 'b', 'c', 'c']) + weakly_monotonic = pd.Index(["a", "b", "c", "c"]) weakly_monotonic weakly_monotonic.is_monotonic_increasing weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique @@ -1169,7 +1187,7 @@ consider the following ``Series``: .. ipython:: python - s = pd.Series(np.random.randn(6), index=list('abcdef')) + s = pd.Series(np.random.randn(6), index=list("abcdef")) s Suppose we wished to slice from ``c`` to ``e``, using integers this would be @@ -1192,7 +1210,7 @@ slicing include both endpoints: .. ipython:: python - s.loc['c':'e'] + s.loc["c":"e"] This is most definitely a "practicality beats purity" sort of thing, but it is something to watch out for if you expect label-based slicing to behave exactly diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 87359042928eb..ffecaa222e1f9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -12,10 +12,9 @@ the :ref:`10 minutes to pandas <10min>` section: .. ipython:: python - index = pd.date_range('1/1/2000', periods=8) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = pd.DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.date_range("1/1/2000", periods=8) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) .. _basics.head_tail: @@ -52,7 +51,7 @@ Note, **these attributes can be safely assigned to**! df.columns = [x.lower() for x in df.columns] df -Pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be +pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be thought of as containers for arrays, which hold the actual data and do the actual computation. For many types, the underlying array is a :class:`numpy.ndarray`. 
However, pandas and 3rd party libraries may *extend* @@ -97,7 +96,7 @@ Timezones may be preserved with ``dtype=object`` .. ipython:: python - ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + ser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) ser.to_numpy(dtype=object) Or thrown away with ``dtype='datetime64[ns]'`` @@ -174,8 +173,8 @@ These are both enabled to be used by default, you can control this by setting th .. code-block:: python - pd.set_option('compute.use_bottleneck', False) - pd.set_option('compute.use_numexpr', False) + pd.set_option("compute.use_bottleneck", False) + pd.set_option("compute.use_numexpr", False) .. _basics.binop: @@ -204,18 +203,21 @@ either match on the *index* or *columns* via the **axis** keyword: .. ipython:: python - df = pd.DataFrame({ - 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame( + { + "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]), + "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]), + "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]), + } + ) df row = df.iloc[1] - column = df['two'] + column = df["two"] - df.sub(row, axis='columns') + df.sub(row, axis="columns") df.sub(row, axis=1) - df.sub(column, axis='index') + df.sub(column, axis="index") df.sub(column, axis=0) .. ipython:: python @@ -228,10 +230,10 @@ Furthermore you can align a level of a MultiIndexed DataFrame with a Series. .. ipython:: python dfmi = df.copy() - dfmi.index = pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'), - (1, 'c'), (2, 'a')], - names=['first', 'second']) - dfmi.sub(column, axis=0, level='second') + dfmi.index = pd.MultiIndex.from_tuples( + [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"] + ) + dfmi.sub(column, axis=0, level="second") Series and Index also support the :func:`divmod` builtin. This function takes the floor division and modulo operation at the same time returning a two-tuple @@ -273,7 +275,7 @@ using ``fillna`` if you wish). :suppress: df2 = df.copy() - df2['three']['a'] = 1. + df2["three"]["a"] = 1.0 .. ipython:: python @@ -325,7 +327,7 @@ You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` prope .. ipython:: python df.empty - pd.DataFrame(columns=list('ABC')).empty + pd.DataFrame(columns=list("ABC")).empty To evaluate single-element pandas objects in a boolean context, use the method :meth:`~DataFrame.bool`: @@ -394,8 +396,8 @@ equality to be True: .. ipython:: python - df1 = pd.DataFrame({'col': ['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0]) + df1 = pd.DataFrame({"col": ["foo", 0, np.nan]}) + df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0]) df1.equals(df2) df1.equals(df2.sort_index()) @@ -407,16 +409,16 @@ data structure with a scalar value: .. ipython:: python - pd.Series(['foo', 'bar', 'baz']) == 'foo' - pd.Index(['foo', 'bar', 'baz']) == 'foo' + pd.Series(["foo", "bar", "baz"]) == "foo" + pd.Index(["foo", "bar", "baz"]) == "foo" -Pandas also handles element-wise comparisons between different array-like +pandas also handles element-wise comparisons between different array-like objects of the same length: .. 
ipython:: python - pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux']) - pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux']) + pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"]) + pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"]) Trying to compare ``Index`` or ``Series`` objects of different lengths will raise a ValueError: @@ -458,10 +460,15 @@ which we illustrate: .. ipython:: python - df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) - df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 = pd.DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) + df2 = pd.DataFrame( + { + "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], + "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0], + } + ) df1 df2 df1.combine_first(df2) @@ -480,6 +487,8 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: def combiner(x, y): return np.where(pd.isna(x), y, x) + + df1.combine(df2, combiner) .. _basics.stats: @@ -529,8 +538,8 @@ standard deviation of 1), very concisely: Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` preserve the location of ``NaN`` values. This is somewhat different from -:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`. -For more details please see :ref:`this note `. +:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling` since ``NaN`` behavior +is furthermore dictated by a ``min_periods`` parameter. .. ipython:: python @@ -570,8 +579,8 @@ will exclude NAs on Series input by default: .. ipython:: python - np.mean(df['one']) - np.mean(df['one'].to_numpy()) + np.mean(df["one"]) + np.mean(df["one"].to_numpy()) :meth:`Series.nunique` will return the number of unique non-NA values in a Series: @@ -597,8 +606,7 @@ course): series = pd.Series(np.random.randn(1000)) series[::2] = np.nan series.describe() - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan frame.describe() @@ -606,7 +614,7 @@ You can select specific percentiles to include in the output: .. ipython:: python - series.describe(percentiles=[.05, .25, .75, .95]) + series.describe(percentiles=[0.05, 0.25, 0.75, 0.95]) By default, the median is always included. @@ -615,7 +623,7 @@ summary of the number of unique values and most frequently occurring values: .. ipython:: python - s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"]) s.describe() Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will @@ -624,7 +632,7 @@ categorical columns: .. ipython:: python - frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)}) frame.describe() This behavior can be controlled by providing a list of types as ``include``/``exclude`` @@ -632,9 +640,9 @@ arguments. The special value ``all`` can also be used: .. ipython:: python - frame.describe(include=['object']) - frame.describe(include=['number']) - frame.describe(include='all') + frame.describe(include=["object"]) + frame.describe(include=["number"]) + frame.describe(include="all") That feature relies on :ref:`select_dtypes `. Refer to there for details about accepted inputs. 
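Because ``include``/``exclude`` accept the same inputs as ``select_dtypes``, NumPy dtype objects work alongside the ``'object'``/``'number'``/``'all'`` strings shown above. A small sketch (the frame mirrors the illustrative one in this hunk):

.. code-block:: python

    import numpy as np
    import pandas as pd

    frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)})
    frame.describe(include=[np.number])  # summarizes only the numeric column "b"
    frame.describe(exclude=[np.number])  # summarizes only the object column "a"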
@@ -654,7 +662,7 @@ corresponding values: s1 s1.idxmin(), s1.idxmax() - df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C']) + df1 = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) df1 df1.idxmin(axis=0) df1.idxmax(axis=1) @@ -665,9 +673,9 @@ matching index: .. ipython:: python - df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=["A"], index=list("edcba")) df3 - df3['A'].idxmin() + df3["A"].idxmin() .. note:: @@ -706,8 +714,12 @@ Similarly, you can get the most frequently occurring value(s), i.e. the mode, of s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() - df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) + df5 = pd.DataFrame( + { + "A": np.random.randint(0, 7, size=50), + "B": np.random.randint(-10, 15, size=50), + } + ) df5.mode() @@ -732,7 +744,7 @@ normally distributed data into equal-size quartiles like so: .. ipython:: python arr = np.random.randn(30) - factor = pd.qcut(arr, [0, .25, .5, .75, 1]) + factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1]) factor pd.value_counts(factor) @@ -775,18 +787,20 @@ First some setup: """ Chicago, IL -> Chicago for city_name column """ - df['city_name'] = df['city_and_code'].str.split(",").str.get(0) + df["city_name"] = df["city_and_code"].str.split(",").str.get(0) return df + def add_country_name(df, country_name=None): """ Chicago -> Chicago-US for city_name column """ - col = 'city_name' - df['city_and_country'] = df[col] + country_name + col = "city_name" + df["city_and_country"] = df[col] + country_name return df - df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']}) + + df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]}) ``extract_city_name`` and ``add_country_name`` are functions taking and returning ``DataFrames``. @@ -795,16 +809,15 @@ Now compare the following: .. ipython:: python - add_country_name(extract_city_name(df_p), country_name='US') + add_country_name(extract_city_name(df_p), country_name="US") Is equivalent to: .. ipython:: python - (df_p.pipe(extract_city_name) - .pipe(add_country_name, country_name="US")) + df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US") -Pandas encourages the second style, which is known as method chaining. +pandas encourages the second style, which is known as method chaining. ``pipe`` makes it easy to use your own or another library's functions in method chains, alongside pandas' methods. @@ -820,18 +833,19 @@ For example, we can fit a regression using statsmodels. Their API expects a form import statsmodels.formula.api as sm - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") - (bb.query('h > 0') - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') - .fit() - .summary() - ) + ( + bb.query("h > 0") + .assign(ln_h=lambda df: np.log(df.h)) + .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .fit() + .summary() + ) The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which have introduced the popular ``(%>%)`` (read pipe) operator for R_. -The implementation of ``pipe`` here is quite clean and feels right at home in python. +The implementation of ``pipe`` here is quite clean and feels right at home in Python. We encourage you to view the source code of :meth:`~DataFrame.pipe`. .. 
_dplyr: https://github.com/hadley/dplyr @@ -858,8 +872,8 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name. .. ipython:: python - df.apply('mean') - df.apply('mean', axis=1) + df.apply("mean") + df.apply("mean", axis=1) The return type of the function passed to :meth:`~DataFrame.apply` affects the type of the final output from ``DataFrame.apply`` for the default behaviour: @@ -878,8 +892,11 @@ maximum value for each column occurred: .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=1000)) + tsdf = pd.DataFrame( + np.random.randn(1000, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=1000), + ) tsdf.apply(lambda x: x.idxmax()) You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply` @@ -902,8 +919,11 @@ Series operation on each column or row: .. ipython:: python :suppress: - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan .. ipython:: python @@ -925,7 +945,7 @@ Aggregation API The aggregation API allows one to express possibly multiple aggregation operations in a single concise way. This API is similar across pandas objects, see :ref:`groupby API `, the -:ref:`window functions API `, and the :ref:`resample API `. +:ref:`window API `, and the :ref:`resample API `. The entry point for aggregation is :meth:`DataFrame.aggregate`, or the alias :meth:`DataFrame.agg`. @@ -933,8 +953,11 @@ We will use a similar starting frame from above: .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan tsdf @@ -946,7 +969,7 @@ output: tsdf.agg(np.sum) - tsdf.agg('sum') + tsdf.agg("sum") # these are equivalent to a ``.sum()`` because we are aggregating # on a single function @@ -956,7 +979,7 @@ Single aggregations on a ``Series`` this will return a scalar value: .. ipython:: python - tsdf['A'].agg('sum') + tsdf["A"].agg("sum") Aggregating with multiple functions @@ -968,25 +991,25 @@ These are naturally named from the aggregation function. .. ipython:: python - tsdf.agg(['sum']) + tsdf.agg(["sum"]) Multiple functions yield multiple rows: .. ipython:: python - tsdf.agg(['sum', 'mean']) + tsdf.agg(["sum", "mean"]) On a ``Series``, multiple functions return a ``Series``, indexed by the function names: .. ipython:: python - tsdf['A'].agg(['sum', 'mean']) + tsdf["A"].agg(["sum", "mean"]) Passing a ``lambda`` function will yield a ```` named row: .. ipython:: python - tsdf['A'].agg(['sum', lambda x: x.mean()]) + tsdf["A"].agg(["sum", lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -995,7 +1018,8 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf['A'].agg(['sum', mymean]) + + tsdf["A"].agg(["sum", mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1006,7 +1030,7 @@ are not in any particular order, you can use an ``OrderedDict`` instead to guara .. ipython:: python - tsdf.agg({'A': 'mean', 'B': 'sum'}) + tsdf.agg({"A": "mean", "B": "sum"}) Passing a list-like will generate a ``DataFrame`` output. 
You will get a matrix-like output of all of the aggregators. The output will consist of all unique functions. Those that are @@ -1014,7 +1038,7 @@ not noted for a particular column will be ``NaN``: .. ipython:: python - tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'}) + tsdf.agg({"A": ["mean", "min"], "B": "sum"}) .. _basics.aggregation.mixed_string: @@ -1026,15 +1050,19 @@ aggregations. This is similar to how ``.groupby.agg`` works. .. ipython:: python - mdf = pd.DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) + mdf = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) mdf.dtypes .. ipython:: python - mdf.agg(['min', 'sum']) + mdf.agg(["min", "sum"]) .. _basics.aggregation.custom_describe: @@ -1049,11 +1077,11 @@ to the built in :ref:`describe function `. from functools import partial q_25 = partial(pd.Series.quantile, q=0.25) - q_25.__name__ = '25%' + q_25.__name__ = "25%" q_75 = partial(pd.Series.quantile, q=0.75) - q_75.__name__ = '75%' + q_75.__name__ = "75%" - tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max']) + tsdf.agg(["count", "mean", "std", "min", q_25, "median", q_75, "max"]) .. _basics.transform: @@ -1068,8 +1096,11 @@ We create a frame similar to the one used in the above sections. .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan tsdf @@ -1080,7 +1111,7 @@ function name or a user defined function. :okwarning: tsdf.transform(np.abs) - tsdf.transform('abs') + tsdf.transform("abs") tsdf.transform(lambda x: x.abs()) Here :meth:`~DataFrame.transform` received a single function; this is equivalent to a `ufunc @@ -1094,7 +1125,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf['A'].transform(np.abs) + tsdf["A"].transform(np.abs) Transform with multiple functions @@ -1113,7 +1144,7 @@ resulting column names will be the transforming functions. .. ipython:: python - tsdf['A'].transform([np.abs, lambda x: x + 1]) + tsdf["A"].transform([np.abs, lambda x: x + 1]) Transforming with a dict @@ -1124,7 +1155,7 @@ Passing a dict of functions will allow selective transforming per column. .. ipython:: python - tsdf.transform({'A': np.abs, 'B': lambda x: x + 1}) + tsdf.transform({"A": np.abs, "B": lambda x: x + 1}) Passing a dict of lists will generate a MultiIndexed DataFrame with these selective transforms. @@ -1132,7 +1163,7 @@ selective transforms. .. ipython:: python :okwarning: - tsdf.transform({'A': np.abs, 'B': [lambda x: x + 1, 'sqrt']}) + tsdf.transform({"A": np.abs, "B": [lambda x: x + 1, "sqrt"]}) .. _basics.elementwise: @@ -1153,10 +1184,12 @@ a single value and returning a single value. For example: df4 + def f(x): return len(str(x)) - df4['one'].map(f) + + df4["one"].map(f) df4.applymap(f) :meth:`Series.map` has an additional feature; it can be used to easily @@ -1165,9 +1198,10 @@ to :ref:`merging/joining functionality `: .. 
ipython:: python - s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = pd.Series({'six': 6., 'seven': 7.}) + s = pd.Series( + ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"] + ) + t = pd.Series({"six": 6.0, "seven": 7.0}) s s.map(t) @@ -1192,9 +1226,9 @@ Here is a simple example: .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s - s.reindex(['e', 'b', 'f', 'd']) + s.reindex(["e", "b", "f", "d"]) Here, the ``f`` label was not contained in the Series and hence appears as ``NaN`` in the result. @@ -1204,13 +1238,13 @@ With a DataFrame, you can simultaneously reindex the index and columns: .. ipython:: python df - df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) + df.reindex(index=["c", "f", "b"], columns=["three", "two", "one"]) You may also use ``reindex`` with an ``axis`` keyword: .. ipython:: python - df.reindex(['c', 'f', 'b'], axis='index') + df.reindex(["c", "f", "b"], axis="index") Note that the ``Index`` objects containing the actual axis labels can be **shared** between objects. So if we have a Series and a DataFrame, the @@ -1230,8 +1264,8 @@ where you specify a single ``labels`` argument and the ``axis`` it applies to. .. ipython:: python - df.reindex(['c', 'f', 'b'], axis='index') - df.reindex(['three', 'two', 'one'], axis='columns') + df.reindex(["c", "f", "b"], axis="index") + df.reindex(["three", "two", "one"], axis="columns") .. seealso:: @@ -1261,7 +1295,7 @@ available to make this simpler: .. ipython:: python :suppress: - df2 = df.reindex(['a', 'b', 'c'], columns=['one', 'two']) + df2 = df.reindex(["a", "b", "c"], columns=["one", "two"]) df3 = df2 - df2.mean() @@ -1288,12 +1322,12 @@ It returns a tuple with both of the reindexed Series: .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s1 = s[:4] s2 = s[1:] s1.align(s2) - s1.align(s2, join='inner') - s1.align(s2, join='left') + s1.align(s2, join="inner") + s1.align(s2, join="left") .. _basics.df_join: @@ -1302,13 +1336,13 @@ columns by default: .. ipython:: python - df.align(df2, join='inner') + df.align(df2, join="inner") You can also pass an ``axis`` option to only align on the specified axis: .. ipython:: python - df.align(df2, join='inner', axis=0) + df.align(df2, join="inner", axis=0) .. _basics.align.frame.series: @@ -1339,16 +1373,16 @@ We illustrate these fill methods on a simple Series: .. ipython:: python - rng = pd.date_range('1/3/2000', periods=8) + rng = pd.date_range("1/3/2000", periods=8) ts = pd.Series(np.random.randn(8), index=rng) ts2 = ts[[0, 3, 6]] ts ts2 ts2.reindex(ts.index) - ts2.reindex(ts.index, method='ffill') - ts2.reindex(ts.index, method='bfill') - ts2.reindex(ts.index, method='nearest') + ts2.reindex(ts.index, method="ffill") + ts2.reindex(ts.index, method="bfill") + ts2.reindex(ts.index, method="nearest") These methods require that the indexes are **ordered** increasing or decreasing. @@ -1359,7 +1393,7 @@ Note that the same result could have been achieved using .. ipython:: python - ts2.reindex(ts.index).fillna(method='ffill') + ts2.reindex(ts.index).fillna(method="ffill") :meth:`~Series.reindex` will raise a ValueError if the index is not monotonically increasing or decreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate` @@ -1376,14 +1410,14 @@ matches: .. 
ipython:: python - ts2.reindex(ts.index, method='ffill', limit=1) + ts2.reindex(ts.index, method="ffill", limit=1) In contrast, tolerance specifies the maximum distance between the index and indexer values: .. ipython:: python - ts2.reindex(ts.index, method='ffill', tolerance='1 day') + ts2.reindex(ts.index, method="ffill", tolerance="1 day") Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will be coerced into a ``Timedelta`` if possible. @@ -1400,14 +1434,14 @@ It removes a set of labels from an axis: .. ipython:: python df - df.drop(['a', 'd'], axis=0) - df.drop(['one'], axis=1) + df.drop(["a", "d"], axis=0) + df.drop(["one"], axis=1) Note that the following also works, but is a bit less obvious / clean: .. ipython:: python - df.reindex(df.index.difference(['a', 'd'])) + df.reindex(df.index.difference(["a", "d"])) .. _basics.rename: @@ -1428,8 +1462,10 @@ Series can also be used: .. ipython:: python - df.rename(columns={'one': 'foo', 'two': 'bar'}, - index={'a': 'apple', 'b': 'banana', 'd': 'durian'}) + df.rename( + columns={"one": "foo", "two": "bar"}, + index={"a": "apple", "b": "banana", "d": "durian"}, + ) If the mapping doesn't include a column/index label, it isn't renamed. Note that extra labels in the mapping don't throw an error. @@ -1439,8 +1475,8 @@ you specify a single ``mapper`` and the ``axis`` to apply that mapping to. .. ipython:: python - df.rename({'one': 'foo', 'two': 'bar'}, axis='columns') - df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index') + df.rename({"one": "foo", "two": "bar"}, axis="columns") + df.rename({"a": "apple", "b": "banana", "d": "durian"}, axis="index") The :meth:`~DataFrame.rename` method also provides an ``inplace`` named @@ -1459,17 +1495,19 @@ for altering the ``Series.name`` attribute. .. versionadded:: 0.24.0 The methods :meth:`DataFrame.rename_axis` and :meth:`Series.rename_axis` -allow specific names of a `MultiIndex` to be changed (as opposed to the +allow specific names of a ``MultiIndex`` to be changed (as opposed to the labels). .. ipython:: python - df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], - 'y': [10, 20, 30, 40, 50, 60]}, - index=pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['let', 'num'])) + df = pd.DataFrame( + {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]}, + index=pd.MultiIndex.from_product( + [["a", "b", "c"], [1, 2]], names=["let", "num"] + ), + ) df - df.rename_axis(index={'let': 'abc'}) + df.rename_axis(index={"let": "abc"}) df.rename_axis(index=str.upper) .. _basics.iteration: @@ -1491,14 +1529,15 @@ Thus, for example, iterating over a DataFrame gives you the column names: .. ipython:: python - df = pd.DataFrame({'col1': np.random.randn(3), - 'col2': np.random.randn(3)}, index=['a', 'b', 'c']) + df = pd.DataFrame( + {"col1": np.random.randn(3), "col2": np.random.randn(3)}, index=["a", "b", "c"] + ) for col in df: print(col) -Pandas objects also have the dict-like :meth:`~DataFrame.items` method to +pandas objects also have the dict-like :meth:`~DataFrame.items` method to iterate over the (key, value) pairs. To iterate over the rows of a DataFrame, you can use the following methods: @@ -1540,10 +1579,10 @@ .. 
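Before the ``iterrows`` example below, a minimal sketch of :meth:`~DataFrame.itertuples`, which is usually faster and preserves dtypes (toy frame):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    # Each row arrives as a namedtuple; dtypes are preserved
    for row in df.itertuples():
        print(row.Index, row.a, row.b)

.. 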
ipython:: python - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) for index, row in df.iterrows(): - row['a'] = 10 + row["a"] = 10 df @@ -1576,7 +1615,7 @@ index value along with a Series containing the data in each row: .. ipython:: python for row_index, row in df.iterrows(): - print(row_index, row, sep='\n') + print(row_index, row, sep="\n") .. note:: @@ -1586,18 +1625,18 @@ index value along with a Series containing the data in each row: .. ipython:: python - df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + df_orig = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) df_orig.dtypes row = next(df_orig.iterrows())[1] row All values in ``row``, returned as a Series, are now upcasted - to floats, also the original integer value in column `x`: + to floats, including the original integer value in column ``int``: .. ipython:: python - row['int'].dtype - df_orig['int'].dtype + row["int"].dtype + df_orig["int"].dtype To preserve dtypes while iterating over the rows, it is better to use :meth:`~DataFrame.itertuples` which returns namedtuples of the values @@ -1607,7 +1646,7 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df2 = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) print(df2) print(df2.T) @@ -1652,7 +1691,7 @@ This will return a Series, indexed like the existing Series. .. ipython:: python # datetime - s = pd.Series(pd.date_range('20130101 09:10:12', periods=4)) + s = pd.Series(pd.date_range("20130101 09:10:12", periods=4)) s s.dt.hour s.dt.second @@ -1668,7 +1707,7 @@ You can easily produce tz aware transformations: .. ipython:: python - stz = s.dt.tz_localize('US/Eastern') + stz = s.dt.tz_localize("US/Eastern") stz stz.dt.tz @@ -1676,7 +1715,7 @@ You can also chain these types of operations: .. ipython:: python - s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") You can also format datetime values as strings with :meth:`Series.dt.strftime` which supports the same format as the standard :meth:`~datetime.datetime.strftime`. @@ -1684,23 +1723,23 @@ supports the same format as the standard :meth:`~datetime.datetime.strftime`. .. ipython:: python # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) + s = pd.Series(pd.date_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") .. ipython:: python # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) + s = pd.Series(pd.period_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # period - s = pd.Series(pd.period_range('20130101', periods=4, freq='D')) + s = pd.Series(pd.period_range("20130101", periods=4, freq="D")) s s.dt.year s.dt.day @@ -1708,7 +1747,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # timedelta - s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s')) + s = pd.Series(pd.timedelta_range("1 day 00:00:05", periods=4, freq="s")) s s.dt.days s.dt.seconds @@ -1729,8 +1768,9 @@ built-in string methods. For example: .. 
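Ahead of the ``lower`` example below, a small illustration (toy data) of how these vectorized string methods propagate missing values instead of raising:

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series(["dog", np.nan, "CAT"], dtype="object")
    s.str.len()    # the NaN stays NaN instead of raising
    s.str.upper()

.. 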
ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1741,7 +1781,7 @@ always uses them). .. note:: Prior to pandas 1.0, string methods were only available on ``object`` -dtype - ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated + ``Series``. pandas 1.0 added the :class:`StringDtype` which is dedicated to strings. See :ref:`text.types` for more. Please see :ref:`Vectorized String Methods ` for a complete @@ -1752,7 +1792,7 @@ description. Sorting ------- -Pandas supports three kinds of sorting: sorting by index labels, +pandas supports three kinds of sorting: sorting by index labels, sorting by column values, and sorting by a combination of both. .. _basics.sort_index: By index ~~~~~~~~ used to sort a pandas object by its index levels. .. ipython:: python - df = pd.DataFrame({ - 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame( + { + "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]), + "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]), + "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]), + } + ) - unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], - columns=['three', 'two', 'one']) + unsorted_df = df.reindex( + index=["a", "d", "c", "b"], columns=["three", "two", "one"] + ) unsorted_df # DataFrame @@ -1780,23 +1824,21 @@ used to sort a pandas object by its index levels. unsorted_df.sort_index(axis=1) # Series - unsorted_df['three'].sort_index() + unsorted_df["three"].sort_index() .. _basics.sort_index_key: .. versionadded:: 1.1.0 Sorting by index also supports a ``key`` parameter that takes a callable -function to apply to the index being sorted. For `MultiIndex` objects, -the key is applied per-level to the levels specified by `level`. +function to apply to the index being sorted. For ``MultiIndex`` objects, +the key is applied per-level to the levels specified by ``level``. .. ipython:: python - s1 = pd.DataFrame({ - "a": ['B', 'a', 'C'], - "b": [1, 2, 3], - "c": [2, 3, 4] - }).set_index(list("ab")) + s1 = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3], "c": [2, 3, 4]}).set_index( + list("ab") + ) s1 .. ipython:: python @@ -1812,23 +1854,23 @@ For information on key sorting by value, see :ref:`value sorting By values ~~~~~~~~~ -The :meth:`Series.sort_values` method is used to sort a `Series` by its values. The -:meth:`DataFrame.sort_values` method is used to sort a `DataFrame` by its column or row values. +The :meth:`Series.sort_values` method is used to sort a ``Series`` by its values. The +:meth:`DataFrame.sort_values` method is used to sort a ``DataFrame`` by its column or row values. The optional ``by`` parameter to :meth:`DataFrame.sort_values` may be used to specify one or more columns to use to determine the sorted order. .. ipython:: python - df1 = pd.DataFrame({'one': [2, 1, 1, 1], - 'two': [1, 3, 2, 4], - 'three': [5, 4, 3, 2]}) - df1.sort_values(by='two') + df1 = pd.DataFrame( + {"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]} + ) + df1.sort_values(by="two") The ``by`` parameter can take a list of column names, e.g.: .. 
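One related option, sketched here with a throwaway frame before the example below: ``ascending`` may also be list-like, matched element-wise with ``by``:

.. code-block:: python

    import pandas as pd

    df1 = pd.DataFrame({"one": [2, 1, 1, 1], "two": [1, 3, 2, 4]})
    # Sort "one" ascending, then break ties on "two" descending
    df1.sort_values(by=["one", "two"], ascending=[True, False])

.. 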
ipython:: python - df1[['one', 'two', 'three']].sort_values(by=['one', 'two']) + df1[["one", "two", "three"]].sort_values(by=["one", "two"]) These methods have special treatment of NA values via the ``na_position`` argument: @@ -1837,7 +1879,7 @@ argument: s[2] = np.nan s.sort_values() - s.sort_values(na_position='first') + s.sort_values(na_position="first") .. _basics.sort_value_key: @@ -1848,26 +1890,26 @@ to apply to the values being sorted. .. ipython:: python - s1 = pd.Series(['B', 'a', 'C']) + s1 = pd.Series(["B", "a", "C"]) .. ipython:: python s1.sort_values() s1.sort_values(key=lambda x: x.str.lower()) -`key` will be given the :class:`Series` of values and should return a ``Series`` -or array of the same shape with the transformed values. For `DataFrame` objects, +``key`` will be given the :class:`Series` of values and should return a ``Series`` +or array of the same shape with the transformed values. For ``DataFrame`` objects, the key is applied per column, so the key should still expect a Series and return a Series, e.g. .. ipython:: python - df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]}) + df = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3]}) .. ipython:: python - df.sort_values(by='a') - df.sort_values(by='a', key=lambda col: col.str.lower()) + df.sort_values(by="a") + df.sort_values(by="a", key=lambda col: col.str.lower()) The name or type of each column can be used to apply different functions to different columns. @@ -1877,28 +1919,26 @@ different columns. By indexes and values ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.23.0 - Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may refer to either columns or index level names. .. ipython:: python # Build MultiIndex - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), - ('b', 2), ('b', 1), ('b', 1)]) - idx.names = ['first', 'second'] + idx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("a", 2), ("b", 2), ("b", 1), ("b", 1)] + ) + idx.names = ["first", "second"] # Build DataFrame - df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, - index=idx) + df_multi = pd.DataFrame({"A": np.arange(6, 0, -1)}, index=idx) df_multi Sort by 'second' (index) and 'A' (column) .. ipython:: python - df_multi.sort_values(by=['second', 'A']) + df_multi.sort_values(by=["second", "A"]) .. note:: @@ -1919,8 +1959,8 @@ Series has the :meth:`~Series.searchsorted` method, which works similarly to ser = pd.Series([1, 2, 3]) ser.searchsorted([0, 3]) ser.searchsorted([0, 4]) - ser.searchsorted([1, 3], side='right') - ser.searchsorted([1, 3], side='left') + ser.searchsorted([1, 3], side="right") + ser.searchsorted([1, 3], side="left") ser = pd.Series([3, 1, 2]) ser.searchsorted([0, 3], sorter=np.argsort(ser)) @@ -1945,13 +1985,17 @@ faster than sorting the entire Series and calling ``head(n)`` on the result. .. ipython:: python - df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1], - 'b': list('abdceff'), - 'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]}) - df.nlargest(3, 'a') - df.nlargest(5, ['a', 'c']) - df.nsmallest(3, 'a') - df.nsmallest(5, ['a', 'c']) + df = pd.DataFrame( + { + "a": [-2, -1, 1, 10, 8, 11, -1], + "b": list("abdceff"), + "c": [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0], + } + ) + df.nlargest(3, "a") + df.nlargest(5, ["a", "c"]) + df.nsmallest(3, "a") + df.nsmallest(5, ["a", "c"]) .. _basics.multiindex_sorting: @@ -1964,10 +2008,10 @@ all levels to ``by``. .. 
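The same methods exist on ``Series``; a quick sketch with toy values, ahead of the ``MultiIndex`` example below:

.. code-block:: python

    import pandas as pd

    s = pd.Series([5, 1, 9, 3, 7])
    s.nlargest(2)   # the two largest values, without sorting everything
    s.nsmallest(2)

.. 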
ipython:: python - df1.columns = pd.MultiIndex.from_tuples([('a', 'one'), - ('a', 'two'), - ('b', 'three')]) - df1.sort_values(by=('a', 'two')) + df1.columns = pd.MultiIndex.from_tuples( + [("a", "one"), ("a", "two"), ("b", "three")] + ) + df1.sort_values(by=("a", "two")) Copying @@ -1997,7 +2041,7 @@ columns of a DataFrame. NumPy provides support for ``float``, ``int``, ``bool``, ``timedelta64[ns]`` and ``datetime64[ns]`` (note that NumPy does not support timezone-aware datetimes). -Pandas and third-party libraries *extend* NumPy's type system in a few places. +pandas and third-party libraries *extend* NumPy's type system in a few places. This section describes the extensions pandas has made internally. See :ref:`extending.extension-types` for how to write your own extension that works with pandas. See :ref:`ecosystem.extensions` for a list of third-party @@ -2034,7 +2078,7 @@ documentation sections for more on each type. | Boolean (with NA) | :class:`BooleanDtype` | :class:`bool` | :class:`arrays.BooleanArray` | ``'boolean'`` | :ref:`api.arrays.bool` | +-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ -Pandas has two ways to store strings. +pandas has two ways to store strings. 1. ``object`` dtype, which can hold any Python object, including strings. 2. :class:`StringDtype`, which is dedicated to strings. @@ -2050,13 +2094,17 @@ with the data type of each column. .. ipython:: python - dft = pd.DataFrame({'A': np.random.rand(3), - 'B': 1, - 'C': 'foo', - 'D': pd.Timestamp('20010102'), - 'E': pd.Series([1.0] * 3).astype('float32'), - 'F': False, - 'G': pd.Series([1] * 3, dtype='int8')}) + dft = pd.DataFrame( + { + "A": np.random.rand(3), + "B": 1, + "C": "foo", + "D": pd.Timestamp("20010102"), + "E": pd.Series([1.0] * 3).astype("float32"), + "F": False, + "G": pd.Series([1] * 3, dtype="int8"), + } + ) dft dft.dtypes @@ -2064,7 +2112,7 @@ On a ``Series`` object, use the :attr:`~Series.dtype` attribute. .. ipython:: python - dft['A'].dtype + dft["A"].dtype If a pandas object contains data with multiple dtypes *in a single column*, the dtype of the column will be chosen to accommodate all of the data types @@ -2073,10 +2121,10 @@ dtype of the column will be chosen to accommodate all of the data types .. ipython:: python # these ints are coerced to floats - pd.Series([1, 2, 3, 4, 5, 6.]) + pd.Series([1, 2, 3, 4, 5, 6.0]) # string data forces an ``object`` dtype - pd.Series([1, 2, 3, 6., 'foo']) + pd.Series([1, 2, 3, 6.0, "foo"]) The number of columns of each type in a ``DataFrame`` can be found by calling ``DataFrame.dtypes.value_counts()``. @@ -2092,13 +2140,16 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") df1 df1.dtypes - df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), - 'B': pd.Series(np.random.randn(8)), - 'C': pd.Series(np.array(np.random.randn(8), - dtype='uint8'))}) + df2 = pd.DataFrame( + { + "A": pd.Series(np.random.randn(8), dtype="float16"), + "B": pd.Series(np.random.randn(8)), + "C": pd.Series(np.array(np.random.randn(8), dtype="uint8")), + } + ) df2 df2.dtypes @@ -2111,9 +2162,9 @@ The following will all result in ``int64`` dtypes. .. 
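Before the ``int64`` constructions below, a short sketch of the ``DataFrame.dtypes.value_counts()`` call mentioned above (illustrative columns):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [1.0, 2.0], "c": ["x", "y"]})
    df.dtypes.value_counts()  # how many columns hold each dtype

.. 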
ipython:: python - pd.DataFrame([1, 2], columns=['a']).dtypes - pd.DataFrame({'a': [1, 2]}).dtypes - pd.DataFrame({'a': 1}, index=list(range(2))).dtypes + pd.DataFrame([1, 2], columns=["a"]).dtypes + pd.DataFrame({"a": [1, 2]}).dtypes + pd.DataFrame({"a": 1}, index=list(range(2))).dtypes Note that NumPy will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on a 32-bit platform. @@ -2152,7 +2203,7 @@ You can use the :meth:`~DataFrame.astype` method to explicitly convert dtypes fr even if the dtype was unchanged (pass ``copy=False`` to change this behavior). In addition, they will raise an exception if the astype operation is invalid. -Upcasting is always according to the **numpy** rules. If two different dtypes are involved in an operation, +Upcasting is always according to the **NumPy** rules. If two different dtypes are involved in an operation, then the more *general* one will be used as the result of the operation. .. ipython:: python df3.dtypes # conversion of dtypes - df3.astype('float32').dtypes + df3.astype("float32").dtypes Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`. .. ipython:: python - dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8) + dft = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft[["a", "b"]] = dft[["a", "b"]].astype(np.uint8) dft dft.dtypes @@ -2177,8 +2228,8 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft1 = dft1.astype({'a': np.bool, 'c': np.float64}) + dft1 = pd.DataFrame({"a": [1, 0, 1], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft1 = dft1.astype({"a": np.bool, "c": np.float64}) dft1 dft1.dtypes @@ -2190,9 +2241,9 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes - dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) + dft = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft.loc[:, ["a", "b"]].astype(np.uint8).dtypes + dft.loc[:, ["a", "b"]] = dft.loc[:, ["a", "b"]].astype(np.uint8) dft.dtypes .. _basics.object_conversion: @@ -2208,10 +2259,14 @@ to the correct type. .. ipython:: python import datetime - df = pd.DataFrame([[1, 2], - ['a', 'b'], - [datetime.datetime(2016, 3, 2), - datetime.datetime(2016, 3, 2)]]) + + df = pd.DataFrame( + [ + [1, 2], + ["a", "b"], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)], + ] + ) df = df.T df df.dtypes @@ -2230,7 +2285,7 @@ hard conversion of objects to a specified type: .. ipython:: python - m = ['1.1', 2, 3] + m = ["1.1", 2, 3] pd.to_numeric(m) * :meth:`~pandas.to_datetime` (conversion to datetime objects) .. ipython:: python import datetime + - m = ['2016-07-09', datetime.datetime(2016, 3, 2)] + m = ["2016-07-09", datetime.datetime(2016, 3, 2)] pd.to_datetime(m) * :meth:`~pandas.to_timedelta` (conversion to timedelta objects) .. 
ipython:: python - m = ['5us', pd.Timedelta('1day')] + m = ["5us", pd.Timedelta("1day")] pd.to_timedelta(m) To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements @@ -2258,14 +2314,15 @@ non-conforming elements intermixed that you want to represent as missing: .. ipython:: python import datetime - m = ['apple', datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors='coerce') - m = ['apple', 2, 3] - pd.to_numeric(m, errors='coerce') + m = ["apple", datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors="coerce") - m = ['apple', pd.Timedelta('1day')] - pd.to_timedelta(m, errors='coerce') + m = ["apple", 2, 3] + pd.to_numeric(m, errors="coerce") + + m = ["apple", pd.Timedelta("1day")] + pd.to_timedelta(m, errors="coerce") The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it encounters any errors with the conversion to a desired data type: @@ -2273,25 +2330,26 @@ encounters any errors with the conversion to a desired data type: .. ipython:: python import datetime - m = ['apple', datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors='ignore') - m = ['apple', 2, 3] - pd.to_numeric(m, errors='ignore') + m = ["apple", datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors="ignore") + + m = ["apple", 2, 3] + pd.to_numeric(m, errors="ignore") - m = ['apple', pd.Timedelta('1day')] - pd.to_timedelta(m, errors='ignore') + m = ["apple", pd.Timedelta("1day")] + pd.to_timedelta(m, errors="ignore") In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: .. ipython:: python - m = ['1', 2, 3] - pd.to_numeric(m, downcast='integer') # smallest signed int dtype - pd.to_numeric(m, downcast='signed') # same as 'integer' - pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype - pd.to_numeric(m, downcast='float') # smallest float dtype + m = ["1", 2, 3] + pd.to_numeric(m, downcast="integer") # smallest signed int dtype + pd.to_numeric(m, downcast="signed") # same as 'integer' + pd.to_numeric(m, downcast="unsigned") # smallest unsigned int dtype + pd.to_numeric(m, downcast="float") # smallest float dtype As these methods apply only to one-dimensional arrays, lists or scalars; they cannot be used directly on multi-dimensional objects such as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column efficiently: @@ -2299,16 +2357,16 @@ as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the .. ipython:: python import datetime - df = pd.DataFrame([ - ['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + + df = pd.DataFrame([["2016-07-09", datetime.datetime(2016, 3, 2)]] * 2, dtype="O") df df.apply(pd.to_datetime) - df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O') + df = pd.DataFrame([["1.1", 2, 3]] * 2, dtype="O") df df.apply(pd.to_numeric) - df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O') + df = pd.DataFrame([["5us", pd.Timedelta("1day")]] * 2, dtype="O") df df.apply(pd.to_timedelta) @@ -2321,8 +2379,8 @@ See also :ref:`Support for integer NA `. .. ipython:: python - dfi = df3.astype('int32') - dfi['E'] = 1 + dfi = df3.astype("int32") + dfi["E"] = 1 dfi dfi.dtypes @@ -2335,7 +2393,7 @@ While float dtypes are unchanged. .. 
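A hedged aside: the nullable ``Int64`` extension dtype referenced above ("Support for integer NA") can hold missing values without this upcasting to float. A minimal sketch:

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, np.nan], dtype="Int64")  # nullable integer dtype
    s.dtype  # stays Int64; the missing value is kept as <NA>

.. 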
ipython:: python dfa = df3.copy() - dfa['A'] = dfa['A'].astype('float32') + dfa["A"] = dfa["A"].astype("float32") dfa.dtypes casted = dfa[df2 > 0] @@ -2355,18 +2413,22 @@ dtypes: .. ipython:: python - df = pd.DataFrame({'string': list('abc'), - 'int64': list(range(1, 4)), - 'uint8': np.arange(3, 6).astype('u1'), - 'float64': np.arange(4.0, 7.0), - 'bool1': [True, False, True], - 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3), - 'category': pd.Series(list("ABC")).astype('category')}) - df['tdeltas'] = df.dates.diff() - df['uint64'] = np.arange(3, 6).astype('u8') - df['other_dates'] = pd.date_range('20130101', periods=3) - df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern') + df = pd.DataFrame( + { + "string": list("abc"), + "int64": list(range(1, 4)), + "uint8": np.arange(3, 6).astype("u1"), + "float64": np.arange(4.0, 7.0), + "bool1": [True, False, True], + "bool2": [False, True, False], + "dates": pd.date_range("now", periods=3), + "category": pd.Series(list("ABC")).astype("category"), + } + ) + df["tdeltas"] = df.dates.diff() + df["uint64"] = np.arange(3, 6).astype("u8") + df["other_dates"] = pd.date_range("20130101", periods=3) + df["tz_aware_dates"] = pd.date_range("20130101", periods=3, tz="US/Eastern") df And the dtypes: @@ -2390,7 +2452,7 @@ You can also pass the name of a dtype in the `NumPy dtype hierarchy .. ipython:: python - df.select_dtypes(include=['bool']) + df.select_dtypes(include=["bool"]) :meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes. @@ -2399,13 +2461,13 @@ integers: .. ipython:: python - df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger']) + df.select_dtypes(include=["number", "bool"], exclude=["unsignedinteger"]) To select string columns you must use the ``object`` dtype: .. ipython:: python - df.select_dtypes(include=['object']) + df.select_dtypes(include=["object"]) To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you can define a function that returns a tree of child dtypes: @@ -2426,5 +2488,5 @@ All NumPy dtypes are subclasses of ``numpy.generic``: .. note:: - Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal + pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal NumPy hierarchy and won't show up with the above function. diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index d690c1093399a..76c922fcef638 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -82,7 +82,7 @@ the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``, but ``True & False`` is ``False``, so we can't determine the output. -This differs from how ``np.nan`` behaves in logical operations. Pandas treated +This differs from how ``np.nan`` behaves in logical operations. pandas treats ``np.nan`` as *always false in the output*. In ``or`` diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 7def45ddc13e2..5c43de05fb5b9 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -9,9 +9,9 @@ Categorical data This is an introduction to the pandas categorical data type, including a short comparison with R's ``factor``. 
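For orientation, a minimal sketch (toy values) of the dtype this page covers, previewing the categories/codes layout described below:

.. code-block:: python

    import pandas as pd

    s = pd.Series(["a", "b", "a"], dtype="category")
    s.cat.categories  # the distinct values
    s.cat.codes       # integer codes pointing into the categories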
-`Categoricals` are a pandas data type corresponding to categorical variables in +``Categoricals`` are a pandas data type corresponding to categorical variables in statistics. A categorical variable takes on a limited, and usually fixed, -number of possible values (`categories`; `levels` in R). Examples are gender, +number of possible values (``categories``; ``levels`` in R). Examples are gender, social class, blood type, country affiliation, observation time or rating via Likert scales. @@ -19,10 +19,10 @@ In contrast to statistical categorical variables, categorical data might have an order (e.g. 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical operations (additions, divisions, ...) are not possible. -All values of categorical data are either in `categories` or `np.nan`. Order is defined by -the order of `categories`, not lexical order of the values. Internally, the data structure -consists of a `categories` array and an integer array of `codes` which point to the real value in -the `categories` array. +All values of categorical data are either in ``categories`` or ``np.nan``. Order is defined by +the order of ``categories``, not lexical order of the values. Internally, the data structure +consists of a ``categories`` array and an integer array of ``codes`` which point to the real value in +the ``categories`` array. The categorical data type is useful in the following cases: @@ -58,7 +58,7 @@ By converting an existing ``Series`` or column to a ``category`` dtype: .. ipython:: python df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) - df["B"] = df["A"].astype('category') + df["B"] = df["A"].astype("category") df By using special functions, such as :func:`~pandas.cut`, which groups data into @@ -66,18 +66,19 @@ discrete bins. See the :ref:`example on tiling ` in the docs. .. ipython:: python - df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) + df = pd.DataFrame({"value": np.random.randint(0, 100, 20)}) labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] - df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) + df["group"] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``. .. ipython:: python - raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"], - ordered=False) + raw_cat = pd.Categorical( + ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False + ) s = pd.Series(raw_cat) s df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) @@ -100,7 +101,7 @@ This can be done during construction by specifying ``dtype="category"`` in the ` .. ipython:: python - df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category") + df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category") df.dtypes Note that the categories present in each column differ; the conversion is done column by column, so @@ -108,26 +109,24 @@ only labels present in a given column are categories: .. ipython:: python - df['A'] - df['B'] - + df["A"] + df["B"] -.. versionadded:: 0.23.0 Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`: .. ipython:: python - df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) - df_cat = df.astype('category') + df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}) + df_cat = df.astype("category") df_cat.dtypes This conversion is likewise done column by column: .. 
ipython:: python - df_cat['A'] - df_cat['B'] + df_cat["A"] + df_cat["B"] Controlling behavior -------------------- @@ -145,9 +144,9 @@ of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python from pandas.api.types import CategoricalDtype + s = pd.Series(["a", "b", "c", "a"]) - cat_type = CategoricalDtype(categories=["b", "c", "d"], - ordered=True) + cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True) s_cat = s.astype(cat_type) s_cat @@ -157,12 +156,12 @@ are consistent among all columns. .. ipython:: python from pandas.api.types import CategoricalDtype - df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) - cat_type = CategoricalDtype(categories=list('abcd'), - ordered=True) + + df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}) + cat_type = CategoricalDtype(categories=list("abcd"), ordered=True) df_cat = df.astype(cat_type) - df_cat['A'] - df_cat['B'] + df_cat["A"] + df_cat["B"] .. note:: @@ -177,8 +176,7 @@ during normal constructor mode: .. ipython:: python splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5]) - s = pd.Series(pd.Categorical.from_codes(splitter, - categories=["train", "test"])) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Regaining original data ----------------------- To get back to the original ``Series`` or NumPy array, use s = pd.Series(["a", "b", "c", "a"]) s - s2 = s.astype('category') + s2 = s.astype("category") s2 s2.astype(str) np.asarray(s2) .. note:: - In contrast to R's `factor` function, categorical data is not converting input values to + In contrast to R's ``factor`` function, categorical data does not convert input values to strings; categories will end up the same data type as the original values. .. note:: - In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `categories` to change the categories after creation time. + In contrast to R's ``factor`` function, there is currently no way to assign/change labels at + creation time. Use ``categories`` to change the categories after creation time. .. _categorical.categoricaldtype: @@ -225,12 +223,13 @@ by default. .. ipython:: python from pandas.api.types import CategoricalDtype + - CategoricalDtype(['a', 'b', 'c']) - CategoricalDtype(['a', 'b', 'c'], ordered=True) + CategoricalDtype(["a", "b", "c"]) + CategoricalDtype(["a", "b", "c"], ordered=True) CategoricalDtype() A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas -expects a `dtype`. For example :func:`pandas.read_csv`, +expects a ``dtype``. For example :func:`pandas.read_csv`, :func:`pandas.DataFrame.astype`, or in the ``Series`` constructor. .. note:: @@ -250,19 +249,19 @@ unordered categoricals, the order of the ``categories`` is not considered. .. ipython:: python - c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False) + c1 = CategoricalDtype(["a", "b", "c"], ordered=False) # Equal, since order is not considered when ordered=False - c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) + c1 == CategoricalDtype(["b", "c", "a"], ordered=False) # Unequal, since the second CategoricalDtype is ordered - c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + c1 == CategoricalDtype(["a", "b", "c"], ordered=True) All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. .. ipython:: python - c1 == 'category' + c1 == "category" .. warning:: @@ -290,7 +289,7 @@ output to a ``Series`` or ``DataFrame`` of type ``string``. 
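A short sketch of the ``read_csv`` usage mentioned above, using an in-memory CSV with toy data:

.. code-block:: python

    import io
    import pandas as pd

    data = "id,grade\n1,a\n2,b\n3,a\n"
    df = pd.read_csv(io.StringIO(data), dtype={"grade": "category"})
    df.dtypes  # the grade column comes back as category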
Working with categories ----------------------- -Categorical data has a `categories` and a `ordered` property, which list their +Categorical data has a ``categories`` and an ``ordered`` property, which list their possible values and whether the ordering matters or not. These properties are exposed as ``s.cat.categories`` and ``s.cat.ordered``. If you don't manually specify categories and ordering, they are inferred from the passed arguments. @@ -305,8 +304,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(pd.Categorical(["a", "b", "c", "a"], - categories=["c", "b", "a"])) + s = pd.Series(pd.Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"])) s.cat.categories s.cat.ordered @@ -324,7 +322,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd'))) + s = pd.Series(list("babc")).astype(CategoricalDtype(list("abcd"))) s # categories @@ -350,19 +348,19 @@ Renaming categories is done by assigning new values to the s = s.cat.rename_categories([1, 2, 3]) s # You can also pass a dict-like object to map the renaming - s = s.cat.rename_categories({1: 'x', 2: 'y', 3: 'z'}) + s = s.cat.rename_categories({1: "x", 2: "y", 3: "z"}) s .. note:: - In contrast to R's `factor`, categorical data can have categories of other types than string. + In contrast to R's ``factor``, categorical data can have categories of types other than string. .. note:: Be aware that assigning new categories is an inplace operation, while most other operations - under ``Series.cat`` per default return a new ``Series`` of dtype `category`. + under ``Series.cat`` per default return a new ``Series`` of dtype ``category``. -Categories must be unique or a `ValueError` is raised: +Categories must be unique or a ``ValueError`` is raised: .. ipython:: python try: s.cat.categories = [1, 1, 1] except ValueError as e: print("ValueError:", str(e)) -Categories must also not be ``NaN`` or a `ValueError` is raised: +Categories must also not be ``NaN`` or a ``ValueError`` is raised: .. ipython:: python @@ -411,8 +409,7 @@ Removing unused categories can also be done: .. ipython:: python - s = pd.Series(pd.Categorical(["a", "b", "a"], - categories=["a", "b", "c", "d"])) + s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", "c", "d"])) s s.cat.remove_unused_categories() @@ -448,9 +445,7 @@ meaning and certain operations are possible. If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a", "b", "c", "a"]).astype( - CategoricalDtype(ordered=True) - ) + s = pd.Series(["a", "b", "c", "a"]).astype(CategoricalDtype(ordered=True)) s.sort_values(inplace=True) s s.min(), s.max() @@ -516,18 +511,24 @@ The ordering of the categorical is determined by the ``categories`` of that colu .. ipython:: python - dfs = pd.DataFrame({'A': pd.Categorical(list('bbeebbaa'), - categories=['e', 'a', 'b'], - ordered=True), - 'B': [1, 2, 1, 2, 2, 1, 2, 1]}) - dfs.sort_values(by=['A', 'B']) + dfs = pd.DataFrame( + { + "A": pd.Categorical( + list("bbeebbaa"), + categories=["e", "a", "b"], + ordered=True, + ), + "B": [1, 2, 1, 2, 2, 1, 2, 1], + } + ) + dfs.sort_values(by=["A", "B"]) Reordering the ``categories`` changes a future sort. .. 
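Before the reordering example below, a sketch (toy labels) of how the category order, not lexical order, drives ``min`` and ``max``:

.. code-block:: python

    import pandas as pd
    from pandas.api.types import CategoricalDtype

    s = pd.Series(["low", "high", "mid"]).astype(
        CategoricalDtype(["low", "mid", "high"], ordered=True)
    )
    s.min(), s.max()  # "low" and "high": category order, not alphabetical

.. 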
ipython:: python - dfs['A'] = dfs['A'].cat.reorder_categories(['a', 'b', 'e']) - dfs.sort_values(by=['A', 'B']) + dfs["A"] = dfs["A"].cat.reorder_categories(["a", "b", "e"]) + dfs.sort_values(by=["A", "B"]) Comparisons ----------- @@ -537,7 +538,7 @@ Comparing categorical data with other objects is possible in three cases: * Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array, ...) of the same length as the categorical data. * All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to - another categorical Series, when ``ordered==True`` and the `categories` are the same. + another categorical Series, when ``ordered==True`` and the ``categories`` are the same. * All comparisons of a categorical data to a scalar. All other comparisons, especially "non-equality" comparisons of two categoricals with different @@ -552,15 +553,9 @@ categories or a categorical with any list-like object, will raise a ``TypeError` .. ipython:: python - cat = pd.Series([1, 2, 3]).astype( - CategoricalDtype([3, 2, 1], ordered=True) - ) - cat_base = pd.Series([2, 2, 2]).astype( - CategoricalDtype([3, 2, 1], ordered=True) - ) - cat_base2 = pd.Series([2, 2, 2]).astype( - CategoricalDtype(ordered=True) - ) + cat = pd.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True)) + cat_base = pd.Series([2, 2, 2]).astype(CategoricalDtype([3, 2, 1], ordered=True)) + cat_base2 = pd.Series([2, 2, 2]).astype(CategoricalDtype(ordered=True)) cat cat_base @@ -609,8 +604,8 @@ When you compare two unordered categoricals with the same categories, the order .. ipython:: python - c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=False) + c2 = pd.Categorical(["a", "b"], categories=["b", "a"], ordered=False) c1 == c2 Operations @@ -624,23 +619,40 @@ even if some categories are not present in the data: .. ipython:: python - s = pd.Series(pd.Categorical(["a", "b", "c", "c"], - categories=["c", "a", "b", "d"])) + s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"])) s.value_counts() +``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories. + +.. ipython:: python + + columns = pd.Categorical( + ["One", "One", "Two"], categories=["One", "Two", "Three"], ordered=True + ) + df = pd.DataFrame( + data=[[1, 2, 3], [4, 5, 6]], + columns=pd.MultiIndex.from_arrays([["A", "B", "B"], columns]), + ) + df.sum(axis=1, level=1) + Groupby will also show "unused" categories: .. ipython:: python - cats = pd.Categorical(["a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"]) + cats = pd.Categorical( + ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"] + ) df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]}) df.groupby("cats").mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) - df2 = pd.DataFrame({"cats": cats2, - "B": ["c", "d", "c", "d"], - "values": [1, 2, 3, 4]}) + df2 = pd.DataFrame( + { + "cats": cats2, + "B": ["c", "d", "c", "d"], + "values": [1, 2, 3, 4], + } + ) df2.groupby(["cats", "B"]).mean() @@ -649,17 +661,15 @@ Pivot tables: .. 
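One knob worth knowing before the pivot-table example below: ``groupby`` accepts ``observed=True`` (available since pandas 0.23) to drop unused categories from the result. A sketch:

.. code-block:: python

    import pandas as pd

    cats = pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])
    df = pd.DataFrame({"cats": cats, "values": [1, 2, 3]})
    df.groupby("cats", observed=True).mean()  # no all-NaN row for "c"

.. 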
ipython:: python raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) - df = pd.DataFrame({"A": raw_cat, - "B": ["c", "d", "c", "d"], - "values": [1, 2, 3, 4]}) - pd.pivot_table(df, values='values', index=['A', 'B']) + df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) + pd.pivot_table(df, values="values", index=["A", "B"]) Data munging ------------ The optimized pandas data access methods ``.loc``, ``.iloc``, ``.at``, and ``.iat``, work as normal. The only difference is the return type (for getting) and -that only values already in `categories` can be assigned. +that only values already in ``categories`` can be assigned. Getting ~~~~~~~ If the slicing operation returns either a ``DataFrame`` or a column of type .. ipython:: python idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) - cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"], - dtype="category", index=idx) + cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"], dtype="category", index=idx) values = [1, 2, 2, 2, 3, 4, 5] df = pd.DataFrame({"cats": cats, "values": values}, index=idx) df.iloc[2:4, :] @@ -697,8 +706,8 @@ of length "1". df.at["h", "cats"] # returns a string .. note:: - The is in contrast to R's `factor` function, where ``factor(c(1,2,3))[1]`` - returns a single value `factor`. + This is in contrast to R's ``factor`` function, where ``factor(c(1,2,3))[1]`` + returns a single value ``factor``. To get a single value ``Series`` of type ``category``, you pass in a list with a single value: @@ -716,13 +725,13 @@ an appropriate type: .. ipython:: python - str_s = pd.Series(list('aabb')) - str_cat = str_s.astype('category') + str_s = pd.Series(list("aabb")) + str_cat = str_s.astype("category") str_cat str_cat.str.contains("a") - date_s = pd.Series(pd.date_range('1/1/2015', periods=5)) - date_cat = date_s.astype('category') + date_s = pd.Series(pd.date_range("1/1/2015", periods=5)) + date_cat = date_s.astype("category") date_cat date_cat.dt.day @@ -734,7 +743,7 @@ an appropriate type: That means that the returned values from methods and properties on the accessors of a ``Series`` and the returned values from methods and properties on the accessors of this -``Series`` transformed to one of type `category` will be equal: +``Series`` transformed to one of type ``category`` will be equal: .. ipython:: python @@ -755,13 +764,12 @@ Setting ~~~~~~~ Setting values in a categorical column (or ``Series``) works as long as the -value is included in the `categories`: +value is included in the ``categories``: .. ipython:: python idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) - cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], - categories=["a", "b"]) + cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) values = [1, 1, 1, 1, 1, 1, 1] df = pd.DataFrame({"cats": cats, "values": values}, index=idx) try: except ValueError as e: print("ValueError:", str(e)) -Setting values by assigning categorical data will also check that the `categories` match: +Setting values by assigning categorical data will also check that the ``categories`` match: .. 
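Before the match-check example below, a sketch of the usual workaround when a brand-new value is needed: extend the categories first.

.. code-block:: python

    import pandas as pd

    s = pd.Series(pd.Categorical(["a", "a"], categories=["a", "b"]))
    s = s.cat.add_categories(["c"])  # returns a new Series with "c" allowed
    s.iloc[0] = "c"                  # now a valid assignment
    s

.. 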
ipython:: python df.loc["j":"k", "cats"] = pd.Categorical(["a", "a"], categories=["a", "b"]) df try: - df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], - categories=["a", "b", "c"]) + df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"]) except ValueError as e: print("ValueError:", str(e)) @@ -811,12 +818,12 @@ dtypes will likely have higher memory usage. Use ``.astype`` or from pandas.api.types import union_categoricals # same categories - s1 = pd.Series(['a', 'b'], dtype='category') - s2 = pd.Series(['a', 'b', 'a'], dtype='category') + s1 = pd.Series(["a", "b"], dtype="category") + s2 = pd.Series(["a", "b", "a"], dtype="category") pd.concat([s1, s2]) # different categories - s3 = pd.Series(['b', 'c'], dtype='category') + s3 = pd.Series(["b", "c"], dtype="category") pd.concat([s1, s3]) # Output dtype is inferred based on categories values @@ -824,7 +831,7 @@ dtypes will likely have higher memory usage. Use ``.astype`` or float_cats = pd.Series([3.0, 4.0], dtype="category") pd.concat([int_cats, float_cats]) - pd.concat([s1, s3]).astype('category') + pd.concat([s1, s3]).astype("category") union_categoricals([s1.array, s3.array]) The following table summarizes the results of merging ``Categoricals``: @@ -855,6 +862,7 @@ the categories being combined. .. ipython:: python from pandas.api.types import union_categoricals + a = pd.Categorical(["b", "c"]) b = pd.Categorical(["a", "b"]) union_categoricals([a, b]) @@ -902,8 +910,8 @@ the resulting array will always be a plain ``Categorical``: .. ipython:: python - a = pd.Series(["b", "c"], dtype='category') - b = pd.Series(["a", "b"], dtype='category') + a = pd.Series(["b", "c"], dtype="category") + b = pd.Series(["a", "b"], dtype="category") union_categoricals([a, b]) .. note:: @@ -943,12 +951,13 @@ See :ref:`here ` for an example and caveats. Writing to a CSV file will convert the data, effectively removing any information about the categorical (categories and ordering). So if you read back the CSV file you have to convert the -relevant columns back to `category` and assign the right categories and categories ordering. +relevant columns back to ``category`` and assign the right categories and categories ordering. .. ipython:: python import io - s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) + + s = pd.Series(pd.Categorical(["a", "b", "b", "a", "a", "d"])) # rename the categories s.cat.categories = ["very good", "good", "bad"] # reorder the categories and add missing categories @@ -961,9 +970,9 @@ relevant columns back to `category` and assign the right categories and categori df2["cats"] # Redo the category df2["cats"] = df2["cats"].astype("category") - df2["cats"].cat.set_categories(["very bad", "bad", "medium", - "good", "very good"], - inplace=True) + df2["cats"].cat.set_categories( + ["very bad", "bad", "medium", "good", "very good"], inplace=True + ) df2.dtypes df2["cats"] @@ -972,7 +981,7 @@ The same holds for writing to a SQL database with ``to_sql``. Missing data ------------ -pandas primarily uses the value `np.nan` to represent missing data. It is by +pandas primarily uses the value ``np.nan`` to represent missing data. It is by default not included in computations. See the :ref:`Missing Data section `. @@ -1000,20 +1009,20 @@ Methods for working with missing data, e.g. 
:meth:`~Series.isna`, :meth:`~Series pd.isna(s) s.fillna("a") -Differences to R's `factor` ---------------------------- +Differences to R's ``factor`` +----------------------------- The following differences to R's factor functions can be observed: -* R's `levels` are named `categories`. -* R's `levels` are always of type string, while `categories` in pandas can be of any dtype. +* R's ``levels`` are named ``categories``. +* R's ``levels`` are always of type string, while ``categories`` in pandas can be of any dtype. * It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)`` afterwards. -* In contrast to R's `factor` function, using categorical data as the sole input to create a +* In contrast to R's ``factor`` function, using categorical data as the sole input to create a new categorical series will *not* remove unused categories but create a new categorical series which is equal to the passed in one! -* R allows for missing values to be included in its `levels` (pandas' `categories`). Pandas - does not allow `NaN` categories, but missing values can still be in the `values`. +* R allows for missing values to be included in its ``levels`` (pandas' ``categories``). pandas + does not allow ``NaN`` categories, but missing values can still be in the ``values``. Gotchas @@ -1031,13 +1040,13 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = pd.Series(['foo', 'bar'] * 1000) + s = pd.Series(["foo", "bar"] * 1000) # object dtype s.nbytes # category dtype - s.astype('category').nbytes + s.astype("category").nbytes .. note:: @@ -1046,22 +1055,22 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = pd.Series(['foo%04d' % i for i in range(2000)]) + s = pd.Series(["foo%04d" % i for i in range(2000)]) # object dtype s.nbytes # category dtype - s.astype('category').nbytes + s.astype("category").nbytes -`Categorical` is not a `numpy` array -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``Categorical`` is not a ``numpy`` array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Currently, categorical data and the underlying ``Categorical`` is implemented as a Python object and not as a low-level NumPy array dtype. This leads to some problems. -NumPy itself doesn't know about the new `dtype`: +NumPy itself doesn't know about the new ``dtype``: .. ipython:: python @@ -1087,10 +1096,10 @@ To check if a Series contains Categorical data, use ``hasattr(s, 'cat')``: .. ipython:: python - hasattr(pd.Series(['a'], dtype='category'), 'cat') - hasattr(pd.Series(['a']), 'cat') + hasattr(pd.Series(["a"], dtype="category"), "cat") + hasattr(pd.Series(["a"]), "cat") -Using NumPy functions on a ``Series`` of type ``category`` should not work as `Categoricals` +Using NumPy functions on a ``Series`` of type ``category`` should not work as ``Categoricals`` are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python @@ -1108,16 +1117,20 @@ are not numeric data (even in the case that ``.categories`` is numeric). dtype in apply ~~~~~~~~~~~~~~ -Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get -a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a +pandas currently does not preserve the dtype in apply functions: If you apply along rows you get +a ``Series`` of ``object`` ``dtype`` (same as getting a row -> getting one element will return a basic type) and applying along columns will also convert to object. 
``NaN`` values are unaffected. You can use ``fillna`` to handle missing values before applying a function. .. ipython:: python - df = pd.DataFrame({"a": [1, 2, 3, 4], - "b": ["a", "b", "c", "d"], - "cats": pd.Categorical([1, 2, 3, 2])}) + df = pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": ["a", "b", "c", "d"], + "cats": pd.Categorical([1, 2, 3, 2]), + } + ) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index d7875e5b8d861..17d1809638d61 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -63,8 +63,7 @@ series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -73,9 +72,9 @@ in order to have a valid result. .. ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.cov() @@ -116,13 +115,12 @@ Wikipedia has articles covering the above correlation coefficients: .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan # Series with Series - frame['a'].corr(frame['b']) - frame['a'].corr(frame['b'], method='spearman') + frame["a"].corr(frame["b"]) + frame["a"].corr(frame["b"], method="spearman") # Pairwise correlation of DataFrame columns frame.corr() @@ -134,9 +132,9 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: .. ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.corr() @@ -154,8 +152,8 @@ compute the correlation based on histogram intersection: # histogram intersection def histogram_intersection(a, b): - return np.minimum(np.true_divide(a, a.sum()), - np.true_divide(b, b.sum())).sum() + return np.minimum(np.true_divide(a, a.sum()), np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) @@ -165,8 +163,8 @@ DataFrame objects. .. ipython:: python - index = ['a', 'b', 'c', 'd', 'e'] - columns = ['one', 'two', 'three', 'four'] + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) df1.corrwith(df2) @@ -182,8 +180,8 @@ assigned the mean of the ranks (by default) for the group: .. 
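A sketch of the other tie-breaking choices, ahead of the tie example below (toy values):

.. code-block:: python

    import pandas as pd

    s = pd.Series([7, 7, 5])
    s.rank()              # ties share the average rank: [2.5, 2.5, 1.0]
    s.rank(method="min")  # ties share the lowest rank:  [2.0, 2.0, 1.0]

.. 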
ipython:: python - s = pd.Series(np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s = pd.Series(np.random.randn(5), index=list("abcde")) + s["d"] = s["b"] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -208,957 +206,9 @@ parameter: - ``max`` : highest rank in the group - ``first`` : ranks assigned in the order they appear in the array -.. _stats.moments: - -Window functions ----------------- - -.. currentmodule:: pandas.core.window - -For working with data, a number of window functions are provided for -computing common *window* or *rolling* statistics. Among these are count, sum, -mean, median, correlation, variance, covariance, standard deviation, skewness, -and kurtosis. - -The ``rolling()`` and ``expanding()`` -functions can be used directly from DataFrameGroupBy objects, -see the :ref:`groupby docs `. - - -.. note:: - - The API for window statistics is quite similar to the way one works with ``GroupBy`` objects, see the documentation :ref:`here `. - -We work with ``rolling``, ``expanding`` and ``exponentially weighted`` data through the corresponding -objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.ExponentialMovingWindow`. - -.. ipython:: python - - s = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) - s = s.cumsum() - s - -These are created from methods on ``Series`` and ``DataFrame``. - -.. ipython:: python - - r = s.rolling(window=60) - r - -These objects provide tab-completion of the available methods and properties. - -.. code-block:: ipython - - In [14]: r. # noqa: E225, E999 - r.agg r.apply r.count r.exclusions r.max r.median r.name r.skew r.sum - r.aggregate r.corr r.cov r.kurt r.mean r.min r.quantile r.std r.var - -Generally these methods all have the same interface. They all -accept the following arguments: - -- ``window``: size of moving window -- ``min_periods``: threshold of non-null data points to require (otherwise - result is NA) -- ``center``: boolean, whether to set the labels at the center (default is False) - -We can then call methods on these ``rolling`` objects. These return like-indexed objects: - -.. ipython:: python - - r.mean() - -.. ipython:: python - - s.plot(style='k--') - - @savefig rolling_mean_ex.png - r.mean().plot(style='k') - -.. ipython:: python - :suppress: - - plt.close('all') - -They can also be applied to DataFrame objects. This is really just syntactic -sugar for applying the moving window operator to all of the DataFrame's columns: - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) - df = df.cumsum() - - @savefig rolling_mean_frame.png - df.rolling(window=60).sum().plot(subplots=True) - -.. _stats.summary: - -Method summary -~~~~~~~~~~~~~~ - -We provide a number of common statistical functions: - -.. currentmodule:: pandas.core.window - -.. 
csv-table:: - :header: "Method", "Description" - :widths: 20, 80 - - :meth:`~Rolling.count`, Number of non-null observations - :meth:`~Rolling.sum`, Sum of values - :meth:`~Rolling.mean`, Mean of values - :meth:`~Rolling.median`, Arithmetic median of values - :meth:`~Rolling.min`, Minimum - :meth:`~Rolling.max`, Maximum - :meth:`~Rolling.std`, Sample standard deviation - :meth:`~Rolling.var`, Sample variance - :meth:`~Rolling.skew`, Sample skewness (3rd moment) - :meth:`~Rolling.kurt`, Sample kurtosis (4th moment) - :meth:`~Rolling.quantile`, Sample quantile (value at %) - :meth:`~Rolling.apply`, Generic apply - :meth:`~Rolling.cov`, Sample covariance (binary) - :meth:`~Rolling.corr`, Sample correlation (binary) - -.. _computation.window_variance.caveats: - -.. note:: - - Please note that :meth:`~Rolling.std` and :meth:`~Rolling.var` use the sample - variance formula by default, i.e. the sum of squared differences is divided by - ``window_size - 1`` and not by ``window_size`` during averaging. In statistics, - we use sample when the dataset is drawn from a larger population that we - don't have access to. Using it implies that the data in our window is a - random sample from the population, and we are interested not in the variance - inside the specific window but in the variance of some general window that - our windows represent. In this situation, using the sample variance formula - results in an unbiased estimator and so is preferred. - - Usually, we are instead interested in the variance of each window as we slide - it over the data, and in this case we should specify ``ddof=0`` when calling - these methods to use population variance instead of sample variance. Using - sample variance under the circumstances would result in a biased estimator - of the variable we are trying to determine. - - The same caveats apply to using any supported statistical sample methods. - -.. _stats.rolling_apply: - -Rolling apply -~~~~~~~~~~~~~ - -The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs -generic rolling computations. The ``func`` argument should be a single function -that produces a single value from an ndarray input. Suppose we wanted to -compute the mean absolute deviation on a rolling basis: - -.. ipython:: python - - def mad(x): - return np.fabs(x - x.mean()).mean() - - @savefig rolling_apply_ex.png - s.rolling(window=60).apply(mad, raw=True).plot(style='k') - -.. versionadded:: 1.0 - -Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ -if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying -``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). -Numba will be applied in potentially two routines: - -1. If ``func`` is a standard Python function, the engine will `JIT `__ -the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. - -2. The engine will JIT the for loop where the apply function is applied to each window. - -The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the -`numba.jit decorator `__. -These keyword arguments will be applied to *both* the passed function (if a standard Python function) -and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, -and their default values are set to ``False``, ``True`` and ``False`` respectively. - -.. 
note:: - - In terms of performance, **the first time a function is run using the Numba engine will be slow** - as Numba will have some function compilation overhead. However, the compiled functions are cached, - and subsequent calls will be fast. In general, the Numba engine is performant with - a larger amount of data points (e.g. 1+ million). - -.. code-block:: ipython - - In [1]: data = pd.Series(range(1_000_000)) - - In [2]: roll = data.rolling(10) - - In [3]: def f(x): - ...: return np.sum(x) + 5 - # Run the first time, compilation time will affect performance - In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 - 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) - # Function is cached and performance will improve - In [5]: %timeit roll.apply(f, engine='numba', raw=True) - 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [6]: %timeit roll.apply(f, engine='cython', raw=True) - 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - -.. _stats.rolling_window: - -Rolling windows -~~~~~~~~~~~~~~~ - -Passing ``win_type`` to ``.rolling`` generates a generic rolling window computation, that is weighted according the ``win_type``. -The following methods are available: - -.. csv-table:: - :header: "Method", "Description" - :widths: 20, 80 - - :meth:`~Window.sum`, Sum of values - :meth:`~Window.mean`, Mean of values - -The weights used in the window are specified by the ``win_type`` keyword. -The list of recognized types are the `scipy.signal window functions -`__: - -* ``boxcar`` -* ``triang`` -* ``blackman`` -* ``hamming`` -* ``bartlett`` -* ``parzen`` -* ``bohman`` -* ``blackmanharris`` -* ``nuttall`` -* ``barthann`` -* ``kaiser`` (needs beta) -* ``gaussian`` (needs std) -* ``general_gaussian`` (needs power, width) -* ``slepian`` (needs width) -* ``exponential`` (needs tau). - -.. ipython:: python - - ser = pd.Series(np.random.randn(10), - index=pd.date_range('1/1/2000', periods=10)) - - ser.rolling(window=5, win_type='triang').mean() - -Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`. - -.. ipython:: python - - ser.rolling(window=5, win_type='boxcar').mean() - ser.rolling(window=5).mean() - -For some windowing functions, additional parameters must be specified: - -.. ipython:: python - - ser.rolling(window=5, win_type='gaussian').mean(std=0.1) - -.. _stats.moments.normalization: - -.. note:: - - For ``.sum()`` with a ``win_type``, there is no normalization done to the - weights for the window. Passing custom weights of ``[1, 1, 1]`` will yield a different - result than passing weights of ``[2, 2, 2]``, for example. When passing a - ``win_type`` instead of explicitly specifying the weights, the weights are - already normalized so that the largest weight is 1. - - In contrast, the nature of the ``.mean()`` calculation is - such that the weights are normalized with respect to each other. Weights - of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. - -.. _stats.moments.ts: - -Time-aware rolling -~~~~~~~~~~~~~~~~~~ - -It is possible to pass an offset (or convertible) to a ``.rolling()`` method and have it produce -variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring -within the indicated time delta. - -This can be particularly useful for a non-regular time frequency index. - -.. 
ipython:: python - - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, - freq='s')) - dft - -This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. - -.. ipython:: python - - dft.rolling(2).sum() - dft.rolling(2, min_periods=1).sum() - -Specifying an offset allows a more intuitive specification of the rolling frequency. - -.. ipython:: python - - dft.rolling('2s').sum() - -Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. - - -.. ipython:: python - - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) - dft - dft.rolling(2).sum() - - -Using the time-specification generates variable windows for this sparse data. - -.. ipython:: python - - dft.rolling('2s').sum() - -Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the -default of the index) in a DataFrame. - -.. ipython:: python - - dft = dft.reset_index() - dft - dft.rolling('2s', on='foo').sum() - -.. _stats.custom_rolling_window: - -Custom window rolling -~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 1.0 - -In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts -a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds. -The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns -a tuple of two arrays, the first being the starting indices of the windows and second being the -ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center``, ``closed`` -and will automatically be passed to ``get_window_bounds`` and the defined method must -always accept these arguments. - -For example, if we have the following ``DataFrame``: - -.. ipython:: python - - use_expanding = [True, False, True, False, True] - use_expanding - df = pd.DataFrame({'values': range(5)}) - df - -and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size -1, we can create the following ``BaseIndexer`` subclass: - -.. code-block:: ipython - - In [2]: from pandas.api.indexers import BaseIndexer - ...: - ...: class CustomIndexer(BaseIndexer): - ...: - ...: def get_window_bounds(self, num_values, min_periods, center, closed): - ...: start = np.empty(num_values, dtype=np.int64) - ...: end = np.empty(num_values, dtype=np.int64) - ...: for i in range(num_values): - ...: if self.use_expanding[i]: - ...: start[i] = 0 - ...: end[i] = i + 1 - ...: else: - ...: start[i] = i - ...: end[i] = i + self.window_size - ...: return start, end - ...: - - In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - - In [4]: df.rolling(indexer).sum() - Out[4]: - values - 0 0.0 - 1 1.0 - 2 3.0 - 3 3.0 - 4 10.0 - -You can view other examples of ``BaseIndexer`` subclasses `here `__ - -.. versionadded:: 1.1 - -One subclass of note within those examples is the ``VariableOffsetWindowIndexer`` that allows -rolling operations over a non-fixed offset like a ``BusinessDay``. - -.. 
ipython:: python - - from pandas.api.indexers import VariableOffsetWindowIndexer - df = pd.DataFrame(range(10), index=pd.date_range('2020', periods=10)) - offset = pd.offsets.BDay(1) - indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) - df - df.rolling(indexer).sum() - -For some problems knowledge of the future is available for analysis. For example, this occurs when -each data point is a full time series read from an experiment, and the task is to extract underlying -conditions. In these cases it can be useful to perform forward-looking rolling window computations. -:func:`FixedForwardWindowIndexer ` class is available for this purpose. -This :func:`BaseIndexer ` subclass implements a closed fixed-width -forward-looking rolling window, and we can use it as follows: - -.. ipython:: ipython - - from pandas.api.indexers import FixedForwardWindowIndexer - indexer = FixedForwardWindowIndexer(window_size=2) - df.rolling(indexer, min_periods=1).sum() - -.. _stats.rolling_window.endpoints: - -Rolling window endpoints -~~~~~~~~~~~~~~~~~~~~~~~~ - -The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed`` -parameter: - -.. csv-table:: - :header: "``closed``", "Description", "Default for" - :widths: 20, 30, 30 - - ``right``, close right endpoint, time-based windows - ``left``, close left endpoint, - ``both``, close both endpoints, fixed windows - ``neither``, open endpoints, - -For example, having the right endpoint open is useful in many problems that require that there is no contamination -from present information back to past information. This allows the rolling window to compute statistics -"up to that point in time", but not including that point in time. - -.. ipython:: python - - df = pd.DataFrame({'x': 1}, - index=[pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) - - df["right"] = df.rolling('2s', closed='right').x.sum() # default - df["both"] = df.rolling('2s', closed='both').x.sum() - df["left"] = df.rolling('2s', closed='left').x.sum() - df["neither"] = df.rolling('2s', closed='neither').x.sum() - - df - -Currently, this feature is only implemented for time-based windows. -For fixed windows, the closed parameter cannot be set and the rolling window will always have both endpoints closed. - -.. _stats.iter_rolling_window: - -Iteration over window: -~~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 1.1.0 - -``Rolling`` and ``Expanding`` objects now support iteration. Be noted that ``min_periods`` is ignored in iteration. - -.. ipython:: - - In [1]: df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - In [2]: for i in df.rolling(2): - ...: print(i) - ...: - - -.. _stats.moments.ts-versus-resampling: - -Time-aware rolling vs. resampling -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They -both operate and perform reductive operations on time-indexed pandas objects. - -When using ``.rolling()`` with an offset. The offset is a time-delta. Take a backwards-in-time looking window, and -aggregate all of the values in that window (including the end-point, but not the start-point). This is the new value -at that point in the result. These are variable sized windows in time-space for each point of the input. You will get -a same sized result as the input. - -When using ``.resample()`` with an offset. 
Construct a new index that is the frequency of the offset. For each frequency -bin, aggregate points from the input within a backwards-in-time looking window that fall in that bin. The result of this -aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result -will have the shape of a regular frequency between the min and the max of the original input object. - -To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. - -Centering windows -~~~~~~~~~~~~~~~~~ - -By default the labels are set to the right edge of the window, but a -``center`` keyword is available so the labels can be set at the center. - -.. ipython:: python - - ser.rolling(window=5).mean() - ser.rolling(window=5, center=True).mean() - -.. _stats.moments.binary: - -Binary window functions -~~~~~~~~~~~~~~~~~~~~~~~ - -:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics about -two ``Series`` or any combination of ``DataFrame/Series`` or -``DataFrame/DataFrame``. Here is the behavior in each case: - -* two ``Series``: compute the statistic for the pairing. -* ``DataFrame/Series``: compute the statistics for each column of the DataFrame - with the passed Series, thus returning a DataFrame. -* ``DataFrame/DataFrame``: by default compute the statistic for matching column - names, returning a DataFrame. If the keyword argument ``pairwise=True`` is - passed then computes the statistic for each pair of columns, returning a - ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section - `). - -For example: - -.. ipython:: python - - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) - df = df.cumsum() - - df2 = df[:20] - df2.rolling(window=5).corr(df2['B']) - -.. _stats.moments.corr_pairwise: - -Computing rolling pairwise covariances and correlations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In financial data analysis and other fields it's common to compute covariance -and correlation matrices for a collection of time series. Often one is also -interested in moving-window covariance and correlation matrices. This can be -done by passing the ``pairwise`` keyword argument, which in the case of -``DataFrame`` inputs will yield a MultiIndexed ``DataFrame`` whose ``index`` are the dates in -question. In the case of a single DataFrame argument the ``pairwise`` argument -can even be omitted: - -.. note:: - - Missing values are ignored and each entry is computed using the pairwise - complete observations. Please see the :ref:`covariance section - ` for :ref:`caveats - ` associated with this method of - calculating covariance and correlation matrices. - -.. ipython:: python - - covs = (df[['B', 'C', 'D']].rolling(window=50) - .cov(df[['A', 'B', 'C']], pairwise=True)) - covs.loc['2002-09-22':] - -.. ipython:: python - - correls = df.rolling(window=50).corr() - correls.loc['2002-09-22':] - -You can efficiently retrieve the time series of correlations between two -columns by reshaping and indexing: - -.. ipython:: python - :suppress: - - plt.close('all') - -.. ipython:: python - - @savefig rolling_corr_pairwise_ex.png - correls.unstack(1)[('A', 'C')].plot() - -.. 
_stats.aggregate: - -Aggregation ------------ - -Once the ``Rolling``, ``Expanding`` or ``ExponentialMovingWindow`` objects have been created, several methods are available to -perform multiple computations on the data. These operations are similar to the :ref:`aggregating API `, -:ref:`groupby API `, and :ref:`resample API `. - - -.. ipython:: python - - dfa = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) - r = dfa.rolling(window=60, min_periods=1) - r - -We can aggregate by passing a function to the entire DataFrame, or select a -Series (or multiple Series) via standard ``__getitem__``. - -.. ipython:: python - - r.aggregate(np.sum) - - r['A'].aggregate(np.sum) - - r[['A', 'B']].aggregate(np.sum) - -As you can see, the result of the aggregation will have the selected columns, or all -columns if none are selected. - -.. _stats.aggregate.multifunc: - -Applying multiple functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -With windowed ``Series`` you can also pass a list of functions to do -aggregation with, outputting a DataFrame: - -.. ipython:: python - - r['A'].agg([np.sum, np.mean, np.std]) - -On a windowed DataFrame, you can pass a list of functions to apply to each -column, which produces an aggregated result with a hierarchical index: - -.. ipython:: python - - r.agg([np.sum, np.mean]) - -Passing a dict of functions has different behavior by default, see the next -section. - -Applying different functions to DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -By passing a dict to ``aggregate`` you can apply a different aggregation to the -columns of a ``DataFrame``: - -.. ipython:: python - - r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) - -The function names can also be strings. In order for a string to be valid it -must be implemented on the windowed object - -.. ipython:: python - - r.agg({'A': 'sum', 'B': 'std'}) - -Furthermore you can pass a nested dict to indicate different aggregations on different columns. - -.. ipython:: python - - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) - - -.. _stats.moments.expanding: - -Expanding windows ------------------ - -A common alternative to rolling statistics is to use an *expanding* window, -which yields the value of the statistic with all the data available up to that -point in time. - -These follow a similar interface to ``.rolling``, with the ``.expanding`` method -returning an :class:`~pandas.core.window.Expanding` object. - -As these calculations are a special case of rolling statistics, -they are implemented in pandas such that the following two calls are equivalent: - -.. ipython:: python - - df.rolling(window=len(df), min_periods=1).mean()[:5] - - df.expanding(min_periods=1).mean()[:5] - -These have a similar set of methods to ``.rolling`` methods. - -Method summary -~~~~~~~~~~~~~~ - -.. currentmodule:: pandas.core.window - -.. 
csv-table:: - :header: "Function", "Description" - :widths: 20, 80 - - :meth:`~Expanding.count`, Number of non-null observations - :meth:`~Expanding.sum`, Sum of values - :meth:`~Expanding.mean`, Mean of values - :meth:`~Expanding.median`, Arithmetic median of values - :meth:`~Expanding.min`, Minimum - :meth:`~Expanding.max`, Maximum - :meth:`~Expanding.std`, Sample standard deviation - :meth:`~Expanding.var`, Sample variance - :meth:`~Expanding.skew`, Sample skewness (3rd moment) - :meth:`~Expanding.kurt`, Sample kurtosis (4th moment) - :meth:`~Expanding.quantile`, Sample quantile (value at %) - :meth:`~Expanding.apply`, Generic apply - :meth:`~Expanding.cov`, Sample covariance (binary) - :meth:`~Expanding.corr`, Sample correlation (binary) - -.. note:: - - Using sample variance formulas for :meth:`~Expanding.std` and - :meth:`~Expanding.var` comes with the same caveats as using them with rolling - windows. See :ref:`this section ` for more - information. - - The same caveats apply to using any supported statistical sample methods. - -.. currentmodule:: pandas - -Aside from not having a ``window`` parameter, these functions have the same -interfaces as their ``.rolling`` counterparts. Like above, the parameters they -all accept are: - -* ``min_periods``: threshold of non-null data points to require. Defaults to - minimum needed to compute statistic. No ``NaNs`` will be output once - ``min_periods`` non-null data points have been seen. -* ``center``: boolean, whether to set the labels at the center (default is False). - -.. _stats.moments.expanding.note: -.. note:: - - The output of the ``.rolling`` and ``.expanding`` methods do not return a - ``NaN`` if there are at least ``min_periods`` non-null values in the current - window. For example: - - .. ipython:: python - - sn = pd.Series([1, 2, np.nan, 3, np.nan, 4]) - sn - sn.rolling(2).max() - sn.rolling(2, min_periods=1).max() - - In case of expanding functions, this differs from :meth:`~DataFrame.cumsum`, - :meth:`~DataFrame.cumprod`, :meth:`~DataFrame.cummax`, - and :meth:`~DataFrame.cummin`, which return ``NaN`` in the output wherever - a ``NaN`` is encountered in the input. In order to match the output of ``cumsum`` - with ``expanding``, use :meth:`~DataFrame.fillna`: - - .. ipython:: python - - sn.expanding().sum() - sn.cumsum() - sn.cumsum().fillna(method='ffill') - - -An expanding window statistic will be more stable (and less responsive) than -its rolling window counterpart as the increasing window size decreases the -relative impact of an individual data point. As an example, here is the -:meth:`~core.window.Expanding.mean` output for the previous time series dataset: - -.. ipython:: python - :suppress: - - plt.close('all') - -.. ipython:: python - - s.plot(style='k--') - - @savefig expanding_mean_frame.png - s.expanding().mean().plot(style='k') - - -.. _stats.moments.exponentially_weighted: - -Exponentially weighted windows ------------------------------- - -.. currentmodule:: pandas.core.window - -A related set of functions are exponentially weighted versions of several of -the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed -through the ``.ewm`` method to receive an :class:`~ExponentialMovingWindow` object. -A number of expanding EW (exponentially weighted) -methods are provided: - - -.. 
csv-table:: - :header: "Function", "Description" - :widths: 20, 80 - - :meth:`~ExponentialMovingWindow.mean`, EW moving average - :meth:`~ExponentialMovingWindow.var`, EW moving variance - :meth:`~ExponentialMovingWindow.std`, EW moving standard deviation - :meth:`~ExponentialMovingWindow.corr`, EW moving correlation - :meth:`~ExponentialMovingWindow.cov`, EW moving covariance - -In general, a weighted moving average is calculated as - -.. math:: - - y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i}, - -where :math:`x_t` is the input, :math:`y_t` is the result and the :math:`w_i` -are the weights. - -The EW functions support two variants of exponential weights. -The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i` -which gives - -.. math:: - - y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... - + (1 - \alpha)^t x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... - + (1 - \alpha)^t} - -When ``adjust=False`` is specified, moving averages are calculated as - -.. math:: - - y_0 &= x_0 \\ - y_t &= (1 - \alpha) y_{t-1} + \alpha x_t, - -which is equivalent to using weights - -.. math:: - - w_i = \begin{cases} - \alpha (1 - \alpha)^i & \text{if } i < t \\ - (1 - \alpha)^i & \text{if } i = t. - \end{cases} - -.. note:: - - These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g. - - .. math:: - - y_t = \alpha' y_{t-1} + (1 - \alpha') x_t. - -The difference between the above two variants arises because we are -dealing with series which have finite history. Consider a series of infinite -history, with ``adjust=True``: - -.. math:: - - y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...} - {1 + (1 - \alpha) + (1 - \alpha)^2 + ...} - -Noting that the denominator is a geometric series with initial term equal to 1 -and a ratio of :math:`1 - \alpha` we have - -.. math:: - - y_t &= \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...} - {\frac{1}{1 - (1 - \alpha)}}\\ - &= [x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...] \alpha \\ - &= \alpha x_t + [(1-\alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...]\alpha \\ - &= \alpha x_t + (1 - \alpha)[x_{t-1} + (1 - \alpha) x_{t-2} + ...]\alpha\\ - &= \alpha x_t + (1 - \alpha) y_{t-1} - -which is the same expression as ``adjust=False`` above and therefore -shows the equivalence of the two variants for infinite series. -When ``adjust=False``, we have :math:`y_0 = x_0` and -:math:`y_t = \alpha x_t + (1 - \alpha) y_{t-1}`. -Therefore, there is an assumption that :math:`x_0` is not an ordinary value -but rather an exponentially weighted moment of the infinite series up to that -point. - -One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass -:math:`\alpha` directly, it's often easier to think about either the -**span**, **center of mass (com)** or **half-life** of an EW moment: - -.. math:: - - \alpha = - \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 - \end{cases} - -One must specify precisely one of **span**, **center of mass**, **half-life** -and **alpha** to the EW functions: - -* **Span** corresponds to what is commonly called an "N-day EW moving average". -* **Center of mass** has a more physical interpretation and can be thought of - in terms of span: :math:`c = (s - 1) / 2`. -* **Half-life** is the period of time for the exponential weight to reduce to - one half. 
-* **Alpha** specifies the smoothing factor directly. - -.. versionadded:: 1.1.0 - -You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of -time it takes for an observation to decay to half its value when also specifying a sequence -of ``times``. - -.. ipython:: python - - df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - df - times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] - df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() - -The following formula is used to compute exponentially weighted mean with an input vector of times: - -.. math:: - - y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{0.5^\frac{t_{t} - t_{i}}{\lambda}}, - -Here is an example for a univariate time series: - -.. ipython:: python - - s.plot(style='k--') - - @savefig ewma_ex.png - s.ewm(span=20).mean().plot(style='k') - -ExponentialMovingWindow has a ``min_periods`` argument, which has the same -meaning it does for all the ``.expanding`` and ``.rolling`` methods: -no output values will be set until at least ``min_periods`` non-null values -are encountered in the (expanding) window. - -ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how -intermediate null values affect the calculation of the weights. -When ``ignore_na=False`` (the default), weights are calculated based on absolute -positions, so that intermediate null values affect the result. -When ``ignore_na=True``, -weights are calculated by ignoring intermediate null values. -For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted -average of ``3, NaN, 5`` would be calculated as - -.. math:: - - \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1}. - -Whereas if ``ignore_na=True``, the weighted average would be calculated as - -.. math:: - - \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}. - -The :meth:`~Ewm.var`, :meth:`~Ewm.std`, and :meth:`~Ewm.cov` functions have a ``bias`` argument, -specifying whether the result should contain biased or unbiased statistics. -For example, if ``bias=True``, ``ewmvar(x)`` is calculated as -``ewmvar(x) = ewma(x**2) - ewma(x)**2``; -whereas if ``bias=False`` (the default), the biased variance statistics -are scaled by debiasing factors - -.. math:: +.. _computation.windowing: - \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}. +Windowing functions +~~~~~~~~~~~~~~~~~~~ -(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor, -with :math:`N = t + 1`.) -See `Weighted Sample Variance `__ -on Wikipedia for further details. +See :ref:`the window operations user guide ` for an overview of windowing functions. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 49487ac327e73..5a6f56388dee5 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -15,12 +15,9 @@ Simplified, condensed, new-user friendly, in-line examples have been inserted wh augment the Stack-Overflow and GitHub links. Many of the links contain expanded information, above what the in-line examples offer. -Pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept +pandas (pd) and Numpy (np) are the only two abbreviated imported modules. The rest are kept explicitly imported for newer users. -These examples are written for Python 3. Minor tweaks might be necessary for earlier python -versions. 
- Idioms ------ @@ -33,9 +30,9 @@ These are some neat pandas ``idioms`` .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df if-then... @@ -45,42 +42,42 @@ An if-then on one column .. ipython:: python - df.loc[df.AAA >= 5, 'BBB'] = -1 + df.loc[df.AAA >= 5, "BBB"] = -1 df An if-then with assignment to 2 columns: .. ipython:: python - df.loc[df.AAA >= 5, ['BBB', 'CCC']] = 555 + df.loc[df.AAA >= 5, ["BBB", "CCC"]] = 555 df Add another line with different logic, to do the -else .. ipython:: python - df.loc[df.AAA < 5, ['BBB', 'CCC']] = 2000 + df.loc[df.AAA < 5, ["BBB", "CCC"]] = 2000 df Or use pandas where after you've set up a mask .. ipython:: python - df_mask = pd.DataFrame({'AAA': [True] * 4, - 'BBB': [False] * 4, - 'CCC': [True, False] * 2}) + df_mask = pd.DataFrame( + {"AAA": [True] * 4, "BBB": [False] * 4, "CCC": [True, False] * 2} + ) df.where(df_mask, -1000) -`if-then-else using numpy's where() +`if-then-else using NumPy's where() `__ .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df - df['logic'] = np.where(df['AAA'] > 5, 'high', 'low') + df["logic"] = np.where(df["AAA"] > 5, "high", "low") df Splitting @@ -91,9 +88,9 @@ Splitting .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df df[df.AAA <= 5] @@ -107,28 +104,28 @@ Building criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df ...and (without assignment returns a Series) .. ipython:: python - df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] + df.loc[(df["BBB"] < 25) & (df["CCC"] >= -40), "AAA"] ...or (without assignment returns a Series) .. ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] + df.loc[(df["BBB"] > 25) | (df["CCC"] >= -40), "AAA"] ...or (with assignment modifies the DataFrame.) .. ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1 + df.loc[(df["BBB"] > 25) | (df["CCC"] >= 75), "AAA"] = 0.1 df `Select rows with data closest to certain value using argsort @@ -136,9 +133,9 @@ Building criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df aValue = 43.0 df.loc[(df.CCC - aValue).abs().argsort()] @@ -148,9 +145,9 @@ Building criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df Crit1 = df.AAA <= 5.5 @@ -189,9 +186,9 @@ The :ref:`indexing ` docs. .. 
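ipython:: python

    # A hedged aside with a throwaway frame (``tmp`` is illustrative, not
    # from the cookbook): DataFrame.query can express the same mix of value
    # and row-label conditions, since ``index`` resolves to the frame's
    # index inside the expression.
    tmp = pd.DataFrame({"AAA": [4, 5, 6, 7]})
    tmp.query("AAA <= 6 and index in [0, 2]")

The boolean-mask spelling of the same idea:

..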
ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df df[(df.AAA <= 6) & (df.index.isin([0, 2, 4]))] @@ -201,10 +198,10 @@ The :ref:`indexing ` docs. .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}, - index=['foo', 'bar', 'boo', 'kar']) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]}, + index=["foo", "bar", "boo", "kar"], + ) There are 2 explicit slicing methods, with a third general case @@ -216,19 +213,17 @@ There are 2 explicit slicing methods, with a third general case .. ipython:: python df.iloc[0:3] # Positional - df.loc['bar':'kar'] # Label + df.loc["bar":"kar"] # Label # Generic df[0:3] - df['bar':'kar'] + df["bar":"kar"] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. .. ipython:: python - data = {'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]} + data = {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. df2.iloc[1:3] # Position-oriented df2.loc[1:3] # Label-oriented @@ -238,9 +233,9 @@ Ambiguity arises when an index consists of integers with a non-zero start or non .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))] @@ -253,14 +248,12 @@ New columns .. ipython:: python - df = pd.DataFrame({'AAA': [1, 2, 1, 3], - 'BBB': [1, 1, 2, 2], - 'CCC': [2, 1, 3, 1]}) + df = pd.DataFrame({"AAA": [1, 2, 1, 3], "BBB": [1, 1, 2, 2], "CCC": [2, 1, 3, 1]}) df - source_cols = df.columns # Or some subset would work too + source_cols = df.columns # Or some subset would work too new_cols = [str(x) + "_cat" for x in source_cols] - categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'} + categories = {1: "Alpha", 2: "Beta", 3: "Charlie"} df[new_cols] = df[source_cols].applymap(categories.get) df @@ -270,8 +263,9 @@ New columns .. ipython:: python - df = pd.DataFrame({'AAA': [1, 1, 1, 2, 2, 2, 3, 3], - 'BBB': [2, 1, 3, 4, 5, 1, 2, 3]}) + df = pd.DataFrame( + {"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]} + ) df Method 1 : idxmin() to get the index of the minimums @@ -300,25 +294,28 @@ The :ref:`multindexing ` docs. .. ipython:: python - df = pd.DataFrame({'row': [0, 1, 2], - 'One_X': [1.1, 1.1, 1.1], - 'One_Y': [1.2, 1.2, 1.2], - 'Two_X': [1.11, 1.11, 1.11], - 'Two_Y': [1.22, 1.22, 1.22]}) + df = pd.DataFrame( + { + "row": [0, 1, 2], + "One_X": [1.1, 1.1, 1.1], + "One_Y": [1.2, 1.2, 1.2], + "Two_X": [1.11, 1.11, 1.11], + "Two_Y": [1.22, 1.22, 1.22], + } + ) df # As Labelled Index - df = df.set_index('row') + df = df.set_index("row") df # With Hierarchical Columns - df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) - for c in df.columns]) + df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns]) df # Now stack & Reset df = df.stack(0).reset_index(1) df # And fix the labels (Notice the label 'level_1' got added automatically) - df.columns = ['Sample', 'All_X', 'All_Y'] + df.columns = ["Sample", "All_X", "All_Y"] df Arithmetic @@ -329,11 +326,12 @@ Arithmetic .. 
ipython:: python - cols = pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] - for y in ['O', 'I']]) - df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols) + cols = pd.MultiIndex.from_tuples( + [(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]] + ) + df = pd.DataFrame(np.random.randn(2, 6), index=["n", "m"], columns=cols) df - df = df.div(df['C'], level=1) + df = df.div(df["C"], level=1) df Slicing @@ -344,10 +342,9 @@ Slicing .. ipython:: python - coords = [('AA', 'one'), ('AA', 'six'), ('BB', 'one'), ('BB', 'two'), - ('BB', 'six')] + coords = [("AA", "one"), ("AA", "six"), ("BB", "one"), ("BB", "two"), ("BB", "six")] index = pd.MultiIndex.from_tuples(coords) - df = pd.DataFrame([11, 22, 33, 44, 55], index, ['MyData']) + df = pd.DataFrame([11, 22, 33, 44, 55], index, ["MyData"]) df To take the cross section of the 1st level and 1st axis the index: @@ -355,13 +352,13 @@ To take the cross section of the 1st level and 1st axis the index: .. ipython:: python # Note : level and axis are optional, and default to zero - df.xs('BB', level=0, axis=0) + df.xs("BB", level=0, axis=0) ...and now the 2nd level of the 1st axis. .. ipython:: python - df.xs('six', level=1, axis=0) + df.xs("six", level=1, axis=0) `Slicing a MultiIndex with xs, method #2 `__ @@ -370,21 +367,20 @@ To take the cross section of the 1st level and 1st axis the index: import itertools - index = list(itertools.product(['Ada', 'Quinn', 'Violet'], - ['Comp', 'Math', 'Sci'])) - headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II'])) - indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course']) - cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named + index = list(itertools.product(["Ada", "Quinn", "Violet"], ["Comp", "Math", "Sci"])) + headr = list(itertools.product(["Exams", "Labs"], ["I", "II"])) + indx = pd.MultiIndex.from_tuples(index, names=["Student", "Course"]) + cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)] df = pd.DataFrame(data, indx, cols) df All = slice(None) - df.loc['Violet'] - df.loc[(All, 'Math'), All] - df.loc[(slice('Ada', 'Quinn'), 'Math'), All] - df.loc[(All, 'Math'), ('Exams')] - df.loc[(All, 'Math'), (All, 'II')] + df.loc["Violet"] + df.loc[(All, "Math"), All] + df.loc[(slice("Ada", "Quinn"), "Math"), All] + df.loc[(All, "Math"), ("Exams")] + df.loc[(All, "Math"), (All, "II")] `Setting portions of a MultiIndex with xs `__ @@ -397,7 +393,7 @@ Sorting .. ipython:: python - df.sort_values(by=('Labs', 'II'), ascending=False) + df.sort_values(by=("Labs", "II"), ascending=False) `Partial selection, the need for sortedness; `__ @@ -422,10 +418,12 @@ Fill forward a reversed timeseries .. ipython:: python - df = pd.DataFrame(np.random.randn(6, 1), - index=pd.date_range('2013-08-01', periods=6, freq='B'), - columns=list('A')) - df.loc[df.index[3], 'A'] = np.nan + df = pd.DataFrame( + np.random.randn(6, 1), + index=pd.date_range("2013-08-01", periods=6, freq="B"), + columns=list("A"), + ) + df.loc[df.index[3], "A"] = np.nan df df.reindex(df.index[::-1]).ffill() @@ -452,22 +450,26 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
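ipython:: python

    # A small sketch with made-up data (``demo`` is illustrative): agg's
    # callable receives one column at a time as a Series, whereas apply's
    # callable receives the whole sub-DataFrame for each group.
    demo = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
    demo.groupby("g").agg(lambda s: type(s).__name__)
    demo.groupby("g").apply(lambda sub: type(sub).__name__)

The animal example below relies on exactly this access to all columns:

..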
ipython:: python - df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(), - 'size': list('SSMMMLL'), - 'weight': [8, 10, 11, 1, 20, 12, 12], - 'adult': [False] * 5 + [True] * 2}) + df = pd.DataFrame( + { + "animal": "cat dog cat fish dog cat cat".split(), + "size": list("SSMMMLL"), + "weight": [8, 10, 11, 1, 20, 12, 12], + "adult": [False] * 5 + [True] * 2, + } + ) df # List the size of the animals with the highest weight. - df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ .. ipython:: python - gb = df.groupby(['animal']) - gb.get_group('cat') + gb = df.groupby(["animal"]) + gb.get_group("cat") `Apply to different items in a group `__ @@ -475,12 +477,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python def GrowUp(x): - avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) - avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) - avg_weight += sum(x[x['size'] == 'L'].weight) + avg_weight = sum(x[x["size"] == "S"].weight * 1.5) + avg_weight += sum(x[x["size"] == "M"].weight * 1.25) + avg_weight += sum(x[x["size"] == "L"].weight) avg_weight /= len(x) - return pd.Series(['L', avg_weight, True], - index=['size', 'weight', 'adult']) + return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) + expected_df = gb.apply(GrowUp) expected_df @@ -492,12 +494,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to S = pd.Series([i / 100.0 for i in range(1, 11)]) + def cum_ret(x, y): return x * (1 + y) + def red(x): return functools.reduce(cum_ret, x, 1.0) + S.expanding().apply(red, raw=True) @@ -506,13 +511,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]}) - gb = df.groupby('A') + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + def replace(g): mask = g < 0 return g.where(mask, g[~mask].mean()) + gb.transform(replace) `Sort groups by aggregated data @@ -520,13 +527,17 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'code': ['foo', 'bar', 'baz'] * 2, - 'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62], - 'flag': [False, True] * 3}) + df = pd.DataFrame( + { + "code": ["foo", "bar", "baz"] * 2, + "data": [0.16, -0.21, 0.33, 0.45, -0.59, 0.62], + "flag": [False, True] * 3, + } + ) - code_groups = df.groupby('code') + code_groups = df.groupby("code") - agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data') + agg_n_sort_order = code_groups[["data"]].transform(sum).sort_values(by="data") sorted_df = df.loc[agg_n_sort_order.index] @@ -537,15 +548,17 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - rng = pd.date_range(start="2014-10-07", periods=10, freq='2min') + rng = pd.date_range(start="2014-10-07", periods=10, freq="2min") ts = pd.Series(data=list(range(10)), index=rng) + def MyCust(x): if len(x) > 2: return x[1] * 1.234 return pd.NaT - mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust} + + mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust} ts.resample("5min").apply(mhc) ts @@ -554,10 +567,11 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
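ipython:: python

    # A hedged sketch (toy data): transform returns a result aligned to the
    # original index, which is why it can be assigned straight back as a
    # new column.
    d = pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]})
    d.groupby("g")["x"].transform("sum")

The count-per-group recipe below uses the same alignment:

..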
ipython:: python - df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(), - 'Value': [100, 150, 50, 50]}) + df = pd.DataFrame( + {"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]} + ) df - df['Counts'] = df.groupby(['Color']).transform(len) + df["Counts"] = df.groupby(["Color"]).transform(len) df `Shift groups of the values in a column based on the index @@ -565,13 +579,19 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'line_race': [10, 10, 8, 10, 10, 8], - 'beyer': [99, 102, 103, 103, 88, 100]}, - index=['Last Gunfighter', 'Last Gunfighter', - 'Last Gunfighter', 'Paynter', 'Paynter', - 'Paynter']) + df = pd.DataFrame( + {"line_race": [10, 10, 8, 10, 10, 8], "beyer": [99, 102, 103, 103, 88, 100]}, + index=[ + "Last Gunfighter", + "Last Gunfighter", + "Last Gunfighter", + "Paynter", + "Paynter", + "Paynter", + ], + ) df - df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) + df["beyer_shifted"] = df.groupby(level=0)["beyer"].shift(1) df `Select row with maximum value from each group @@ -579,11 +599,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'host': ['other', 'other', 'that', 'this', 'this'], - 'service': ['mail', 'web', 'mail', 'mail', 'web'], - 'no': [1, 2, 1, 2, 1]}).set_index(['host', 'service']) - mask = df.groupby(level=0).agg('idxmax') - df_count = df.loc[mask['no']].reset_index() + df = pd.DataFrame( + { + "host": ["other", "other", "that", "this", "this"], + "service": ["mail", "web", "mail", "mail", "web"], + "no": [1, 2, 1, 2, 1], + } + ).set_index(["host", "service"]) + mask = df.groupby(level=0).agg("idxmax") + df_count = df.loc[mask["no"]].reset_index() df_count `Grouping like Python's itertools.groupby @@ -591,9 +615,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A']) - df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups - df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum() + df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=["A"]) + df["A"].groupby((df["A"] != df["A"].shift()).cumsum()).groups + df["A"].groupby((df["A"] != df["A"].shift()).cumsum()).cumsum() Expanding data ************** @@ -617,12 +641,23 @@ Create a list of dataframes, split using a delineation based on logic included i .. ipython:: python - df = pd.DataFrame(data={'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', - 'A'], - 'Data': np.random.randn(9)}) - - dfs = list(zip(*df.groupby((1 * (df['Case'] == 'B')).cumsum() - .rolling(window=3, min_periods=1).median())))[-1] + df = pd.DataFrame( + data={ + "Case": ["A", "A", "A", "B", "A", "A", "B", "A", "A"], + "Data": np.random.randn(9), + } + ) + + dfs = list( + zip( + *df.groupby( + (1 * (df["Case"] == "B")) + .cumsum() + .rolling(window=3, min_periods=1) + .median() + ) + ) + )[-1] dfs[0] dfs[1] @@ -639,14 +674,30 @@ The :ref:`Pivot ` docs. .. 
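ipython:: python

    # A hedged aside (toy data): for plain frequency counts, pd.crosstab is
    # often a shorter route than pd.pivot_table.
    d = pd.DataFrame(
        {"Gender": ["F", "M", "F", "M"], "Passed": ["yes", "no", "yes", "yes"]}
    )
    pd.crosstab(d["Gender"], d["Passed"])

For sums with subtotals, ``pivot_table`` with ``margins=True`` applies:

..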
ipython:: python - df = pd.DataFrame(data={'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'], - 'City': ['Toronto', 'Montreal', 'Vancouver', - 'Calgary', 'Edmonton', 'Winnipeg', - 'Windsor'], - 'Sales': [13, 6, 16, 8, 4, 3, 1]}) - table = pd.pivot_table(df, values=['Sales'], index=['Province'], - columns=['City'], aggfunc=np.sum, margins=True) - table.stack('City') + df = pd.DataFrame( + data={ + "Province": ["ON", "QC", "BC", "AL", "AL", "MN", "ON"], + "City": [ + "Toronto", + "Montreal", + "Vancouver", + "Calgary", + "Edmonton", + "Winnipeg", + "Windsor", + ], + "Sales": [13, 6, 16, 8, 4, 3, 1], + } + ) + table = pd.pivot_table( + df, + values=["Sales"], + index=["Province"], + columns=["City"], + aggfunc=np.sum, + margins=True, + ) + table.stack("City") `Frequency table like plyr in R `__ @@ -654,25 +705,71 @@ The :ref:`Pivot ` docs. .. ipython:: python grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78] - df = pd.DataFrame({'ID': ["x%d" % r for r in range(10)], - 'Gender': ['F', 'M', 'F', 'M', 'F', - 'M', 'F', 'M', 'M', 'M'], - 'ExamYear': ['2007', '2007', '2007', '2008', '2008', - '2008', '2008', '2009', '2009', '2009'], - 'Class': ['algebra', 'stats', 'bio', 'algebra', - 'algebra', 'stats', 'stats', 'algebra', - 'bio', 'bio'], - 'Participated': ['yes', 'yes', 'yes', 'yes', 'no', - 'yes', 'yes', 'yes', 'yes', 'yes'], - 'Passed': ['yes' if x > 50 else 'no' for x in grades], - 'Employed': [True, True, True, False, - False, False, False, True, True, False], - 'Grade': grades}) - - df.groupby('ExamYear').agg({'Participated': lambda x: x.value_counts()['yes'], - 'Passed': lambda x: sum(x == 'yes'), - 'Employed': lambda x: sum(x), - 'Grade': lambda x: sum(x) / len(x)}) + df = pd.DataFrame( + { + "ID": ["x%d" % r for r in range(10)], + "Gender": ["F", "M", "F", "M", "F", "M", "F", "M", "M", "M"], + "ExamYear": [ + "2007", + "2007", + "2007", + "2008", + "2008", + "2008", + "2008", + "2009", + "2009", + "2009", + ], + "Class": [ + "algebra", + "stats", + "bio", + "algebra", + "algebra", + "stats", + "stats", + "algebra", + "bio", + "bio", + ], + "Participated": [ + "yes", + "yes", + "yes", + "yes", + "no", + "yes", + "yes", + "yes", + "yes", + "yes", + ], + "Passed": ["yes" if x > 50 else "no" for x in grades], + "Employed": [ + True, + True, + True, + False, + False, + False, + False, + True, + True, + False, + ], + "Grade": grades, + } + ) + + df.groupby("ExamYear").agg( + { + "Participated": lambda x: x.value_counts()["yes"], + "Passed": lambda x: sum(x == "yes"), + "Employed": lambda x: sum(x), + "Grade": lambda x: sum(x) / len(x), + } + ) `Plot pandas DataFrame with year over year data `__ @@ -681,11 +778,14 @@ To create year and month cross tabulation: .. ipython:: python - df = pd.DataFrame({'value': np.random.randn(36)}, - index=pd.date_range('2011-01-01', freq='M', periods=36)) + df = pd.DataFrame( + {"value": np.random.randn(36)}, + index=pd.date_range("2011-01-01", freq="M", periods=36), + ) - pd.pivot_table(df, index=df.index.month, columns=df.index.year, - values='value', aggfunc='sum') + pd.pivot_table( + df, index=df.index.month, columns=df.index.year, values="value", aggfunc="sum" + ) Apply ***** @@ -695,15 +795,22 @@ Apply .. 
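ipython:: python

    # A brief refresher (throwaway frame): DataFrame.apply maps over columns
    # by default (axis=0) and over rows with axis=1.
    d = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    d.apply(sum)
    d.apply(sum, axis=1)

The first recipe below applies a function that expands embedded lists:

..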
ipython:: python - df = pd.DataFrame(data={'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]], - 'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']]}, - index=['I', 'II', 'III']) + df = pd.DataFrame( + data={ + "A": [[2, 4, 8, 16], [100, 200], [10, 20, 30]], + "B": [["a", "b", "c"], ["jj", "kk"], ["ccc"]], + }, + index=["I", "II", "III"], + ) + def SeriesFromSubList(aList): return pd.Series(aList) - df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) - for ind, row in df.iterrows()}) + + df_orgz = pd.concat( + {ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()} + ) df_orgz `Rolling apply with a DataFrame returning a Series @@ -713,17 +820,25 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc .. ipython:: python - df = pd.DataFrame(data=np.random.randn(2000, 2) / 10000, - index=pd.date_range('2001-01-01', periods=2000), - columns=['A', 'B']) + df = pd.DataFrame( + data=np.random.randn(2000, 2) / 10000, + index=pd.date_range("2001-01-01", periods=2000), + columns=["A", "B"], + ) df + def gm(df, const): - v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const + v = ((((df["A"] + df["B"]) + 1).cumprod()) - 1) * const return v.iloc[-1] - s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) - for i in range(len(df) - 50)}) + + s = pd.Series( + { + df.index[i]: gm(df.iloc[i: min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50) + } + ) s `Rolling apply with a DataFrame returning a Scalar @@ -733,20 +848,29 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight .. ipython:: python - rng = pd.date_range(start='2014-01-01', periods=100) - df = pd.DataFrame({'Open': np.random.randn(len(rng)), - 'Close': np.random.randn(len(rng)), - 'Volume': np.random.randint(100, 2000, len(rng))}, - index=rng) + rng = pd.date_range(start="2014-01-01", periods=100) + df = pd.DataFrame( + { + "Open": np.random.randn(len(rng)), + "Close": np.random.randn(len(rng)), + "Volume": np.random.randint(100, 2000, len(rng)), + }, + index=rng, + ) df + def vwap(bars): - return ((bars.Close * bars.Volume).sum() / bars.Volume.sum()) + return (bars.Close * bars.Volume).sum() / bars.Volume.sum() + window = 5 - s = pd.concat([(pd.Series(vwap(df.iloc[i:i + window]), - index=[df.index[i + window]])) - for i in range(len(df) - window)]) + s = pd.concat( + [ + (pd.Series(vwap(df.iloc[i: i + window]), index=[df.index[i + window]])) + for i in range(len(df) - window) + ] + ) s.round(2) Timeseries @@ -765,7 +889,7 @@ Timeseries `__ `Aggregation and plotting time series -`__ +`__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? @@ -778,8 +902,8 @@ Calculate the first day of the month for each entry in a DatetimeIndex .. ipython:: python - dates = pd.date_range('2000-01-01', periods=5) - dates.to_period(freq='M').to_timestamp() + dates = pd.date_range("2000-01-01", periods=5) + dates.to_period(freq="M").to_timestamp() .. _cookbook.resample: @@ -825,8 +949,8 @@ The :ref:`Concat ` docs. The :ref:`Join ` d .. ipython:: python - rng = pd.date_range('2000-01-01', periods=6) - df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C']) + rng = pd.date_range("2000-01-01", periods=6) + df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=["A", "B", "C"]) df2 = df1.copy() Depending on df construction, ``ignore_index`` may be needed @@ -841,17 +965,25 @@ Depending on df construction, ``ignore_index`` may be needed .. 
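ipython:: python

    # A hedged illustration reusing df1/df2 from above: concatenating frames
    # that share index labels keeps the duplicates unless ignore_index=True
    # renumbers the result.
    pd.concat([df1, df2]).index.is_unique
    pd.concat([df1, df2], ignore_index=True).index.is_unique

..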
ipython:: python - df = pd.DataFrame(data={'Area': ['A'] * 5 + ['C'] * 2, - 'Bins': [110] * 2 + [160] * 3 + [40] * 2, - 'Test_0': [0, 1, 0, 1, 2, 0, 1], - 'Data': np.random.randn(7)}) + df = pd.DataFrame( + data={ + "Area": ["A"] * 5 + ["C"] * 2, + "Bins": [110] * 2 + [160] * 3 + [40] * 2, + "Test_0": [0, 1, 0, 1, 2, 0, 1], + "Data": np.random.randn(7), + } + ) df - df['Test_1'] = df['Test_0'] - 1 + df["Test_1"] = df["Test_0"] - 1 - pd.merge(df, df, left_on=['Bins', 'Area', 'Test_0'], - right_on=['Bins', 'Area', 'Test_1'], - suffixes=('_L', '_R')) + pd.merge( + df, + df, + left_on=["Bins", "Area", "Test_0"], + right_on=["Bins", "Area", "Test_1"], + suffixes=("_L", "_R"), + ) `How to set the index and join `__ @@ -878,7 +1010,7 @@ The :ref:`Plotting ` docs. `Setting x-axis major and minor labels `__ -`Plotting multiple charts in an ipython notebook +`Plotting multiple charts in an IPython Jupyter notebook `__ `Creating a multi-line plot @@ -902,16 +1034,18 @@ The :ref:`Plotting ` docs. .. ipython:: python df = pd.DataFrame( - {'stratifying_var': np.random.uniform(0, 100, 20), - 'price': np.random.normal(100, 5, 20)}) + { + "stratifying_var": np.random.uniform(0, 100, 20), + "price": np.random.normal(100, 5, 20), + } + ) - df['quartiles'] = pd.qcut( - df['stratifying_var'], - 4, - labels=['0-25%', '25-50%', '50-75%', '75-100%']) + df["quartiles"] = pd.qcut( + df["stratifying_var"], 4, labels=["0-25%", "25-50%", "50-75%", "75-100%"] + ) @savefig quartile_boxplot.png - df.boxplot(column='price', by='quartiles') + df.boxplot(column="price", by="quartiles") Data in/out ----------- @@ -973,9 +1107,9 @@ of the individual frames into a list, and then combine the frames in the list us for i in range(3): data = pd.DataFrame(np.random.randn(10, 4)) - data.to_csv('file_{}.csv'.format(i)) + data.to_csv("file_{}.csv".format(i)) - files = ['file_0.csv', 'file_1.csv', 'file_2.csv'] + files = ["file_0.csv", "file_1.csv", "file_2.csv"] result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) You can use the same approach to read all files matching a pattern. Here is an example using ``glob``: @@ -985,7 +1119,7 @@ You can use the same approach to read all files matching a pattern. Here is an import glob import os - files = glob.glob('file_*.csv') + files = glob.glob("file_*.csv") result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) Finally, this strategy will work with the other ``pd.read_*(...)`` functions described in the :ref:`io docs`. @@ -994,7 +1128,7 @@ Finally, this strategy will work with the other ``pd.read_*(...)`` functions des :suppress: for i in range(3): - os.remove('file_{}.csv'.format(i)) + os.remove("file_{}.csv".format(i)) Parsing date components in multi-columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1003,12 +1137,12 @@ Parsing date components in multi-columns is faster with a format .. 
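ipython:: python

    # A hedged aside (``comp`` is an illustrative frame): pd.to_datetime can
    # also assemble datetimes directly from "year"/"month"/"day" component
    # columns.
    comp = pd.DataFrame({"year": [2000, 2001], "month": [1, 2], "day": [3, 4]})
    pd.to_datetime(comp)

The format-string timing comparison:

..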
ipython:: python - i = pd.date_range('20000101', periods=10000) - df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) + i = pd.date_range("20000101", periods=10000) + df = pd.DataFrame({"year": i.year, "month": i.month, "day": i.day}) df.head() + %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') - ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], - x['month'], x['day']), axis=1) + ds = df.apply(lambda x: "%04d%02d%02d" % (x["year"], x["month"], x["day"]), axis=1) ds.head() %timeit pd.to_datetime(ds) @@ -1046,18 +1180,25 @@ Option 1: pass rows explicitly to skip rows from io import StringIO - pd.read_csv(StringIO(data), sep=';', skiprows=[11, 12], - index_col=0, parse_dates=True, header=10) + pd.read_csv( + StringIO(data), + sep=";", + skiprows=[11, 12], + index_col=0, + parse_dates=True, + header=10, + ) Option 2: read column names and then data """"""""""""""""""""""""""""""""""""""""" .. ipython:: python - pd.read_csv(StringIO(data), sep=';', header=10, nrows=10).columns - columns = pd.read_csv(StringIO(data), sep=';', header=10, nrows=10).columns - pd.read_csv(StringIO(data), sep=';', index_col=0, - header=12, parse_dates=True, names=columns) + pd.read_csv(StringIO(data), sep=";", header=10, nrows=10).columns + columns = pd.read_csv(StringIO(data), sep=";", header=10, nrows=10).columns + pd.read_csv( + StringIO(data), sep=";", index_col=0, header=12, parse_dates=True, names=columns + ) .. _cookbook.sql: @@ -1153,18 +1294,18 @@ Storing Attributes to a group node .. ipython:: python df = pd.DataFrame(np.random.randn(8, 3)) - store = pd.HDFStore('test.h5') - store.put('df', df) + store = pd.HDFStore("test.h5") + store.put("df", df) # you can store an arbitrary Python object via pickle - store.get_storer('df').attrs.my_attribute = {'A': 10} - store.get_storer('df').attrs.my_attribute + store.get_storer("df").attrs.my_attribute = {"A": 10} + store.get_storer("df").attrs.my_attribute .. ipython:: python :suppress: store.close() - os.remove('test.h5') + os.remove("test.h5") You can create or load an HDFStore in-memory by passing the ``driver`` parameter to PyTables. Changes are only written to disk when the HDFStore is closed. .. ipython:: python - store = pd.HDFStore('test.h5', 'w', diver='H5FD_CORE') + store = pd.HDFStore("test.h5", "w", driver="H5FD_CORE") df = pd.DataFrame(np.random.randn(8, 3)) - store['test'] = df + store["test"] = df # only after closing the store, data is written to disk: store.close() @@ -1183,7 +1324,7 @@ is closed. .. ipython:: python :suppress: - os.remove('test.h5') + os.remove("test.h5") .. _cookbook.binary: @@ -1232,15 +1373,14 @@ in the frame: .. code-block:: python - names = 'count', 'avg', 'scale' + names = "count", "avg", "scale" # note that the offsets are larger than the size of the type because of # struct padding offsets = 0, 8, 16 - formats = 'i4', 'f8', 'f4' - dt = np.dtype({'names': names, 'offsets': offsets, 'formats': formats}, - align=True) - df = pd.DataFrame(np.fromfile('binary.dat', dt)) + formats = "i4", "f8", "f4" + dt = np.dtype({"names": names, "offsets": offsets, "formats": formats}, align=True) + df = pd.DataFrame(np.fromfile("binary.dat", dt)) .. note:: @@ -1270,7 +1410,7 @@ Often it's useful to obtain the lower (or upper) triangular form of a correlatio corr_mat.where(mask) -The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation `__ matrix for a `DataFrame` object.
+The ``method`` argument within ``DataFrame.corr`` can accept a callable in addition to the named correlation types. Here we compute the ``distance correlation ``__ matrix for a ``DataFrame`` object. .. ipython:: python @@ -1289,10 +1429,11 @@ The `method` argument within `DataFrame.corr` can accept a callable in addition A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean()) B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean()) cov_ab = np.sqrt(np.nansum(A * B)) / n - std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n) - std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n) + std_a = np.sqrt(np.sqrt(np.nansum(A ** 2)) / n) + std_b = np.sqrt(np.sqrt(np.nansum(B ** 2)) / n) return cov_ab / std_a / std_b + df = pd.DataFrame(np.random.normal(size=(100, 3))) df.corr(method=distcorr) @@ -1308,7 +1449,7 @@ The :ref:`Timedeltas ` docs. import datetime - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")) s - s.max() @@ -1329,12 +1470,12 @@ The :ref:`Timedeltas ` docs. deltas = pd.Series([datetime.timedelta(days=i) for i in range(3)]) - df = pd.DataFrame({'A': s, 'B': deltas}) + df = pd.DataFrame({"A": s, "B": deltas}) df - df['New Dates'] = df['A'] + df['B'] + df["New Dates"] = df["A"] + df["B"] - df['Delta'] = df['A'] - df['New Dates'] + df["Delta"] = df["A"] - df["New Dates"] df df.dtypes @@ -1365,7 +1506,8 @@ of the data values: rows = itertools.product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys()) - df = expand_grid({'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']}) + + df = expand_grid( + {"height": [60, 70], "weight": [100, 140, 180], "sex": ["Male", "Female"]} + ) df diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 360a14998b227..f2bb99dd2ebc0 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -51,7 +51,7 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s s.index @@ -71,20 +71,20 @@ Series can be instantiated from dicts: .. ipython:: python - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} pd.Series(d) .. note:: When the data is a dict, and an index is not passed, the ``Series`` index will be ordered by the dict's insertion order, if you're using Python - version >= 3.6 and Pandas version >= 0.23. + version >= 3.6 and pandas version >= 0.23. - If you're using Python < 3.6 or Pandas < 0.23, and an index is not passed, + If you're using Python < 3.6 or pandas < 0.23, and an index is not passed, the ``Series`` index will be the lexically ordered list of dict keys. In the example above, if you were on a Python version lower than 3.6 or a -Pandas version lower than 0.23, the ``Series`` would be ordered by the lexical +pandas version lower than 0.23, the ``Series`` would be ordered by the lexical order of the dict keys (i.e. ``['a', 'b', 'c']`` rather than ``['b', 'a', 'c']``). If an index is passed, the values in data corresponding to the labels in the @@ -92,9 +92,9 @@ index will be pulled out. .. ipython:: python - d = {'a': 0., 'b': 1., 'c': 2.} + d = {"a": 0.0, "b": 1.0, "c": 2.0} pd.Series(d) - pd.Series(d, index=['b', 'c', 'd', 'a']) + pd.Series(d, index=["b", "c", "d", "a"]) .. note:: @@ -107,7 +107,7 @@ provided. 
The value will be repeated to match the length of **index**. .. ipython:: python - pd.Series(5., index=['a', 'b', 'c', 'd', 'e']) + pd.Series(5.0, index=["a", "b", "c", "d", "e"]) Series is ndarray-like ~~~~~~~~~~~~~~~~~~~~~~ @@ -151,7 +151,7 @@ index (to disable :ref:`automatic alignment `, for example). :attr:`Series.array` will always be an :class:`~pandas.api.extensions.ExtensionArray`. Briefly, an ExtensionArray is a thin wrapper around one or more *concrete* arrays like a -:class:`numpy.ndarray`. Pandas knows how to take an ``ExtensionArray`` and +:class:`numpy.ndarray`. pandas knows how to take an ``ExtensionArray`` and store it in a ``Series`` or a column of a ``DataFrame``. See :ref:`basics.dtypes` for more. @@ -173,26 +173,26 @@ label: .. ipython:: python - s['a'] - s['e'] = 12. + s["a"] + s["e"] = 12.0 s - 'e' in s - 'f' in s + "e" in s + "f" in s If a label is not contained, an exception is raised: .. code-block:: python - >>> s['f'] + >>> s["f"] KeyError: 'f' Using the ``get`` method, a missing label will return None or specified default: .. ipython:: python - s.get('f') + s.get("f") - s.get('f', np.nan) + s.get("f", np.nan) See also the :ref:`section on attribute access`. @@ -244,7 +244,7 @@ Series can also have a ``name`` attribute: .. ipython:: python - s = pd.Series(np.random.randn(5), name='something') + s = pd.Series(np.random.randn(5), name="something") s s.name @@ -290,9 +290,9 @@ based on common sense rules. When the data is a dict, and ``columns`` is not specified, the ``DataFrame`` columns will be ordered by the dict's insertion order, if you are using - Python version >= 3.6 and Pandas >= 0.23. + Python version >= 3.6 and pandas >= 0.23. - If you are using Python < 3.6 or Pandas < 0.23, and ``columns`` is not + If you are using Python < 3.6 or pandas < 0.23, and ``columns`` is not specified, the ``DataFrame`` columns will be the lexically ordered list of dict keys. @@ -306,13 +306,15 @@ keys. .. ipython:: python - d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), - 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + d = { + "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), + "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]), + } df = pd.DataFrame(d) df - pd.DataFrame(d, index=['d', 'b', 'a']) - pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']) + pd.DataFrame(d, index=["d", "b", "a"]) + pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"]) The row and column labels can be accessed respectively by accessing the **index** and **columns** attributes: @@ -336,10 +338,9 @@ result will be ``range(n)``, where ``n`` is the array length. .. ipython:: python - d = {'one': [1., 2., 3., 4.], - 'two': [4., 3., 2., 1.]} + d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} pd.DataFrame(d) - pd.DataFrame(d, index=['a', 'b', 'c', 'd']) + pd.DataFrame(d, index=["a", "b", "c", "d"]) From structured or record array ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -348,12 +349,12 @@ This case is handled identically to a dict of arrays. .. ipython:: python - data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) - data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) - pd.DataFrame(data, index=['first', 'second']) - pd.DataFrame(data, columns=['C', 'A', 'B']) + pd.DataFrame(data, index=["first", "second"]) + pd.DataFrame(data, columns=["C", "A", "B"]) .. 
note:: @@ -367,10 +368,10 @@ From a list of dicts .. ipython:: python - data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] + data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}] pd.DataFrame(data2) - pd.DataFrame(data2, index=['first', 'second']) - pd.DataFrame(data2, columns=['a', 'b']) + pd.DataFrame(data2, index=["first", "second"]) + pd.DataFrame(data2, columns=["a", "b"]) .. _basics.dataframe.from_dict_of_tuples: @@ -382,11 +383,15 @@ dictionary. .. ipython:: python - pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, - ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, - ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, - ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, - ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + pd.DataFrame( + { + ("a", "b"): {("A", "B"): 1, ("A", "C"): 2}, + ("a", "a"): {("A", "C"): 3, ("A", "B"): 4}, + ("a", "c"): {("A", "B"): 5, ("A", "C"): 6}, + ("b", "a"): {("A", "C"): 7, ("A", "B"): 8}, + ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}, + } + ) .. _basics.dataframe.from_series: @@ -397,6 +402,32 @@ The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided). + +.. _basics.dataframe.from_list_namedtuples: + +From a list of namedtuples +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The field names of the first ``namedtuple`` in the list determine the columns +of the ``DataFrame``. The remaining namedtuples (or tuples) are simply unpacked +and their values are fed into the rows of the ``DataFrame``. If any of those +tuples is shorter than the first ``namedtuple`` then the later columns in the +corresponding row are marked as missing values. If any are longer than the +first ``namedtuple``, a ``ValueError`` is raised. + +.. ipython:: python + + from collections import namedtuple + + Point = namedtuple("Point", "x y") + + pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]) + + Point3D = namedtuple("Point3D", "x y z") + + pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]) + + .. _basics.dataframe.from_list_dataclasses: From a list of dataclasses @@ -408,7 +439,7 @@ Data Classes as introduced in `PEP557 can be passed into the DataFrame constructor. Passing a list of dataclasses is equivalent to passing a list of dictionaries. -Please be aware, that that all values in the list should be dataclasses, mixing +Please be aware, that all values in the list should be dataclasses, mixing types in the list would result in a TypeError. .. ipython:: python @@ -442,15 +473,18 @@ set to ``'index'`` in order to use the dict keys as row labels. .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])) + pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])) If you pass ``orient='index'``, the keys will be the row labels. In this case, you can also pass the desired column names: .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), - orient='index', columns=['one', 'two', 'three']) + pd.DataFrame.from_dict( + dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]), + orient="index", + columns=["one", "two", "three"], + ) .. _basics.dataframe.from_records: @@ -464,7 +498,7 @@ dtype. For example: .. ipython:: python data - pd.DataFrame.from_records(data, index='C') + pd.DataFrame.from_records(data, index="C") .. _basics.dataframe.sel_add_del: @@ -477,17 +511,17 @@ the analogous dict operations: .. 
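To make the dict analogy concrete, here is the same sequence of operations on a plain Python ``dict``; the ``DataFrame`` version follows below:

.. code-block:: python

   d = {"one": [1.0, 2.0, 3.0], "two": [4.0, 3.0, 2.0]}
   d["three"] = [x * y for x, y in zip(d["one"], d["two"])]  # setting
   del d["two"]                                              # deleting
   three = d.pop("three")                                    # popping

..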
ipython:: python - df['one'] - df['three'] = df['one'] * df['two'] - df['flag'] = df['one'] > 2 + df["one"] + df["three"] = df["one"] * df["two"] + df["flag"] = df["one"] > 2 df Columns can be deleted or popped like with a dict: .. ipython:: python - del df['two'] - three = df.pop('three') + del df["two"] + three = df.pop("three") df When inserting a scalar value, it will naturally be propagated to fill the @@ -495,7 +529,7 @@ column: .. ipython:: python - df['foo'] = 'bar' + df["foo"] = "bar" df When inserting a Series that does not have the same index as the DataFrame, it @@ -503,7 +537,7 @@ will be conformed to the DataFrame's index: .. ipython:: python - df['one_trunc'] = df['one'][:2] + df["one_trunc"] = df["one"][:2] df You can insert raw ndarrays but their length must match the length of the @@ -514,7 +548,7 @@ available to insert at a particular location in the columns: .. ipython:: python - df.insert(1, 'bar', df['one']) + df.insert(1, "bar", df["one"]) df .. _dsintro.chained_assignment: @@ -530,17 +564,16 @@ derived from existing columns. .. ipython:: python - iris = pd.read_csv('data/iris.data') + iris = pd.read_csv("data/iris.data") iris.head() - (iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength']) - .head()) + iris.assign(sepal_ratio=iris["SepalWidth"] / iris["SepalLength"]).head() In the example above, we inserted a precomputed value. We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to. .. ipython:: python - iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] / x['SepalLength'])).head() + iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head() ``assign`` **always** returns a copy of the data, leaving the original DataFrame untouched. @@ -554,10 +587,14 @@ greater than 5, calculate the ratio, and plot: .. ipython:: python @savefig basics_assign.png - (iris.query('SepalLength > 5') - .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength, - PetalRatio=lambda x: x.PetalWidth / x.PetalLength) - .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + ( + iris.query("SepalLength > 5") + .assign( + SepalRatio=lambda x: x.SepalWidth / x.SepalLength, + PetalRatio=lambda x: x.PetalWidth / x.PetalLength, + ) + .plot(kind="scatter", x="SepalRatio", y="PetalRatio") + ) Since a function is passed in, the function is computed on the DataFrame being assigned to. Importantly, this is the DataFrame that's been filtered @@ -571,18 +608,14 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function of one argument to be called on the ``DataFrame``. A *copy* of the original DataFrame is returned, with the new values inserted. -.. versionchanged:: 0.23.0 - Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows for *dependent* assignment, where an expression later in ``**kwargs`` can refer to a column created earlier in the same :meth:`~DataFrame.assign`. .. ipython:: python - dfa = pd.DataFrame({"A": [1, 2, 3], - "B": [4, 5, 6]}) - dfa.assign(C=lambda x: x['A'] + x['B'], - D=lambda x: x['A'] + x['C']) + dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"]) In the second expression, ``x['C']`` will refer to the newly created column, that's equal to ``dfa['A'] + dfa['B']``. @@ -607,7 +640,7 @@ DataFrame: .. ipython:: python - df.loc['b'] + df.loc["b"] df.iloc[2] For a more exhaustive treatment of sophisticated label-based indexing and @@ -626,8 +659,8 @@ union of the column and row labels. .. 
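When unmatched labels should fall back to a default instead of producing ``NaN``, the flexible binary methods take a ``fill_value`` argument; ``DataFrame.add`` is the method form of ``+``. A small sketch ahead of the plain ``+`` example below:

.. code-block:: python

   df1 = pd.DataFrame({"a": [1.0, 2.0]}, index=[0, 1])
   df2 = pd.DataFrame({"a": [10.0]}, index=[1])
   df1 + df2                     # index 0 has no match, so it becomes NaN
   df1.add(df2, fill_value=0.0)  # index 0 keeps 1.0 (missing treated as 0)

..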
ipython:: python - df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C']) + df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"]) df + df2 When doing an operation between DataFrame and Series, the default behavior is @@ -639,31 +672,6 @@ row-wise. For example: df - df.iloc[0] -In the special case of working with time series data, if the DataFrame index -contains dates, the broadcasting will be column-wise: - -.. ipython:: python - :okwarning: - - index = pd.date_range('1/1/2000', periods=8) - df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC')) - df - type(df['A']) - df - df['A'] - -.. warning:: - - .. code-block:: python - - df - df['A'] - - is now deprecated and will be removed in a future release. The preferred way - to replicate this behavior is - - .. code-block:: python - - df.sub(df['A'], axis=0) - For explicit control over the matching and broadcasting behavior, see the section on :ref:`flexible binary operations `. @@ -681,8 +689,8 @@ Boolean operators work as well: .. ipython:: python - df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool) - df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool) + df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool) + df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool) df1 & df2 df1 | df2 df1 ^ df2 @@ -738,8 +746,8 @@ on two :class:`Series` with differently ordered labels will align before the ope .. ipython:: python - ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c']) + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"]) ser1 ser2 np.remainder(ser1, ser2) @@ -749,7 +757,7 @@ with missing values. .. ipython:: python - ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd']) + ser3 = pd.Series([2, 4, 6], index=["b", "c", "d"]) ser3 np.remainder(ser1, ser3) @@ -779,11 +787,11 @@ R package): :suppress: # force a summary to be printed - pd.set_option('display.max_rows', 5) + pd.set_option("display.max_rows", 5) .. ipython:: python - baseball = pd.read_csv('data/baseball.csv') + baseball = pd.read_csv("data/baseball.csv") print(baseball) baseball.info() @@ -792,7 +800,7 @@ R package): :okwarning: # restore GlobalPrintConfig - pd.reset_option(r'^display\.') + pd.reset_option(r"^display\.") However, using ``to_string`` will return a string representation of the DataFrame in tabular form, though it won't always fit the console width: @@ -813,7 +821,7 @@ option: .. ipython:: python - pd.set_option('display.width', 40) # default is 80 + pd.set_option("display.width", 40) # default is 80 pd.DataFrame(np.random.randn(3, 12)) @@ -821,21 +829,25 @@ You can adjust the max width of the individual columns by setting ``display.max_ .. ipython:: python - datafile = {'filename': ['filename_01', 'filename_02'], - 'path': ["media/user_name/storage/folder_01/filename_01", - "media/user_name/storage/folder_02/filename_02"]} + datafile = { + "filename": ["filename_01", "filename_02"], + "path": [ + "media/user_name/storage/folder_01/filename_01", + "media/user_name/storage/folder_02/filename_02", + ], + } - pd.set_option('display.max_colwidth', 30) + pd.set_option("display.max_colwidth", 30) pd.DataFrame(datafile) - pd.set_option('display.max_colwidth', 100) + pd.set_option("display.max_colwidth", 100) pd.DataFrame(datafile) .. 
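Display options can also be changed temporarily: :func:`pandas.option_context` restores the previous values when the block exits, avoiding explicit resets. A sketch reusing the ``datafile`` dict from above:

.. code-block:: python

   with pd.option_context("display.max_colwidth", 10):
       print(pd.DataFrame(datafile))  # paths truncated inside the block
   print(pd.DataFrame(datafile))      # the previous setting is back

..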
ipython:: python :suppress: - pd.reset_option('display.width') - pd.reset_option('display.max_colwidth') + pd.reset_option("display.width") + pd.reset_option("display.max_colwidth") You can also disable this feature via the ``expand_frame_repr`` option. This will print the table in one block. @@ -848,8 +860,7 @@ accessed like an attribute: .. ipython:: python - df = pd.DataFrame({'foo1': np.random.randn(5), - 'foo2': np.random.randn(5)}) + df = pd.DataFrame({"foo1": np.random.randn(5), "foo2": np.random.randn(5)}) df df.foo1 diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst new file mode 100644 index 0000000000000..7cda067fb24ad --- /dev/null +++ b/doc/source/user_guide/duplicates.rst @@ -0,0 +1,205 @@ +.. _duplicates: + +**************** +Duplicate Labels +**************** + +:class:`Index` objects are not required to be unique; you can have duplicate row +or column labels. This may be a bit confusing at first. If you're familiar with +SQL, you know that row labels are similar to a primary key on a table, and you +would never want duplicates in a SQL table. But one of pandas' roles is to clean +messy, real-world data before it goes to some downstream system. And real-world +data has duplicates, even in fields that are supposed to be unique. + +This section describes how duplicate labels change the behavior of certain +operations, and how to prevent duplicates from arising during operations, or +to detect them if they do. + +.. ipython:: python + + import pandas as pd + import numpy as np + +Consequences of Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some pandas methods (:meth:`Series.reindex` for example) just don't work with +duplicates present. The output can't be determined, and so pandas raises. + +.. ipython:: python + :okexcept: + + s1 = pd.Series([0, 1, 2], index=["a", "b", "b"]) + s1.reindex(["a", "b", "c"]) + +Other methods, like indexing, can give very surprising results. Typically +indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame`` +with a scalar will return a ``Series``. Slicing a ``Series`` with a scalar will +return a scalar. But with duplicates, this isn't the case. + +.. ipython:: python + + df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "A", "B"]) + df1 + +We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series`` + +.. ipython:: python + + df1["B"] # a Series + +But slicing ``'A'`` returns a ``DataFrame`` + + +.. ipython:: python + + df1["A"] # a DataFrame + +This applies to row labels as well + +.. ipython:: python + + df2 = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "a", "b"]) + df2 + df2.loc["b", "A"] # a scalar + df2.loc["a", "A"] # a Series + +Duplicate Label Detection +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check whether an :class:`Index` (storing the row or column labels) is +unique with :attr:`Index.is_unique`: + +.. ipython:: python + + df2 + df2.index.is_unique + df2.columns.is_unique + +.. note:: + + Checking whether an index is unique is somewhat expensive for large datasets. + pandas does cache this result, so re-checking on the same index is very fast. + +:meth:`Index.duplicated` will return a boolean ndarray indicating whether a +label is repeated. + +.. ipython:: python + + df2.index.duplicated() + +This can be used as a boolean filter to drop duplicate rows. + ..
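:meth:`Index.duplicated` also accepts a ``keep`` argument (``'first'`` by default, ``'last'``, or ``False``) that controls which occurrences get flagged. A quick sketch before the filtering example below:

.. code-block:: python

   idx = pd.Index(["a", "a", "b"])
   idx.duplicated()            # array([False,  True, False]); first "a" kept
   idx.duplicated(keep=False)  # array([ True,  True, False]); all repeats flagged

..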
ipython:: python + + df2.loc[~df2.index.duplicated(), :] + +If you need additional logic to handle duplicate labels, rather than just +dropping the repeats, using :meth:`~DataFrame.groupby` on the index is a common +trick. For example, we'll resolve duplicates by taking the average of all rows +with the same label. + +.. ipython:: python + + df2.groupby(level=0).mean() + +.. _duplicates.disallow: + +Disallowing Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.2.0 + +As noted above, handling duplicates is an important feature when reading in raw +data. That said, you may want to avoid introducing duplicates as part of a data +processing pipeline (from methods like :meth:`pandas.concat`, +:meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame` +*disallow* duplicate labels by calling ``.set_flags(allows_duplicate_labels=False)``. +(the default is to allow them). If there are duplicate labels, an exception +will be raised. + +.. ipython:: python + :okexcept: + + pd.Series([0, 1, 2], index=["a", "b", "b"]).set_flags(allows_duplicate_labels=False) + +This applies to both row and column labels for a :class:`DataFrame` + +.. ipython:: python + :okexcept: + + pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],).set_flags( + allows_duplicate_labels=False + ) + +This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, +which indicates whether that object can have duplicate labels. + +.. ipython:: python + + df = pd.DataFrame({"A": [0, 1, 2, 3]}, index=["x", "y", "X", "Y"]).set_flags( + allows_duplicate_labels=False + ) + df + df.flags.allows_duplicate_labels + +:meth:`DataFrame.set_flags` can be used to return a new ``DataFrame`` with attributes +like ``allows_duplicate_labels`` set to some value + +.. ipython:: python + + df2 = df.set_flags(allows_duplicate_labels=True) + df2.flags.allows_duplicate_labels + +The new ``DataFrame`` returned is a view on the same data as the old ``DataFrame``. +Or the property can just be set directly on the same object + + +.. ipython:: python + + df2.flags.allows_duplicate_labels = False + df2.flags.allows_duplicate_labels + +When processing raw, messy data you might initially read in the messy data +(which potentially has duplicate labels), deduplicate, and then disallow duplicates +going forward, to ensure that your data pipeline doesn't introduce duplicates. + + +.. code-block:: python + + >>> raw = pd.read_csv("...") + >>> deduplicated = raw.groupby(level=0).first() # remove duplicates + >>> deduplicated.flags.allows_duplicate_labels = False # disallow going forward + +Setting ``allows_duplicate_labels=True`` on a ``Series`` or ``DataFrame`` with duplicate +labels or performing an operation that introduces duplicate labels on a ``Series`` or +``DataFrame`` that disallows duplicates will raise an +:class:`errors.DuplicateLabelError`. + +.. ipython:: python + :okexcept: + + df.rename(str.upper) + +This error message contains the labels that are duplicated, and the numeric positions +of all the duplicates (including the "original") in the ``Series`` or ``DataFrame`` + +Duplicate Label Propagation +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In general, disallowing duplicates is "sticky". It's preserved through +operations. + +.. ipython:: python + :okexcept: + + s1 = pd.Series(0, index=["a", "b"]).set_flags(allows_duplicate_labels=False) + s1 + s1.head().rename({"a": "b"}) + +.. warning:: + + This is an experimental feature. 
Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 24fcb369804c6..42621c032416d 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -48,10 +48,14 @@ We have a ``DataFrame`` to which we want to apply a function row-wise. .. ipython:: python - df = pd.DataFrame({'a': np.random.randn(1000), - 'b': np.random.randn(1000), - 'N': np.random.randint(100, 1000, (1000)), - 'x': 'x'}) + df = pd.DataFrame( + { + "a": np.random.randn(1000), + "b": np.random.randn(1000), + "N": np.random.randint(100, 1000, (1000)), + "x": "x", + } + ) df Here's the function in pure Python: @@ -61,6 +65,7 @@ Here's the function in pure Python: def f(x): return x * (x - 1) + def integrate_f(a, b, N): s = 0 dx = (b - a) / N @@ -72,7 +77,7 @@ We achieve our result by using ``apply`` (row-wise): .. code-block:: ipython - In [7]: %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + In [7]: %timeit df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]), axis=1) 10 loops, best of 3: 174 ms per loop But clearly this isn't fast enough for us. Let's take a look and see where the @@ -81,7 +86,7 @@ four calls) using the `prun ipython magic function `__ +by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools +` for an extensive example. + Vectorize ~~~~~~~~~ @@ -396,15 +400,15 @@ Consider the following toy example of doubling each observation: .. code-block:: ipython # Custom function without numba - In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba) # noqa E501 + In [5]: %timeit df["col1_doubled"] = df["a"].apply(double_every_value_nonumba) # noqa E501 1000 loops, best of 3: 797 us per loop # Standard implementation (faster than a custom function) - In [6]: %timeit df['col1_doubled'] = df['a'] * 2 + In [6]: %timeit df["col1_doubled"] = df["a"] * 2 1000 loops, best of 3: 233 us per loop # Custom function with numba - In [7]: %timeit df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy()) + In [7]: %timeit df["col1_doubled"] = double_every_value_withnumba(df["a"].to_numpy()) 1000 loops, best of 3: 145 us per loop Caveats @@ -480,10 +484,10 @@ These operations are supported by :func:`pandas.eval`: * ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)`` * Attribute access, e.g., ``df.a`` * Subscript expressions, e.g., ``df[0]`` -* Simple variable evaluation, e.g., ``pd.eval('df')`` (this is not very useful) -* Math functions: `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, - `sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`, `arccos`, `arctan`, `arccosh`, - `arcsinh`, `arctanh`, `abs`, `arctan2` and `log10`. +* Simple variable evaluation, e.g., ``pd.eval("df")`` (this is not very useful) +* Math functions: ``sin``, ``cos``, ``exp``, ``log``, ``expm1``, ``log1p``, + ``sqrt``, ``sinh``, ``cosh``, ``tanh``, ``arcsin``, ``arccos``, ``arctan``, ``arccosh``, + ``arcsinh``, ``arctanh``, ``abs``, ``arctan2`` and ``log10``. This Python syntax is **not** allowed: @@ -530,7 +534,7 @@ Now let's compare adding them together using plain ol' Python versus .. 
ipython:: python - %timeit pd.eval('df1 + df2 + df3 + df4') + %timeit pd.eval("df1 + df2 + df3 + df4") Now let's do the same thing but with comparisons: @@ -541,7 +545,7 @@ Now let's do the same thing but with comparisons: .. ipython:: python - %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') + %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)") :func:`~pandas.eval` also works with unaligned pandas objects: @@ -553,7 +557,7 @@ Now let's do the same thing but with comparisons: .. ipython:: python - %timeit pd.eval('df1 + df2 + df3 + df4 + s') + %timeit pd.eval("df1 + df2 + df3 + df4 + s") .. note:: @@ -580,19 +584,19 @@ evaluate an expression in the "context" of a :class:`~pandas.DataFrame`. :suppress: try: - del a + del a except NameError: - pass + pass try: - del b + del b except NameError: - pass + pass .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b']) - df.eval('a + b') + df = pd.DataFrame(np.random.randn(5, 2), columns=["a", "b"]) + df.eval("a + b") Any expression that is a valid :func:`pandas.eval` expression is also a valid :meth:`DataFrame.eval` expression, with the added benefit that you don't have to @@ -610,9 +614,9 @@ on the original ``DataFrame`` or return a copy with the new column. .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df.eval('c = a + b', inplace=True) - df.eval('d = a + b + c', inplace=True) - df.eval('a = 1', inplace=True) + df.eval("c = a + b", inplace=True) + df.eval("d = a + b + c", inplace=True) + df.eval("a = 1", inplace=True) df When ``inplace`` is set to ``False``, the default, a copy of the ``DataFrame`` with the @@ -621,7 +625,7 @@ new or modified columns is returned and the original frame is unchanged. .. ipython:: python df - df.eval('e = a - c', inplace=False) + df.eval("e = a - c", inplace=False) df As a convenience, multiple assignments can be performed by using a @@ -629,19 +633,22 @@ multi-line string. .. ipython:: python - df.eval(""" + df.eval( + """ c = a + b d = a + b + c - a = 1""", inplace=False) + a = 1""", + inplace=False, + ) The equivalent in standard Python would be .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df['c'] = df['a'] + df['b'] - df['d'] = df['a'] + df['b'] + df['c'] - df['a'] = 1 + df["c"] = df["a"] + df["b"] + df["d"] = df["a"] + df["b"] + df["c"] + df["a"] = 1 df The ``query`` method has a ``inplace`` keyword which determines @@ -650,8 +657,8 @@ whether the query modifies the original frame. .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df.query('a > 2') - df.query('a > 2', inplace=True) + df.query("a > 2") + df.query("a > 2", inplace=True) df Local variables @@ -662,10 +669,10 @@ expression by placing the ``@`` character in front of the name. For example, .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 2), columns=list('ab')) + df = pd.DataFrame(np.random.randn(5, 2), columns=list("ab")) newcol = np.random.randn(len(df)) - df.eval('b + @newcol') - df.query('b < @newcol') + df.eval("b + @newcol") + df.query("b < @newcol") If you don't prefix the local variable with ``@``, pandas will raise an exception telling you the variable is undefined. @@ -678,25 +685,25 @@ name in an expression. .. 
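As a sketch of the two spellings on a hypothetical frame: with the ``@`` prefix the name is looked up among local variables, without it pandas looks for a column (the same-name case is shown below):

.. code-block:: python

   df = pd.DataFrame({"b": range(3)})
   n = 1
   df.query("b > @n")   # refers to the local variable n
   # df.query("b > n")  # no column named n: raises an "undefined variable" error

..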
ipython:: python a = np.random.randn() - df.query('@a < a') - df.loc[a < df['a']] # same as the previous expression + df.query("@a < a") + df.loc[a < df["a"]] # same as the previous expression With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it -isn't defined in that context. ``pandas`` will let you know this if you try to +isn't defined in that context. pandas will let you know this if you try to use ``@`` in a top-level call to :func:`pandas.eval`. For example, .. ipython:: python :okexcept: a, b = 1, 2 - pd.eval('@a + b') + pd.eval("@a + b") In this case, you should simply refer to the variables like you would in standard Python. .. ipython:: python - pd.eval('a + b') + pd.eval("a + b") :func:`pandas.eval` parsers @@ -716,10 +723,10 @@ semantics. .. ipython:: python - expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' - x = pd.eval(expr, parser='python') - expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0' - y = pd.eval(expr_no_parens, parser='pandas') + expr = "(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)" + x = pd.eval(expr, parser="python") + expr_no_parens = "df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0" + y = pd.eval(expr_no_parens, parser="pandas") np.all(x == y) @@ -728,10 +735,10 @@ well: .. ipython:: python - expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' - x = pd.eval(expr, parser='python') - expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0' - y = pd.eval(expr_with_ands, parser='pandas') + expr = "(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)" + x = pd.eval(expr, parser="python") + expr_with_ands = "df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0" + y = pd.eval(expr_with_ands, parser="pandas") np.all(x == y) @@ -761,7 +768,7 @@ is a bit slower (not by much) than evaluating the same expression in Python .. ipython:: python - %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') + %timeit pd.eval("df1 + df2 + df3 + df4", engine="python") :func:`pandas.eval` performance @@ -805,10 +812,11 @@ you have an expression--for example .. ipython:: python - df = pd.DataFrame({'strings': np.repeat(list('cba'), 3), - 'nums': np.repeat(range(3), 3)}) + df = pd.DataFrame( + {"strings": np.repeat(list("cba"), 3), "nums": np.repeat(range(3), 3)} + ) df - df.query('strings == "a" and nums == 1') + df.query("strings == 'a' and nums == 1") the numeric part of the comparison (``nums == 1``) will be evaluated by ``numexpr``. diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index a96c70405d859..07c856c96426d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -21,12 +21,19 @@ when calling :meth:`~DataFrame.info`: .. ipython:: python - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] n = 5000 data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes} df = pd.DataFrame(data) - df['categorical'] = df['object'].astype('category') + df["categorical"] = df["object"].astype("category") df.info() @@ -40,7 +47,7 @@ as it can be expensive to do this deeper introspection. .. ipython:: python - df.info(memory_usage='deep') + df.info(memory_usage="deep") By default the display option is set to ``True`` but can be explicitly overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. @@ -155,7 +162,7 @@ index, not membership among the values. .. 
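To test membership among the *values* instead, use :meth:`Series.isin` or test against ``s.values``; a sketch ahead of the index-membership example below:

.. code-block:: python

   s = pd.Series(range(5), index=list("abcde"))
   2 in s.values      # True: value membership
   s.isin([2]).any()  # True: the vectorized equivalent
   "b" in s           # True, but this checks the *index*, not the values

..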
ipython:: python - s = pd.Series(range(5), index=list('abcde')) + s = pd.Series(range(5), index=list("abcde")) 2 in s 'b' in s @@ -206,11 +213,11 @@ arrays. For example: .. ipython:: python - s = pd.Series([1, 2, 3, 4, 5], index=list('abcde')) + s = pd.Series([1, 2, 3, 4, 5], index=list("abcde")) s s.dtype - s2 = s.reindex(['a', 'b', 'c', 'f', 'u']) + s2 = s.reindex(["a", "b", "c", "f", "u"]) s2 s2.dtype @@ -227,12 +234,11 @@ the nullable-integer extension dtypes provided by pandas .. ipython:: python - s_int = pd.Series([1, 2, 3, 4, 5], index=list('abcde'), - dtype=pd.Int64Dtype()) + s_int = pd.Series([1, 2, 3, 4, 5], index=list("abcde"), dtype=pd.Int64Dtype()) s_int s_int.dtype - s2_int = s_int.reindex(['a', 'b', 'c', 'f', 'u']) + s2_int = s_int.reindex(["a", "b", "c", "f", "u"]) s2_int s2_int.dtype @@ -334,7 +340,7 @@ constructors using something similar to the following: .. ipython:: python - x = np.array(list(range(10)), '>i4') # big endian + x = np.array(list(range(10)), ">i4") # big endian newx = x.byteswap().newbyteorder() # force native byteorder s = pd.Series(newx) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ddba3dc452e28..d6081155b58db 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -68,30 +68,32 @@ object (more on what the GroupBy object is later), you may do the following: .. ipython:: python - df = pd.DataFrame([('bird', 'Falconiformes', 389.0), - ('bird', 'Psittaciformes', 24.0), - ('mammal', 'Carnivora', 80.2), - ('mammal', 'Primates', np.nan), - ('mammal', 'Carnivora', 58)], - index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'], - columns=('class', 'order', 'max_speed')) + df = pd.DataFrame( + [ + ("bird", "Falconiformes", 389.0), + ("bird", "Psittaciformes", 24.0), + ("mammal", "Carnivora", 80.2), + ("mammal", "Primates", np.nan), + ("mammal", "Carnivora", 58), + ], + index=["falcon", "parrot", "lion", "monkey", "leopard"], + columns=("class", "order", "max_speed"), + ) df # default is axis=0 - grouped = df.groupby('class') - grouped = df.groupby('order', axis='columns') - grouped = df.groupby(['class', 'order']) + grouped = df.groupby("class") + grouped = df.groupby("order", axis="columns") + grouped = df.groupby(["class", "order"]) The mapping can be specified many different ways: * A Python function, to be called on each of the axis labels. * A list or NumPy array of the same length as the selected axis. * A dict or ``Series``, providing a ``label -> group name`` mapping. -* For ``DataFrame`` objects, a string indicating a column to be used to group. - Of course ``df.groupby('A')`` is just syntactic sugar for - ``df.groupby(df['A'])``, but it makes life simpler. -* For ``DataFrame`` objects, a string indicating an index level to be used to - group. +* For ``DataFrame`` objects, a string indicating either a column name or + an index level name to be used to group. +* ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``. * A list of any of the above things. Collectively we refer to the grouping objects as the **keys**. For example, @@ -105,12 +107,14 @@ consider the following ``DataFrame``: .. 
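Before that example, one note: of the key types above, grouping by a plain Python function is the least obvious. The function is called once per axis label, and its return values become the group names. A minimal sketch with hypothetical labels (the column-key case follows below):

.. code-block:: python

   s = pd.Series([1, 2, 3, 4], index=["ax", "ay", "bx", "by"])
   s.groupby(lambda label: label[0]).sum()  # two groups: "a" and "b"

..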
ipython:: python - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. @@ -118,8 +122,8 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: .. ipython:: python - grouped = df.groupby('A') - grouped = df.groupby(['A', 'B']) + grouped = df.groupby("A") + grouped = df.groupby(["A", "B"]) .. versionadded:: 0.24 @@ -128,8 +132,8 @@ but the specified columns .. ipython:: python - df2 = df.set_index(['A', 'B']) - grouped = df2.groupby(level=df2.index.names.difference(['B'])) + df2 = df.set_index(["A", "B"]) + grouped = df2.groupby(level=df2.index.names.difference(["B"])) grouped.sum() These will split the DataFrame on its index (rows). We could also split by the @@ -183,9 +187,9 @@ By default the group keys are sorted during the ``groupby`` operation. You may h .. ipython:: python - df2 = pd.DataFrame({'X': ['B', 'B', 'A', 'A'], 'Y': [1, 2, 3, 4]}) - df2.groupby(['X']).sum() - df2.groupby(['X'], sort=False).sum() + df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]}) + df2.groupby(["X"]).sum() + df2.groupby(["X"], sort=False).sum() Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. @@ -193,10 +197,10 @@ For example, the groups created by ``groupby()`` below are in the order they app .. ipython:: python - df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]}) - df3.groupby(['X']).get_group('A') + df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + df3.groupby(["X"]).get_group("A") - df3.groupby(['X']).get_group('B') + df3.groupby(["X"]).get_group("B") .. _groupby.dropna: @@ -218,10 +222,10 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna .. ipython:: python - # Default `dropna` is set to True, which will exclude NaNs in keys + # Default ``dropna`` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() - # In order to allow NaN in keys, set `dropna` to False + # In order to allow NaN in keys, set ``dropna`` to False df_dropna.groupby(by=["b"], dropna=False).sum() The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. @@ -238,7 +242,7 @@ above example we have: .. ipython:: python - df.groupby('A').groups + df.groupby("A").groups df.groupby(get_letter_type, axis=1).groups Calling the standard Python ``len`` function on the GroupBy object just returns @@ -246,7 +250,7 @@ the length of the ``groups`` dict, so it is largely just a convenience: .. 
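The same count is also available as the ``ngroups`` attribute on the GroupBy object; a sketch before the example below:

.. code-block:: python

   df_ng = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3]})
   g = df_ng.groupby("A")
   g.ngroups  # 2, the same count that len(g) returns

..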
ipython:: python - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.groups len(grouped) @@ -261,15 +265,16 @@ the length of the ``groups`` dict, so it is largely just a convenience: n = 10 weight = np.random.normal(166, 20, size=n) height = np.random.normal(60, 10, size=n) - time = pd.date_range('1/1/2000', periods=n) - gender = np.random.choice(['male', 'female'], size=n) - df = pd.DataFrame({'height': height, 'weight': weight, - 'gender': gender}, index=time) + time = pd.date_range("1/1/2000", periods=n) + gender = np.random.choice(["male", "female"], size=n) + df = pd.DataFrame( + {"height": height, "weight": weight, "gender": gender}, index=time + ) .. ipython:: python df - gb = df.groupby('gender') + gb = df.groupby("gender") .. ipython:: @@ -293,9 +298,11 @@ Let's create a Series with a two-level ``MultiIndex``. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) s = pd.Series(np.random.randn(8), index=index) s @@ -311,7 +318,7 @@ number: .. ipython:: python - s.groupby(level='second').sum() + s.groupby(level="second").sum() The aggregation functions such as ``sum`` will take the level parameter directly. Additionally, the resulting index will be named according to the @@ -319,30 +326,32 @@ chosen level: .. ipython:: python - s.sum(level='second') + s.sum(level="second") Grouping with multiple levels is supported. .. ipython:: python :suppress: - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["doo", "doo", "bee", "bee", "bop", "bop", "bop", "bop"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) s = pd.Series(np.random.randn(8), index=index) .. ipython:: python s - s.groupby(level=['first', 'second']).sum() + s.groupby(level=["first", "second"]).sum() Index level names may be supplied as keys. .. ipython:: python - s.groupby(['first', 'second']).sum() + s.groupby(["first", "second"]).sum() More on the ``sum`` function and aggregation later. @@ -354,14 +363,14 @@ objects. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) - df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], - 'B': np.arange(8)}, - index=index) + df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index) df @@ -370,19 +379,19 @@ the ``A`` column. .. ipython:: python - df.groupby([pd.Grouper(level=1), 'A']).sum() + df.groupby([pd.Grouper(level=1), "A"]).sum() Index levels may also be specified by name. .. 
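Passing a list of names rather than a single name selects several columns at once and yields a ``DataFrameGroupBy`` instead of a ``SeriesGroupBy``; a sketch on a hypothetical frame (the single-column form follows below):

.. code-block:: python

   df_sel = pd.DataFrame(
       {"A": ["x", "y", "x"], "C": [1.0, 2.0, 3.0], "D": [4.0, 5.0, 6.0]}
   )
   df_sel.groupby("A")[["C", "D"]].sum()  # one aggregated row per group

..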
ipython:: python - df.groupby([pd.Grouper(level='second'), 'A']).sum() + df.groupby([pd.Grouper(level="second"), "A"]).sum() Index level names may be specified as keys directly to ``groupby``. .. ipython:: python - df.groupby(['second', 'A']).sum() + df.groupby(["second", "A"]).sum() DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -394,24 +403,26 @@ getting a column from a DataFrame, you can do: .. ipython:: python :suppress: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) .. ipython:: python - grouped = df.groupby(['A']) - grouped_C = grouped['C'] - grouped_D = grouped['D'] + grouped = df.groupby(["A"]) + grouped_C = grouped["C"] + grouped_D = grouped["D"] This is mainly syntactic sugar for the alternative and much more verbose: .. ipython:: python - df['C'].groupby(df['A']) + df["C"].groupby(df["A"]) Additionally this method avoids recomputing the internal grouping information derived from the passed key. @@ -452,13 +463,13 @@ A single group can be selected using .. ipython:: python - grouped.get_group('bar') + grouped.get_group("bar") Or for an object grouped on multiple columns: .. ipython:: python - df.groupby(['A', 'B']).get_group(('bar', 'one')) + df.groupby(["A", "B"]).get_group(("bar", "one")) .. _groupby.aggregate: @@ -467,7 +478,7 @@ Aggregation Once the GroupBy object has been created, several methods are available to perform a computation on the grouped data. These operations are similar to the -:ref:`aggregating API `, :ref:`window functions API `, +:ref:`aggregating API `, :ref:`window API `, and :ref:`resample API `. An obvious one is aggregation via the @@ -476,10 +487,10 @@ An obvious one is aggregation via the .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.aggregate(np.sum) - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.aggregate(np.sum) As you can see, the result of the aggregation will have the group names as the @@ -489,17 +500,17 @@ changed by using the ``as_index`` option: .. ipython:: python - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) grouped.aggregate(np.sum) - df.groupby('A', as_index=False).sum() + df.groupby("A", as_index=False).sum() Note that you could use the ``reset_index`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: .. ipython:: python - df.groupby(['A', 'B']).sum().reset_index() + df.groupby(["A", "B"]).sum().reset_index() Another simple aggregation example is to compute the size of each group. This is included in GroupBy as the ``size`` method. It returns a Series whose @@ -513,6 +524,15 @@ index are the group names and whose values are the sizes of each group. grouped.describe() +Another aggregation example is to compute the number of unique values of each group. This is similar to the ``value_counts`` function, except that it only counts unique values. + +.. ipython:: python + + ll = [['foo', 1], ['foo', 2], ['foo', 2], ['bar', 1], ['bar', 1]] + df4 = pd.DataFrame(ll, columns=["A", "B"]) + df4 + df4.groupby("A")["B"].nunique() + .. 
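If the full per-group distribution is wanted rather than just the number of distinct values, ``value_counts`` on the grouped ``Series`` returns one count per ``(group, value)`` pair; a sketch on the same data:

.. code-block:: python

   ll = [["foo", 1], ["foo", 2], ["foo", 2], ["bar", 1], ["bar", 1]]
   df4 = pd.DataFrame(ll, columns=["A", "B"])
   df4.groupby("A")["B"].value_counts()  # e.g. ("foo", 2) appears twice

..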
note:: Aggregation functions **will not** return the groups that you are aggregating over @@ -561,8 +581,8 @@ aggregation with, outputting a DataFrame: .. ipython:: python - grouped = df.groupby('A') - grouped['C'].agg([np.sum, np.mean, np.std]) + grouped = df.groupby("A") + grouped["C"].agg([np.sum, np.mean, np.std]) On a grouped ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -577,19 +597,21 @@ need to rename, then you can add in a chained operation for a ``Series`` like th .. ipython:: python - (grouped['C'].agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped["C"] + .agg([np.sum, np.mean, np.std]) + .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"}) + ) For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - (grouped.agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped.agg([np.sum, np.mean, np.std]).rename( + columns={"sum": "foo", "mean": "bar", "std": "baz"} + ) + ) .. note:: @@ -600,17 +622,16 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python :okexcept: - grouped['C'].agg(['sum', 'sum']) + grouped["C"].agg(["sum", "sum"]) - Pandas *does* allow you to provide multiple lambdas. In this case, pandas + pandas *does* allow you to provide multiple lambdas. In this case, pandas will mangle the name of the (nameless) lambda functions, appending ``_`` to each subsequent lambda. .. ipython:: python - grouped['C'].agg([lambda x: x.max() - x.min(), - lambda x: x.median() - x.mean()]) + grouped["C"].agg([lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()]) @@ -626,22 +647,26 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", - The keywords are the *output* column names - The values are tuples whose first element is the column to select - and the second element is the aggregation to apply to that column. Pandas + and the second element is the aggregation to apply to that column. pandas provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. .. ipython:: python - animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) animals animals.groupby("kind").agg( - min_height=pd.NamedAgg(column='height', aggfunc='min'), - max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + min_height=pd.NamedAgg(column="height", aggfunc="min"), + max_height=pd.NamedAgg(column="height", aggfunc="max"), + average_weight=pd.NamedAgg(column="weight", aggfunc=np.mean), ) @@ -650,20 +675,22 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. 
ipython:: python animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), - average_weight=('weight', np.mean), + min_height=("height", "min"), + max_height=("height", "max"), + average_weight=("weight", np.mean), ) -If your desired output column names are not valid python keywords, construct a dictionary +If your desired output column names are not valid Python keywords, construct a dictionary and unpack the keyword arguments .. ipython:: python - animals.groupby("kind").agg(**{ - 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), - }) + animals.groupby("kind").agg( + **{ + "total weight": pd.NamedAgg(column="weight", aggfunc=sum) + } + ) Additional keyword arguments are not passed through to the aggregation functions. Only pairs of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions @@ -682,8 +709,8 @@ no column selection, so the values are just the functions. .. ipython:: python animals.groupby("kind").height.agg( - min_height='min', - max_height='max', + min_height="min", + max_height="max", ) Applying different functions to DataFrame columns @@ -694,8 +721,7 @@ columns of a DataFrame: .. ipython:: python - grouped.agg({'C': np.sum, - 'D': lambda x: np.std(x, ddof=1)}) + grouped.agg({"C": np.sum, "D": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be either implemented on GroupBy or available via :ref:`dispatching @@ -703,7 +729,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. ipython:: python - grouped.agg({'C': 'sum', 'D': 'std'}) + grouped.agg({"C": "sum", "D": "std"}) .. _groupby.aggregate.cython: @@ -715,8 +741,8 @@ optimized Cython implementations: .. ipython:: python - df.groupby('A').sum() - df.groupby(['A', 'B']).mean() + df.groupby("A").sum() + df.groupby(["A", "B"]).mean() Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above code would work even without the special versions via dispatching (see below). @@ -745,15 +771,17 @@ For example, suppose we wished to standardize the data within each group: .. ipython:: python - index = pd.date_range('10/1/1999', periods=1100) + index = pd.date_range("10/1/1999", periods=1100) ts = pd.Series(np.random.normal(0.5, 2, 1100), index) ts = ts.rolling(window=100, min_periods=100).mean().dropna() ts.head() ts.tail() - transformed = (ts.groupby(lambda x: x.year) - .transform(lambda x: (x - x.mean()) / x.std())) + transformed = ts.groupby(lambda x: x.year).transform( + lambda x: (x - x.mean()) / x.std() + ) + We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: @@ -774,7 +802,7 @@ We can also visually compare the original and transformed data sets. .. ipython:: python - compare = pd.DataFrame({'Original': ts, 'Transformed': transformed}) + compare = pd.DataFrame({"Original": ts, "Transformed": transformed}) @savefig groupby_transform_plot.png compare.plot() @@ -790,8 +818,8 @@ Alternatively, the built-in methods could be used to produce the same outputs. .. ipython:: python - max = ts.groupby(lambda x: x.year).transform('max') - min = ts.groupby(lambda x: x.year).transform('min') + max = ts.groupby(lambda x: x.year).transform("max") + min = ts.groupby(lambda x: x.year).transform("min") max - min @@ -800,7 +828,7 @@ Another common data transform is to replace missing data with the group mean. .. 
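The core of that transform is a single ``transform`` call; a minimal sketch on a small hypothetical frame (the fuller randomized example follows below):

.. code-block:: python

   dfm = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, np.nan, 5.0, np.nan]})
   dfm["val"] = dfm.groupby("key")["val"].transform(lambda s: s.fillna(s.mean()))
   dfm  # each NaN replaced by its group's mean: 1.0 and 5.0

..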
ipython:: python :suppress: - cols = ['A', 'B', 'C'] + cols = ["A", "B", "C"] values = np.random.randn(1000, 3) values[np.random.randint(0, 1000, 100), 0] = np.nan values[np.random.randint(0, 1000, 50), 1] = np.nan @@ -811,7 +839,7 @@ Another common data transform is to replace missing data with the group mean. data_df - countries = np.array(['US', 'UK', 'GR', 'JP']) + countries = np.array(["US", "UK", "GR", "JP"]) key = countries[np.random.randint(0, 4, 1000)] grouped = data_df.groupby(key) @@ -861,11 +889,10 @@ the column B based on the groups of column A. .. ipython:: python - df_re = pd.DataFrame({'A': [1] * 10 + [5] * 10, - 'B': np.arange(20)}) + df_re = pd.DataFrame({"A": [1] * 10 + [5] * 10, "B": np.arange(20)}) df_re - df_re.groupby('A').rolling(4).B.mean() + df_re.groupby("A").rolling(4).B.mean() The ``expanding()`` method will accumulate a given operation @@ -874,7 +901,7 @@ group. .. ipython:: python - df_re.groupby('A').expanding().sum() + df_re.groupby("A").expanding().sum() Suppose you want to use the ``resample()`` method to get a daily @@ -883,13 +910,16 @@ missing values with the ``ffill()`` method. .. ipython:: python - df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df_re = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df_re - df_re.groupby('group').resample('1D').ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -913,8 +943,8 @@ with only a couple members. .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are filled @@ -922,14 +952,14 @@ with NaNs. .. ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion. .. ipython:: python - dff['C'] = np.arange(8) - dff.groupby('B').filter(lambda x: len(x['C']) > 2) + dff["C"] = np.arange(8) + dff.groupby("B").filter(lambda x: len(x["C"]) > 2) .. note:: @@ -941,7 +971,7 @@ For DataFrames with multiple columns, filters should explicitly specify a column .. ipython:: python - dff.groupby('B').head(2) + dff.groupby("B").head(2) .. _groupby.dispatch: @@ -955,7 +985,7 @@ functions: .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.agg(lambda x: x.std()) But, it's rather verbose and can be untidy if you need to pass additional @@ -975,12 +1005,14 @@ next). This enables some operations to be carried out rather succinctly: .. 
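The dispatching described next means the explicit lambda below can usually be replaced by a plain method call; as a sketch, the two are equivalent for a numeric column:

.. code-block:: python

   df_sd = pd.DataFrame({"A": ["x", "x", "y"], "C": [1.0, 2.0, 4.0]})
   g = df_sd.groupby("A")
   g.agg(lambda x: x.std())  # spelled out with a lambda
   g.std()                   # dispatched to the std method, same result

..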
ipython:: python - tsdf = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + tsdf = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) tsdf.iloc[::2] = np.nan grouped = tsdf.groupby(lambda x: x.year) - grouped.fillna(method='pad') + grouped.fillna(method="pad") In this example, we chopped the collection of time series into yearly chunks then independently called :ref:`fillna ` on the @@ -991,7 +1023,7 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: .. ipython:: python s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) - g = pd.Series(list('abababab')) + g = pd.Series(list("abababab")) gb = s.groupby(g) gb.nlargest(3) gb.nsmallest(3) @@ -1010,10 +1042,10 @@ for both ``aggregate`` and ``transform`` in many standard use cases. However, .. ipython:: python df - grouped = df.groupby('A') + grouped = df.groupby("A") # could also just call .describe() - grouped['C'].apply(lambda x: x.describe()) + grouped["C"].apply(lambda x: x.describe()) The dimension of the returned result can also change: @@ -1034,7 +1066,8 @@ that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python def f(x): - return pd.Series([x, x ** 2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -1066,7 +1099,7 @@ will be passed into ``values``, and the group index will be passed into ``index` .. warning:: When using ``engine='numba'``, there will be no "fall back" behavior internally. The group - data and group index will be passed as numpy arrays to the JITed user defined function, and no + data and group index will be passed as NumPy arrays to the JITed user defined function, and no alternative execution attempts will be tried. .. note:: @@ -1135,7 +1168,7 @@ will be (silently) dropped. Thus, this does not pose any problems: .. ipython:: python - df.groupby('A').std() + df.groupby("A").std() Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function @@ -1153,23 +1186,29 @@ is only interesting over one column (here ``colname``), it may be filtered .. ipython:: python from decimal import Decimal + df_dec = pd.DataFrame( - {'id': [1, 2, 1, 2], - 'int_column': [1, 2, 3, 4], - 'dec_column': [Decimal('0.50'), Decimal('0.15'), - Decimal('0.25'), Decimal('0.40')] - } + { + "id": [1, 2, 1, 2], + "int_column": [1, 2, 3, 4], + "dec_column": [ + Decimal("0.50"), + Decimal("0.15"), + Decimal("0.25"), + Decimal("0.40"), + ], + } ) # Decimal columns can be sum'd explicitly by themselves... - df_dec.groupby(['id'])[['dec_column']].sum() + df_dec.groupby(["id"])[["dec_column"]].sum() # ...but cannot be combined with standard data types or they will be excluded - df_dec.groupby(['id'])[['int_column', 'dec_column']].sum() + df_dec.groupby(["id"])[["int_column", "dec_column"]].sum() # Use .agg function to aggregate over standard and "nuisance" data types # at the same time - df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'}) + df_dec.groupby(["id"]).agg({"int_column": "sum", "dec_column": "sum"}) .. _groupby.observed: @@ -1184,25 +1223,27 @@ Show all values: .. 
ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False + ).count() Show only the observed values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=True).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=True + ).count() The returned dtype of the grouped will *always* include *all* of the categories that were grouped. .. ipython:: python - s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + s = ( + pd.Series([1, 1, 1]) + .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False) + .count() + ) s.index.dtype .. _groupby.missing: @@ -1226,7 +1267,7 @@ can be used as group keys. If so, the order of the levels will be preserved: data = pd.Series(np.random.randn(100)) - factor = pd.qcut(data, [0, .25, .5, .75, 1.]) + factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) data.groupby(factor).mean() @@ -1242,19 +1283,23 @@ use the ``pd.Grouper`` to provide this local control. import datetime - df = pd.DataFrame({'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime.datetime(2013, 1, 1, 13, 0), - datetime.datetime(2013, 1, 1, 13, 5), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 12, 2, 12, 0), - datetime.datetime(2013, 12, 2, 14, 0)] - }) + df = pd.DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime.datetime(2013, 1, 1, 13, 0), + datetime.datetime(2013, 1, 1, 13, 5), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 12, 2, 12, 0), + datetime.datetime(2013, 12, 2, 14, 0), + ], + } + ) df @@ -1262,18 +1307,18 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. .. ipython:: python - df = df.set_index('Date') - df['Date'] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq='6M', key='Date'), 'Buyer']).sum() + df = df.set_index("Date") + df["Date"] = df.index + pd.offsets.MonthEnd(2) + df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum() - df.groupby([pd.Grouper(freq='6M', level='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum() Taking the first rows of each group @@ -1283,10 +1328,10 @@ Just like for a DataFrame or Series you can call head and tail on a groupby: .. 
ipython:: python - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) df - g = df.groupby('A') + g = df.groupby("A") g.head(1) g.tail(1) @@ -1304,8 +1349,8 @@ will return a single row (or no row) per group if you pass an int for n: .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") g.nth(0) g.nth(-1) @@ -1316,21 +1361,21 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat .. ipython:: python # nth(0) is the same as g.first() - g.nth(0, dropna='any') + g.nth(0, dropna="any") g.first() # nth(-1) is the same as g.last() - g.nth(-1, dropna='any') # NaNs denote group exhausted when using dropna + g.nth(-1, dropna="any") # NaNs denote group exhausted when using dropna g.last() - g.B.nth(0, dropna='all') + g.B.nth(0, dropna="all") As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row. .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A', as_index=False) + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A", as_index=False) g.nth(0) g.nth(-1) @@ -1339,8 +1384,8 @@ You can also select multiple rows from each group by specifying multiple nth val .. ipython:: python - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') - df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = pd.DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, 4th, and last date index for each month df.groupby([df.index.year, df.index.month]).nth([0, 3, -1]) @@ -1352,12 +1397,12 @@ To see the order in which each row appears within its group, use the .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').cumcount() + dfg.groupby("A").cumcount() - dfg.groupby('A').cumcount(ascending=False) + dfg.groupby("A").cumcount(ascending=False) .. _groupby.ngroup: @@ -1376,12 +1421,12 @@ order they are first observed. .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').ngroup() + dfg.groupby("A").ngroup() - dfg.groupby('A').ngroup(ascending=False) + dfg.groupby("A").ngroup(ascending=False) Plotting ~~~~~~~~ @@ -1394,8 +1439,8 @@ the values in column 1 where the group is "B" are 3 higher on average. np.random.seed(1234) df = pd.DataFrame(np.random.randn(50, 2)) - df['g'] = np.random.choice(['A', 'B'], size=50) - df.loc[df['g'] == 'B', 1] += 3 + df["g"] = np.random.choice(["A", "B"], size=50) + df.loc[df["g"] == "B", 1] += 3 We can easily visualize this with a boxplot: @@ -1403,7 +1448,7 @@ We can easily visualize this with a boxplot: :okwarning: @savefig groupby_boxplot.png - df.groupby('g').boxplot() + df.groupby("g").boxplot() The result of calling ``boxplot`` is a dictionary whose keys are the values of our grouping column ``g`` ("A" and "B"). The values of the resulting dictionary @@ -1438,20 +1483,26 @@ code more readable. First we set the data: .. 
ipython:: python n = 1000 - df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', - 'Product_2'], n), - 'Revenue': (np.random.random(n) * 50 + 10).round(2), - 'Quantity': np.random.randint(1, 10, size=n)}) + df = pd.DataFrame( + { + "Store": np.random.choice(["Store_1", "Store_2"], n), + "Product": np.random.choice(["Product_1", "Product_2"], n), + "Revenue": (np.random.random(n) * 50 + 10).round(2), + "Quantity": np.random.randint(1, 10, size=n), + } + ) df.head(2) Now, to find prices per store/product, we can simply do: .. ipython:: python - (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) - .unstack().round(2)) + ( + df.groupby(["Store", "Product"]) + .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) + .unstack() + .round(2) + ) Piping can also be expressive when you want to deliver a grouped object to some arbitrary function, for example: @@ -1461,7 +1512,8 @@ arbitrary function, for example: def mean(groupby): return groupby.mean() - df.groupby(['Store', 'Product']).pipe(mean) + + df.groupby(["Store", "Product"]).pipe(mean) where ``mean`` takes a GroupBy object and finds the mean of the Revenue and Quantity columns respectively for each Store-Product combination. The ``mean`` function can @@ -1478,8 +1530,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on .. ipython:: python - df = pd.DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], - 'c': [1, 0, 0], 'd': [2, 3, 4]}) + df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]}) df df.groupby(df.sum(), axis=1).sum() @@ -1538,16 +1589,22 @@ column index name will be used as the name of the inserted column: .. ipython:: python - df = pd.DataFrame({'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], - 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], - 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], - 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]}) + df = pd.DataFrame( + { + "a": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + "b": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + "c": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + "d": [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + } + ) + def compute_metrics(x): - result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} - return pd.Series(result, name='metrics') + result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} + return pd.Series(result, name="metrics") + - result = df.groupby('a').apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics) result diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 8226e72779588..901f42097b911 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -33,12 +33,14 @@ Further information on any specific method can be obtained in the reshaping text missing_data + duplicates categorical integer_na boolean visualization computation groupby + window timeseries timedeltas style diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 6843dd1eadc81..817ea3445f995 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -46,7 +46,7 @@ Different choices for indexing ------------------------------ Object selection has had a number of user-requested additions in order to -support more explicit location based indexing. Pandas now supports three types +support more explicit location based indexing. pandas now supports three types of multi-axis indexing. 
* ``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found. Allowed inputs are: @@ -55,7 +55,7 @@ of multi-axis indexing. *label* of the index. This use is **not** an integer position along the index.). * A list or array of labels ``['a', 'b', 'c']``. - * A slice object with labels ``'a':'f'`` (Note that contrary to usual python + * A slice object with labels ``'a':'f'`` (Note that contrary to usual Python slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels ` and :ref:`Endpoints are inclusive `.) @@ -313,8 +313,10 @@ Selection by label .. warning:: - Starting in 0.21.0, pandas will show a ``FutureWarning`` if indexing with a list with missing labels. In the future - this will raise a ``KeyError``. See :ref:`list-like Using loc with missing keys in a list is Deprecated `. + .. versionchanged:: 1.0.0 + + pandas will raise a ``KeyError`` if indexing with a list with missing labels. See :ref:`list-like Using loc with + missing keys in a list is Deprecated `. pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. Every label asked for must be in the index, or a ``KeyError`` will be raised. @@ -325,7 +327,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp * A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index.). * A list or array of labels ``['a', 'b', 'c']``. -* A slice object with labels ``'a':'f'`` (Note that contrary to usual python +* A slice object with labels ``'a':'f'`` (Note that contrary to usual Python slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels `. * A boolean array. @@ -420,6 +422,17 @@ above example, ``s.loc[1:6]`` would raise ``KeyError``. For the rationale behind this behavior, see :ref:`Endpoints are inclusive `. +.. ipython:: python + + s = pd.Series(list('abcdef'), index=[0, 3, 2, 5, 4, 2]) + s.loc[3:5] + +Also, if the index has duplicate labels *and* either the start or the stop label is duplicated, +an error will be raised. For instance, in the above example, ``s.loc[2:5]`` would raise a ``KeyError``. + +For more information about duplicate labels, see +:ref:`Duplicate Labels `. + .. _indexing.integer: Selection by position @@ -431,7 +444,7 @@ Selection by position This is sometimes called ``chained assignment`` and should be avoided. See :ref:`Returning a View versus Copy `. -Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. +pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method. 
The following are valid inputs: @@ -496,11 +509,11 @@ For getting a cross section using an integer position (equiv to ``df.xs(1)``): df1.iloc[1] -Out of range slice indexes are handled gracefully just as in Python/Numpy. +Out of range slice indexes are handled gracefully just as in Python/NumPy. .. ipython:: python - # these are allowed in python/numpy. + # these are allowed in Python/NumPy. x = list('abcdef') x x[4:10] @@ -571,47 +584,20 @@ without using a temporary variable. (bb.groupby(['year', 'team']).sum() .loc[lambda df: df['r'] > 100]) -.. _indexing.deprecate_ix: - -IX indexer is deprecated ------------------------- - -.. warning:: - - Starting in 0.20.0, the ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` - and ``.loc`` indexers. -``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide -to index *positionally* OR via *labels* depending on the data type of the index. This has caused quite a -bit of user confusion over the years. +.. _combining_positional_and_label_based_indexing: -The recommended methods of indexing are: +Combining positional and label-based indexing +--------------------------------------------- -* ``.loc`` if you want to *label* index. -* ``.iloc`` if you want to *positionally* index. +If you wish to get the 0th and the 2nd elements from the index in the 'A' column, you can do: .. ipython:: python dfd = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc')) - dfd - -Previous behavior, where you wish to get the 0th and the 2nd elements from the index in the 'A' column. - -.. code-block:: ipython - - In [3]: dfd.ix[[0, 2], 'A'] - Out[3]: - a 1 - c 3 - Name: A, dtype: int64 - -Using ``.loc``. Here we will select the appropriate indexes from the index, then use *label* indexing. - -.. ipython:: python - dfd.loc[dfd.index[[0, 2]], 'A'] This can also be expressed using ``.iloc``, by explicitly getting locations on the indexers, and using @@ -636,11 +622,13 @@ Indexing with list with missing labels is deprecated .. warning:: - Starting in 0.21.0, using ``.loc`` or ``[]`` with a list with one or more missing labels, is deprecated, in favor of ``.reindex``. + .. versionchanged:: 1.0.0 + + Using ``.loc`` or ``[]`` with a list with one or more missing labels will no longer reindex, in favor of ``.reindex``. In prior versions, using ``.loc[list-of-labels]`` would work as long as *at least 1* of the keys was found (otherwise it -would raise a ``KeyError``). This behavior is deprecated and will show a warning message pointing to this section. The -recommended alternative is to use ``.reindex()``. +would raise a ``KeyError``). This behavior was changed and will now raise a ``KeyError`` if at least one label is missing. +The recommended alternative is to use ``.reindex()``. For example. @@ -928,6 +916,24 @@ and :ref:`Advanced Indexing ` you may select along more than one axis df2.loc[criterion & (df2['b'] == 'x'), 'b':'c'] +.. warning:: + + ``iloc`` only supports boolean indexing with a boolean *array*; if the indexer is a boolean ``Series``, + an error will be raised. In the following example, ``df.iloc[s.values, 1]`` is ok + because the boolean indexer is an array, but ``df.iloc[s, 1]`` would raise ``ValueError``. + + .. ipython:: python + + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], + index=list('abc'), + columns=['A', 'B']) + s = (df['A'] > 2) + s + + df.loc[s, 'B'] + + df.iloc[s.values, 1] + .. 
_indexing.basics.indexing_isin: Indexing with isin @@ -1124,6 +1130,40 @@ Mask s.mask(s >= 0) df.mask(df >= 0) +.. _indexing.np_where: + +Setting with enlargement conditionally using :func:`numpy.where` +----------------------------------------------------------------- + +An alternative to :meth:`~pandas.DataFrame.where` is to use :func:`numpy.where`. +Combined with setting a new column, you can use it to enlarge a ``DataFrame`` where the +values are determined conditionally. + +Suppose you have two choices in the following ``DataFrame``, and you want to +set a new column ``color`` to ``'green'`` when the second column has ``'Z'``. You can do the +following: + +.. ipython:: python + + df = pd.DataFrame({'col1': list('ABBC'), 'col2': list('ZZXY')}) + df['color'] = np.where(df['col2'] == 'Z', 'green', 'red') + df + +If you have multiple conditions, you can use :func:`numpy.select`. Say there are +three choices of colors corresponding to three conditions, with a fourth color +as a fallback. You can do the following. + +.. ipython:: python + + conditions = [ + (df['col2'] == 'Z') & (df['col1'] == 'A'), + (df['col2'] == 'Z') & (df['col1'] == 'B'), + (df['col1'] == 'B') + ] + choices = ['yellow', 'blue', 'purple'] + df['color'] = np.select(conditions, choices, default='black') + df + .. _indexing.query: The :meth:`~pandas.DataFrame.query` Method @@ -1475,17 +1515,27 @@ default value. s.get('a') # equivalent to s['a'] s.get('x', default=-1) -The :meth:`~pandas.DataFrame.lookup` method ------------------------------------------- +.. _indexing.lookup: + +Looking up values by index/column labels +---------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels -and column labels, and the ``lookup`` method allows for this and returns a -NumPy array. For instance: +and column labels. This can be achieved by ``DataFrame.melt`` combined with filtering the corresponding +rows with ``DataFrame.loc``. For instance: .. ipython:: python - dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D']) - dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D']) + df = pd.DataFrame({'col': ["A", "A", "B", "B"], + 'A': [80, 23, np.nan, 22], + 'B': [80, 55, 76, 67]}) + df + melt = df.melt('col') + melt = melt.loc[melt['col'] == melt['variable'], 'value'] + melt.reset_index(drop=True) + +Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method, +which was deprecated in version 1.2.0. .. _indexing.class: @@ -1532,12 +1582,8 @@ Setting metadata ~~~~~~~~~~~~~~~~ Indexes are "mostly immutable", but it is possible to set and change their -metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and -``codes``). - -You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes`` -to set these attributes directly. They default to returning a copy; however, -you can specify ``inplace=True`` to have the data change in place. +``name`` attribute. You can use ``rename`` and ``set_names`` to set these attributes +directly, and they default to returning a copy. See :ref:`Advanced Indexing ` for usage of MultiIndexes. @@ -1565,19 +1611,16 @@ See :ref:`Advanced Indexing ` for usage of MultiIndexes. Set operations on Index objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The two main operations are ``union (|)`` and ``intersection (&)``. -These can be directly called as instance methods or used via overloaded -operators. Difference is provided via the ``.difference()`` method. 
+The two main operations are ``union`` and ``intersection``. +Difference is provided via the ``.difference()`` method. .. ipython:: python a = pd.Index(['c', 'b', 'a']) b = pd.Index(['c', 'e', 'd']) - a | b - a & b a.difference(b) -Also available is the ``symmetric_difference (^)`` operation, which returns elements +Also available is the ``symmetric_difference`` operation, which returns elements that appear in either ``idx1`` or ``idx2``, but not in both. This is equivalent to the Index created by ``idx1.difference(idx2).union(idx2.difference(idx1))``, with duplicates dropped. @@ -1587,7 +1630,6 @@ with duplicates dropped. idx1 = pd.Index([1, 2, 3, 4]) idx2 = pd.Index([2, 3, 4, 5]) idx1.symmetric_difference(idx2) - idx1 ^ idx2 .. note:: @@ -1602,7 +1644,7 @@ integer values are converted to float idx1 = pd.Index([0, 1, 2]) idx2 = pd.Index([0.5, 1.5]) - idx1 | idx2 + idx1.union(idx2) .. _indexing.missing: @@ -1801,7 +1843,7 @@ about! Sometimes a ``SettingWithCopy`` warning will arise at times when there's no obvious chained indexing going on. **These** are the bugs that -``SettingWithCopy`` is designed to catch! Pandas is probably trying to warn you +``SettingWithCopy`` is designed to catch! pandas is probably trying to warn you that you've done this: .. code-block:: python @@ -1824,7 +1866,7 @@ When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice. -Pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a +pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a slice is frequently not intentional, but a mistake caused by chained indexing returning a copy where a slice was expected. diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index a45d7a4fa1547..2d5673fe53be3 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -30,7 +30,7 @@ numbers. Construction ------------ -Pandas can represent integer data with possibly missing values using +pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension type ` implemented within pandas. @@ -112,16 +112,16 @@ dtype if needed. s.iloc[1:3] # operate with other dtypes - s + s.iloc[1:3].astype('Int8') + s + s.iloc[1:3].astype("Int8") # coerce when needed s + 0.01 -These dtypes can operate as part of of ``DataFrame``. +These dtypes can operate as part of ``DataFrame``. .. ipython:: python - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df = pd.DataFrame({"A": s, "B": [1, 1, 3], "C": list("aab")}) df df.dtypes @@ -130,15 +130,15 @@ These dtypes can be merged & reshaped & cast. .. ipython:: python - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) + pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes + df["A"].astype(float) Reduction and groupby operations such as 'sum' work as well. .. 
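ipython:: python

    # A brief sketch with a hypothetical series, separate from ``df`` below:
    # reductions on the nullable ``Int64`` dtype skip ``pd.NA`` by default,
    # and ``skipna=False`` propagates it, mirroring float ``NaN`` semantics.
    s_int = pd.Series([1, 2, None], dtype="Int64")
    s_int.sum()
    s_int.sum(skipna=False)

The same holds for the ``DataFrame`` and groupby reductions:

.. 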
ipython:: python df.sum() - df.groupby('B').A.sum() + df.groupby("B").A.sum() Scalar NA Value --------------- diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d4be9d802d697..965833c013c03 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` - ;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` + binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` binary;`OpenDocument `__;:ref:`read_excel`; binary;`HDF5 Format `__;:ref:`read_hdf`;:ref:`to_hdf` binary;`Feather Format `__;:ref:`read_feather`;:ref:`to_feather` @@ -117,9 +117,9 @@ index_col : int, str, sequence of int / str, or False, default ``None`` usecols : list-like or callable, default ``None`` Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in `names` or + that correspond to column names provided either by the user in ``names`` or inferred from the document header row(s). For example, a valid list-like - `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To instantiate a DataFrame from ``data`` with element order preserved use @@ -135,12 +135,10 @@ usecols : list-like or callable, default ``None`` import pandas as pd from io import StringIO - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"]) Using this parameter results in much faster parsing time and lower memory usage. squeeze : boolean, default ``False`` @@ -157,7 +155,7 @@ General parsing configuration dtype : Type name or dict of column -> type, default ``None`` Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` - (unsupported with ``engine='python'``). Use `str` or `object` together + (unsupported with ``engine='python'``). Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve and not interpret dtype. engine : {``'c'``, ``'python'``} @@ -181,10 +179,7 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -215,19 +210,19 @@ na_values : scalar, str, list-like, or dict, default ``None`` keep_default_na : boolean, default ``True`` Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: + Depending on whether ``na_values`` is passed in, the behavior is as follows: - * If `keep_default_na` is ``True``, and `na_values` are specified, `na_values` + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` is appended to the default NaN values used for parsing. 
- * If `keep_default_na` is ``True``, and `na_values` are not specified, only + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default NaN values are used for parsing. - * If `keep_default_na` is ``False``, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is ``False``, and `na_values` are not specified, no + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the NaN values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no strings will be parsed as NaN. - Note that if `na_filter` is passed in as ``False``, the `keep_default_na` and - `na_values` parameters will be ignored. + Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. na_filter : boolean, default ``True`` Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing ``na_filter=False`` can improve the performance @@ -276,10 +271,10 @@ Iteration +++++++++ iterator : boolean, default ``False`` - Return `TextFileReader` object for iteration or getting chunks with + Return ``TextFileReader`` object for iteration or getting chunks with ``get_chunk()``. chunksize : int, default ``None`` - Return `TextFileReader` object for iteration. See :ref:`iterating and chunking + Return ``TextFileReader`` object for iteration. See :ref:`iterating and chunking ` below. Quoting, compression, and file format @@ -287,16 +282,19 @@ Quoting, compression, and file format compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', + bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` - set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to - compression settings. As an example, the following could be passed for - faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are + forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``. + As an example, the following could be passed for faster compression and to + create a reproducible gzip archive: + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. .. versionchanged:: 0.24.0 'infer' option added and set to default. .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. + .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to ``gzip.open``. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` @@ -324,17 +322,17 @@ comment : str, default ``None`` Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), fully - commented lines are ignored by the parameter `header` but not by `skiprows`. 
+ commented lines are ignored by the parameter ``header`` but not by ``skiprows``. For example, if ``comment='#'``, parsing '#empty\\na,b,c\\n1,2,3' with - `header=0` will result in 'a,b,c' being treated as the header. + ``header=0`` will result in 'a,b,c' being treated as the header. encoding : str, default ``None`` Encoding to use for UTF when reading/writing (e.g. ``'utf-8'``). `List of Python standard encodings `_. dialect : str or :class:`python:csv.Dialect` instance, default ``None`` If provided, this parameter will override values (default or not) for the - following parameters: `delimiter`, `doublequote`, `escapechar`, - `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + following parameters: ``delimiter``, ``doublequote``, ``escapechar``, + ``skipinitialspace``, ``quotechar``, and ``quoting``. If it is necessary to override values, a ParserWarning will be issued. See :class:`python:csv.Dialect` documentation for more details. @@ -362,17 +360,14 @@ columns: .. ipython:: python import numpy as np - data = ('a,b,c,d\n' - '1,2,3,4\n' - '5,6,7,8\n' - '9,10,11') + + data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11" print(data) df = pd.read_csv(StringIO(data), dtype=object) df - df['a'][0] - df = pd.read_csv(StringIO(data), - dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) + df["a"][0] + df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) @@ -387,14 +382,10 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = ("col_1\n" - "1\n" - "2\n" - "'A'\n" - "4.22") - df = pd.read_csv(StringIO(data), converters={'col_1': str}) + data = "col_1\n1\n2\n'A'\n4.22" + df = pd.read_csv(StringIO(data), converters={"col_1": str}) df - df['col_1'].apply(type).value_counts() + df["col_1"].apply(type).value_counts() Or you can use the :func:`~pandas.to_numeric` function to coerce the dtypes after reading in the data, @@ -402,9 +393,9 @@ dtypes after reading in the data, .. ipython:: python df2 = pd.read_csv(StringIO(data)) - df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce') + df2["col_1"] = pd.to_numeric(df2["col_1"], errors="coerce") df2 - df2['col_1'].apply(type).value_counts() + df2["col_1"].apply(type).value_counts() which will convert all valid parsing to floats, leaving the invalid parsing as ``NaN``. @@ -426,14 +417,14 @@ worth trying. .. ipython:: python :okwarning: - col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) - df = pd.DataFrame({'col_1': col_1}) - df.to_csv('foo.csv') - mixed_df = pd.read_csv('foo.csv') - mixed_df['col_1'].apply(type).value_counts() - mixed_df['col_1'].dtype + col_1 = list(range(500000)) + ["a", "b"] + list(range(500000)) + df = pd.DataFrame({"col_1": col_1}) + df.to_csv("foo.csv") + mixed_df = pd.read_csv("foo.csv") + mixed_df["col_1"].apply(type).value_counts() + mixed_df["col_1"].dtype - will result with `mixed_df` containing an ``int`` dtype for certain chunks + will result with ``mixed_df`` containing an ``int`` dtype for certain chunks of the column, and ``str`` for others due to the mixed dtypes from the data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. @@ -442,7 +433,8 @@ worth trying. :suppress: import os - os.remove('foo.csv') + + os.remove("foo.csv") .. _io.categorical: @@ -454,21 +446,18 @@ Specifying categorical dtype .. 
ipython:: python - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes - pd.read_csv(StringIO(data), dtype='category').dtypes + pd.read_csv(StringIO(data), dtype="category").dtypes Individual columns can be parsed as a ``Categorical`` using a dict specification: .. ipython:: python - pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. For more @@ -479,16 +468,17 @@ that column's ``dtype``. .. ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) - pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes + + dtype = CategoricalDtype(["d", "c", "b", "a"], ordered=True) + pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes When using ``dtype=CategoricalDtype``, "unexpected" values outside of ``dtype.categories`` are treated as missing values. .. ipython:: python - dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c' - pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1 + dtype = CategoricalDtype(["a", "b", "d"]) # No 'c' + pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1 This matches the behavior of :meth:`Categorical.set_categories`. @@ -504,11 +494,11 @@ This matches the behavior of :meth:`Categorical.set_categories`. .. ipython:: python - df = pd.read_csv(StringIO(data), dtype='category') + df = pd.read_csv(StringIO(data), dtype="category") df.dtypes - df['col3'] - df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) - df['col3'] + df["col3"] + df["col3"].cat.categories = pd.to_numeric(df["col3"].cat.categories) + df["col3"] Naming and using columns @@ -524,10 +514,7 @@ used as the column names: .. ipython:: python - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = "a,b,c\n1,2,3\n4,5,6\n7,8,9" print(data) pd.read_csv(StringIO(data)) @@ -538,19 +525,15 @@ any): .. ipython:: python print(data) - pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0) - pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None) + pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0) + pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None) If the header is in a row other than the first, pass the row number to ``header``. This will skip the preceding rows: .. ipython:: python - data = ('skip this skip it\n' - 'a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = "skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9" pd.read_csv(StringIO(data), header=1) .. note:: @@ -571,9 +554,7 @@ distinguish between them so as to prevent overwriting data: .. ipython:: python - data = ('a,b,a\n' - '0,1,2\n' - '3,4,5') + data = "a,b,a\n0,1,2\n3,4,5" pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -610,18 +591,18 @@ file, either using the column names, position numbers or a callable: .. 
ipython:: python - data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz' + data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz" pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), usecols=['b', 'd']) + pd.read_csv(StringIO(data), usecols=["b", "d"]) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) - pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C']) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["A", "C"]) The ``usecols`` argument can also be used to specify which columns not to use in the final result: .. ipython:: python - pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c']) + pd.read_csv(StringIO(data), usecols=lambda x: x not in ["a", "c"]) In this case, the callable is specifying that we exclude the "a" and "c" columns from the output. @@ -639,26 +620,15 @@ be ignored. By default, completely blank lines will be ignored as well. .. ipython:: python - data = ('\n' - 'a,b,c\n' - ' \n' - '# commented line\n' - '1,2,3\n' - '\n' - '4,5,6') + data = "\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6" print(data) - pd.read_csv(StringIO(data), comment='#') + pd.read_csv(StringIO(data), comment="#") If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = ('a,b,c\n' - '\n' - '1,2,3\n' - '\n' - '\n' - '4,5,6') + data = "a,b,c\n\n1,2,3\n\n\n4,5,6" pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -669,32 +639,28 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = ('#comment\n' - 'a,b,c\n' - 'A,B,C\n' - '1,2,3') - pd.read_csv(StringIO(data), comment='#', header=1) - data = ('A,B,C\n' - '#comment\n' - 'a,b,c\n' - '1,2,3') - pd.read_csv(StringIO(data), comment='#', skiprows=2) + data = "#comment\na,b,c\nA,B,C\n1,2,3" + pd.read_csv(StringIO(data), comment="#", header=1) + data = "A,B,C\n#comment\na,b,c\n1,2,3" + pd.read_csv(StringIO(data), comment="#", skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: .. ipython:: python - data = ('# empty\n' - '# second empty line\n' - '# third emptyline\n' - 'X,Y,Z\n' - '1,2,3\n' - 'A,B,C\n' - '1,2.,4.\n' - '5.,NaN,10.0\n') + data = ( + "# empty\n" + "# second empty line\n" + "# third emptyline\n" + "X,Y,Z\n" + "1,2,3\n" + "A,B,C\n" + "1,2.,4.\n" + "5.,NaN,10.0\n" + ) print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + pd.read_csv(StringIO(data), comment="#", skiprows=4, header=1) .. _io.comments: @@ -706,36 +672,38 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ( + "ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) By default, the parser includes the comments in the output: .. ipython:: python - df = pd.read_csv('tmp.csv') + df = pd.read_csv("tmp.csv") df We can suppress the comments using the ``comment`` keyword: .. ipython:: python - df = pd.read_csv('tmp.csv', comment='#') + df = pd.read_csv("tmp.csv", comment="#") df .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. 
_io.unicode: @@ -748,13 +716,12 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python from io import BytesIO - data = (b'word,length\n' - b'Tr\xc3\xa4umen,7\n' - b'Gr\xc3\xbc\xc3\x9fe,5') - data = data.decode('utf8').encode('latin-1') - df = pd.read_csv(BytesIO(data), encoding='latin-1') + + data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5" + data = data.decode("utf8").encode("latin-1") + df = pd.read_csv(BytesIO(data), encoding="latin-1") df - df['word'][1] + df["word"][1] Some formats which encode all characters as multiple bytes, like UTF-16, won't parse correctly at all without specifying the encoding. `Full list of Python @@ -771,16 +738,12 @@ first column will be used as the ``DataFrame``'s row names: .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,5.7\n' - '8,orange,cow,10') + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" pd.read_csv(StringIO(data)) .. ipython:: python - data = ('index,a,b,c\n' - '4,apple,bat,5.7\n' - '8,orange,cow,10') + data = "index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -791,9 +754,7 @@ index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,\n' - '8,orange,cow,') + data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -803,12 +764,10 @@ If a subset of data is being parsed using the ``usecols`` option, the .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,\n' - '8,orange,cow,') + data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) - pd.read_csv(StringIO(data), usecols=['b', 'c']) - pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) + pd.read_csv(StringIO(data), usecols=["b", "c"]) + pd.read_csv(StringIO(data), usecols=["b", "c"], index_col=0) .. _io.parse_dates: @@ -828,14 +787,14 @@ The simplest case is to just pass in ``parse_dates=True``: .. ipython:: python :suppress: - f = open('foo.csv', 'w') - f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f = open("foo.csv", "w") + f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() .. ipython:: python # Use a column as an index, and parse it as dates. - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True) df # These are Python datetime objects @@ -853,20 +812,22 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. 
ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]]) + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) df By default the parser removes the component date columns, but you can choose @@ -874,8 +835,9 @@ to retain them via the ``keep_date_col`` keyword: .. ipython:: python - df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]], - keep_date_col=True) + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True + ) df Note that if you wish to combine multiple columns into a single date column, a @@ -888,21 +850,22 @@ You can also use a dict to specify custom name columns: .. ipython:: python - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec) + date_spec = {"nominal": [1, 2], "actual": [1, 3]} + df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) df It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The `index_col` +a single date column, then a new column is prepended to the data. The ``index_col`` specification is based off of this new set of columns rather than the original data columns: .. ipython:: python - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - index_col=0) # index is the nominal column + date_spec = {"nominal": [1, 2], "actual": [1, 3]} + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=date_spec, index_col=0 + ) # index is the nominal column df .. note:: @@ -926,24 +889,20 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=pd.io.date_converters.parse_date_time) + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=date_spec, date_parser=pd.to_datetime + ) df -Pandas will try to call the ``date_parser`` function in three different ways. If +pandas will try to call the ``date_parser`` function in two different ways. If an exception is raised, the next one is tried: 1. ``date_parser`` is first called with one or more arrays as arguments, - as defined using `parse_dates` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``). + as defined using ``parse_dates`` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``). 2. If #1 fails, ``date_parser`` is called with all the columns concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``). -3. If #2 fails, ``date_parser`` is called once for every row with one or more - string arguments from the columns indicated with `parse_dates` - (e.g., ``date_parser('2013', '1')`` for the first row, ``date_parser('2013', '2')`` - for the second, etc.). - Note that performance-wise, you should try these methods of parsing dates in order: 1. Try to infer the format using ``infer_datetime_format=True`` (see section below). 2. If you know the format, use ``pd.to_datetime()``: ``date_parser=lambda x: pd.to_datetime(x, format=...)``. 3. If you have a really non-standard format, use a custom ``date_parser`` function. For optimal performance, this should be vectorized, i.e., it should accept arrays as arguments. -You can explore the date parsing functionality in -`date_converters.py `__ -and add your own. We would love to turn this module into a community supported -set of date/time parsers. 
To get you started, ``date_converters.py`` contains functions to parse dual date and time columns, year/month/day columns, and year/month/day/hour/minute/second columns. It also contains a ``generic_parser`` function so you can curry it with a function that deals with a single date rather than the entire array. .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.csv.mixed_timezones: @@ -975,7 +926,7 @@ a single date rather than the entire array. Parsing a CSV with mixed timezones ++++++++++++++++++++++++++++++++++ -Pandas cannot natively represent a column or index with mixed timezones. If your CSV +pandas cannot natively represent a column or index with mixed timezones. If your CSV file contains columns with a mixture of timezones, the default result will be an object-dtype column with strings, even with ``parse_dates``. .. ipython:: python content = """\ a 2000-01-01T00:00:00+05:00 2000-01-01T00:00:00+06:00""" - df = pd.read_csv(StringIO(content), parse_dates=['a']) - df['a'] + df = pd.read_csv(StringIO(content), parse_dates=["a"]) + df["a"] To parse the mixed-timezone values as a datetime column, pass a partially-applied :func:`to_datetime` with ``utc=True`` as the ``date_parser``. .. ipython:: python - df = pd.read_csv(StringIO(content), parse_dates=['a'], - date_parser=lambda col: pd.to_datetime(col, utc=True)) - df['a'] + df = pd.read_csv( + StringIO(content), + parse_dates=["a"], + date_parser=lambda col: pd.to_datetime(col, utc=True), + ) + df["a"] .. _io.dayfirst: @@ -1032,14 +986,18 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With .. ipython:: python # Try to infer the format for the index column - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, - infer_datetime_format=True) + df = pd.read_csv( + "foo.csv", + index_col=0, + parse_dates=True, + infer_datetime_format=True, + ) df .. ipython:: python :suppress: - os.remove('foo.csv') + os.remove("foo.csv") International date formats ++++++++++++++++++++++++++ @@ -1050,19 +1008,34 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python :suppress: - data = ("date,value,cat\n" - "1/6/2000,5,a\n" - "2/6/2000,10,b\n" - "3/6/2000,15,c") - with open('tmp.csv', 'w') as fh: + data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + with open("tmp.csv", "w") as fh: fh.write(data) .. ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) + + pd.read_csv("tmp.csv", parse_dates=[0]) + pd.read_csv("tmp.csv", dayfirst=True, parse_dates=[0]) + +Writing CSVs to binary file objects ++++++++++++++++++++++++++++++++++++ + +.. versionadded:: 1.2.0 + +``df.to_csv(..., mode="wb")`` allows writing a CSV to a file object +opened in binary mode. In most cases, it is not necessary to specify +``mode`` as pandas will auto-detect whether the file object is +opened in text or binary mode. + +.. ipython:: python + + import io - pd.read_csv('tmp.csv', parse_dates=[0]) - pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, encoding="utf-8", compression="gzip") .. _io.float_precision: @@ -1077,14 +1050,26 @@ writing to a file). For example: .. 
ipython:: python - val = '0.3066101993807095471566981359501369297504425048828125' - data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision='round_trip')['c'][0] - float(val)) + val = "0.3066101993807095471566981359501369297504425048828125" + data = "a,b,c\n1,2,{0}".format(val) + abs( + pd.read_csv( + StringIO(data), + engine="c", + float_precision=None, + )["c"][0] - float(val) + ) + abs( + pd.read_csv( + StringIO(data), + engine="c", + float_precision="high", + )["c"][0] - float(val) + ) + abs( + pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0] + - float(val) + ) .. _io.thousands: @@ -1099,20 +1084,22 @@ correctly: .. ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ( + "ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) By default, numbers with a thousands separator will be parsed as strings: .. ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', sep='|') + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", sep="|") df df.level.dtype @@ -1121,8 +1108,8 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', sep='|', thousands=',') + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", sep="|", thousands=",") df df.level.dtype @@ -1130,7 +1117,7 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.na_values: @@ -1155,7 +1142,7 @@ Let us consider some examples: .. code-block:: python - pd.read_csv('path_to_file.csv', na_values=[5]) + pd.read_csv("path_to_file.csv", na_values=[5]) In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in addition to the defaults. A string will first be interpreted as a numerical @@ -1163,19 +1150,19 @@ addition to the defaults. A string will first be interpreted as a numerical .. code-block:: python - pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=[""]) + pd.read_csv("path_to_file.csv", keep_default_na=False, na_values=[""]) Above, only an empty field will be recognized as ``NaN``. .. code-block:: python - pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=["NA", "0"]) + pd.read_csv("path_to_file.csv", keep_default_na=False, na_values=["NA", "0"]) Above, both ``NA`` and ``0`` as strings are ``NaN``. .. code-block:: python - pd.read_csv('path_to_file.csv', na_values=["Nope"]) + pd.read_csv("path_to_file.csv", na_values=["Nope"]) The default values, in addition to the string ``"Nope"`` are recognized as ``NaN``. @@ -1198,19 +1185,16 @@ as a ``Series``: .. ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = "level\nPatient1,123000\nPatient2,23000\nPatient3,1234018" - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. 
ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv("tmp.csv", squeeze=True) output type(output) @@ -1218,7 +1202,7 @@ as a ``Series``: .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.boolean: @@ -1232,12 +1216,10 @@ options as follows: .. ipython:: python - data = ('a,b,c\n' - '1,Yes,2\n' - '3,No,4') + data = "a,b,c\n1,Yes,2\n3,No,4" print(data) pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) + pd.read_csv(StringIO(data), true_values=["Yes"], false_values=["No"]) .. _io.bad_lines: @@ -1251,10 +1233,7 @@ too many fields will raise an error by default: .. ipython:: python :okexcept: - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6,7\n' - '8,9,10') + data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10" pd.read_csv(StringIO(data)) You can elect to skip bad lines: @@ -1294,9 +1273,7 @@ or a :class:`python:csv.Dialect` instance. .. ipython:: python :suppress: - data = ('label1,label2,label3\n' - 'index1,"a,c,e\n' - 'index2,b,d,f') + data = "label1,label2,label3\n" 'index1,"a,c,e\n' "index2,b,d,f" Suppose you had data with unenclosed quotes: @@ -1314,6 +1291,7 @@ We can get around this using ``dialect``: :okwarning: import csv + dia = csv.excel() dia.quoting = csv.QUOTE_NONE pd.read_csv(StringIO(data), dialect=dia) @@ -1322,15 +1300,15 @@ All of the dialect options can be specified separately by keyword arguments: .. ipython:: python - data = 'a,b,c~1,2,3~4,5,6' - pd.read_csv(StringIO(data), lineterminator='~') + data = "a,b,c~1,2,3~4,5,6" + pd.read_csv(StringIO(data), lineterminator="~") Another common dialect option is ``skipinitialspace``, to skip any whitespace after a delimiter: .. ipython:: python - data = 'a, b, c\n1, 2, 3\n4, 5, 6' + data = "a, b, c\n1, 2, 3\n4, 5, 6" print(data) pd.read_csv(StringIO(data), skipinitialspace=True) @@ -1352,7 +1330,7 @@ should pass the ``escapechar`` option: data = 'a,b\n"hello, \\"Bob\\", nice to see you",5' print(data) - pd.read_csv(StringIO(data), escapechar='\\') + pd.read_csv(StringIO(data), escapechar="\\") .. _io.fwf_reader: .. _io.fwf: @@ -1362,7 +1340,7 @@ Files with fixed width columns While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. The function parameters -to ``read_fwf`` are largely the same as `read_csv` with two extra parameters, and +to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and a different usage of the ``delimiter`` parameter: * ``colspecs``: A list of pairs (tuples) giving the extents of the @@ -1379,12 +1357,14 @@ a different usage of the ``delimiter`` parameter: .. ipython:: python :suppress: - f = open('bar.csv', 'w') - data1 = ("id8141 360.242940 149.910199 11950.7\n" - "id1594 444.953632 166.985655 11788.4\n" - "id1849 364.136849 183.628767 11806.2\n" - "id1230 413.836124 184.375703 11916.8\n" - "id1948 502.953953 173.237159 12468.3") + f = open("bar.csv", "w") + data1 = ( + "id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3" + ) f.write(data1) f.close() @@ -1392,16 +1372,16 @@ Consider a typical fixed-width data file: .. 
ipython:: python - print(open('bar.csv').read()) + print(open("bar.csv").read()) In order to parse this file into a ``DataFrame``, we simply need to supply the -column specifications to the `read_fwf` function along with the file name: +column specifications to the ``read_fwf`` function along with the file name: .. ipython:: python # Column specifications are a list of half-intervals colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] - df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0) + df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0) df Note how the parser automatically picks column names X. when @@ -1412,7 +1392,7 @@ column widths for contiguous columns: # Widths are a list of integers widths = [6, 14, 13, 10] - df = pd.read_fwf('bar.csv', widths=widths, header=None) + df = pd.read_fwf("bar.csv", widths=widths, header=None) df The parser will take care of extra white spaces around the columns @@ -1425,7 +1405,7 @@ is whitespace). .. ipython:: python - df = pd.read_fwf('bar.csv', header=None, index_col=0) + df = pd.read_fwf("bar.csv", header=None, index_col=0) df ``read_fwf`` supports the ``dtype`` parameter for specifying the types of @@ -1433,13 +1413,13 @@ parsed columns to be different from the inferred type. .. ipython:: python - pd.read_fwf('bar.csv', header=None, index_col=0).dtypes - pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes + pd.read_fwf("bar.csv", header=None, index_col=0).dtypes + pd.read_fwf("bar.csv", header=None, dtype={2: "object"}).dtypes .. ipython:: python :suppress: - os.remove('bar.csv') + os.remove("bar.csv") Indexes @@ -1451,8 +1431,8 @@ Files with an "implicit" index column .. ipython:: python :suppress: - f = open('foo.csv', 'w') - f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f = open("foo.csv", "w") + f.write("A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() Consider a file with one less entry in the header than the number of data @@ -1460,27 +1440,27 @@ column: .. ipython:: python - print(open('foo.csv').read()) + print(open("foo.csv").read()) In this special case, ``read_csv`` assumes that the first column is to be used as the index of the ``DataFrame``: .. ipython:: python - pd.read_csv('foo.csv') + pd.read_csv("foo.csv") Note that the dates weren't automatically parsed. In that case you would need to do as before: .. ipython:: python - df = pd.read_csv('foo.csv', parse_dates=True) + df = pd.read_csv("foo.csv", parse_dates=True) df.index .. ipython:: python :suppress: - os.remove('foo.csv') + os.remove("foo.csv") Reading an index with a ``MultiIndex`` @@ -1492,7 +1472,7 @@ Suppose you have data indexed by two columns: .. ipython:: python - print(open('data/mindex_ex.csv').read()) + print(open("data/mindex_ex.csv").read()) The ``index_col`` argument to ``read_csv`` can take a list of column numbers to turn multiple columns into a ``MultiIndex`` for the index of the @@ -1516,10 +1496,11 @@ rows will skip the intervening rows. .. ipython:: python from pandas._testing import makeCustomDataframe as mkdf + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - df.to_csv('mi.csv') - print(open('mi.csv').read()) - pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv("mi.csv") + print(open("mi.csv").read()) + pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) ``read_csv`` is also able to interpret a more common format of multi-columns indices. @@ -1528,14 +1509,14 @@ of multi-columns indices. 
.. ipython:: python
:suppress: data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" - fh = open('mi2.csv', 'w') + fh = open("mi2.csv", "w") fh.write(data) fh.close() .. ipython:: python - print(open('mi2.csv').read()) - pd.read_csv('mi2.csv', header=[0, 1], index_col=0) + print(open("mi2.csv").read()) + pd.read_csv("mi2.csv", header=[0, 1], index_col=0) Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will be *lost*. @@ -1543,8 +1524,8 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - os.remove('mi.csv') - os.remove('mi2.csv') + os.remove("mi.csv") + os.remove("mi2.csv") .. _io.sniff: @@ -1559,13 +1540,13 @@ class of the csv module. For this, you have to specify ``sep=None``. :suppress: df = pd.DataFrame(np.random.randn(10, 4)) - df.to_csv('tmp.sv', sep='|') - df.to_csv('tmp2.sv', sep=':') + df.to_csv("tmp.sv", sep="|") + df.to_csv("tmp2.sv", sep=":") .. ipython:: python - print(open('tmp2.sv').read()) - pd.read_csv('tmp2.sv', sep=None, engine='python') + print(open("tmp2.sv").read()) + pd.read_csv("tmp2.sv", sep=None, engine="python") .. _io.multiple_files: @@ -1586,8 +1567,8 @@ rather than reading the entire file into memory, such as the following: .. ipython:: python - print(open('tmp.sv').read()) - table = pd.read_csv('tmp.sv', sep='|') + print(open("tmp.sv").read()) + table = pd.read_csv("tmp.sv", sep="|") table @@ -1596,25 +1577,27 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - reader = pd.read_csv('tmp.sv', sep='|', chunksize=4) - reader + with pd.read_csv("tmp.sv", sep="|", chunksize=4) as reader: + reader + for chunk in reader: + print(chunk) - for chunk in reader: - print(chunk) +.. versionchanged:: 1.2 + ``read_csv/json/sas`` return a context-manager when iterating through a file. Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - reader = pd.read_csv('tmp.sv', sep='|', iterator=True) - reader.get_chunk(5) + with pd.read_csv("tmp.sv", sep="|", iterator=True) as reader: + reader.get_chunk(5) .. ipython:: python :suppress: - os.remove('tmp.sv') - os.remove('tmp2.sv') + os.remove("tmp.sv") + os.remove("tmp2.sv") Specifying the parser engine '''''''''''''''''''''''''''' @@ -1632,29 +1615,75 @@ options include: Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. -Reading remote files -'''''''''''''''''''' +.. _io.remote: + +Reading/writing remote files +'''''''''''''''''''''''''''' -You can pass in a URL to a CSV file: +You can pass in a URL to read or write remote files to many of pandas' IO +functions - the following example shows reading a CSV file: .. code-block:: python - df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', - sep='\t') + df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t") -S3 URLs are handled as well but require installing the `S3Fs +All URLs which are not local files or HTTP(s) are handled by +`fsspec`_, if installed, and its various filesystem implementations +(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...). +Some of these implementations will require additional packages to be +installed, for example +S3 URLs require the `s3fs `_ library: .. 
code-block:: python

-    df = pd.read_csv('s3://pandas-test/tips.csv')
+    df = pd.read_json("s3://pandas-test/adatafile.json")

-If your S3 bucket requires credentials you will need to set them as environment
-variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs
-documentation on credentials
-`_.
+When dealing with remote storage systems, you might need
+extra configuration with environment variables or config files in
+special locations. For example, to access data in your S3 bucket,
+you will need to define credentials in one of the several ways listed in
+the `S3Fs documentation
+`_. The same is true
+for several of the storage backends, and you should follow the links
+at `fsimpl1`_ for implementations built into ``fsspec`` and `fsimpl2`_
+for those not included in the main ``fsspec``
+distribution.
+
+You can also pass parameters directly to the backend driver. For example,
+if you do *not* have S3 credentials, you can still access public data by
+specifying an anonymous connection, such as:
+
+.. versionadded:: 1.2.0
+
+.. code-block:: python
+
+   pd.read_csv(
+       "s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
+       "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
+       storage_options={"anon": True},
+   )
+
+``fsspec`` also allows complex URLs, for accessing data in compressed
+archives, local caching of files, and more. To locally cache the above
+example, you would modify the call to
+
+.. code-block:: python
+
+   pd.read_csv(
+       "simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/"
+       "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
+       storage_options={"s3": {"anon": True}},
+   )
+
+where we specify that the "anon" parameter is meant for the "s3" part of
+the implementation, not for the caching implementation. Note that this
+caches to a temporary directory for the duration of the session only,
+but you can also specify a permanent store.
+
+.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
+.. _fsimpl1: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
+.. _fsimpl2: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations

 Writing out data
 ''''''''''''''''

@@ -1668,7 +1697,7 @@ The ``Series`` and ``DataFrame`` objects have an instance method ``to_csv`` whic
 allows storing the contents of the object as a comma-separated-values
 file. The function takes a number of arguments. Only the first is required.

-* ``path_or_buf``: A string path to the file to write or a file object. If a file object it must be opened with `newline=''`
+* ``path_or_buf``: A string path to the file to write or a file object. If a file object, it must be opened with ``newline=''``
 * ``sep`` : Field delimiter for the output file (default ",")
 * ``na_rep``: A string representation of a missing value (default '')
 * ``float_format``: Format string for floating point numbers
@@ -1676,13 +1705,13 @@ function takes a number of arguments. Only the first is required.
 * ``header``: Whether to write out the column names (default True)
 * ``index``: whether to write row (index) names (default True)
 * ``index_label``: Column label(s) for index column(s) if desired. If None
-  (default), and `header` and `index` are True, then the index names are
+  (default), and ``header`` and ``index`` are True, then the index names are
   used. (A sequence should be given if the ``DataFrame`` uses MultiIndex).
* ``mode`` : Python write mode, default 'w' * ``encoding``: a string representing the encoding to use if the contents are non-ASCII, for Python versions prior to 3 -* ``line_terminator``: Character sequence denoting line end (default `os.linesep`) -* ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). Note that if you have set a `float_format` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric +* ``line_terminator``: Character sequence denoting line end (default ``os.linesep``) +* ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). Note that if you have set a ``float_format`` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric * ``quotechar``: Character used to quote fields (default '"') * ``doublequote``: Control quoting of ``quotechar`` in fields (default True) * ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when @@ -1769,7 +1798,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet .. ipython:: python - dfj = pd.DataFrame(np.random.randn(5, 2), columns=list('AB')) + dfj = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) json = dfj.to_json() json @@ -1781,10 +1810,13 @@ file / string. Consider the following ``DataFrame`` and ``Series``: .. ipython:: python - dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), - columns=list('ABC'), index=list('xyz')) + dfjo = pd.DataFrame( + dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), + columns=list("ABC"), + index=list("xyz"), + ) dfjo - sjo = pd.Series(dict(x=15, y=16, z=17), name='D') + sjo = pd.Series(dict(x=15, y=16, z=17), name="D") sjo **Column oriented** (the default for ``DataFrame``) serializes the data as @@ -1835,7 +1867,7 @@ preservation of metadata including but not limited to dtypes and index names. Any orient option that encodes to a JSON object will not preserve the ordering of index and column labels during round-trip serialization. If you wish to preserve - label ordering use the `split` option as it uses ordered containers. + label ordering use the ``split`` option as it uses ordered containers. Date handling +++++++++++++ @@ -1844,24 +1876,24 @@ Writing in ISO date format: .. ipython:: python - dfd = pd.DataFrame(np.random.randn(5, 2), columns=list('AB')) - dfd['date'] = pd.Timestamp('20130101') + dfd = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) + dfd["date"] = pd.Timestamp("20130101") dfd = dfd.sort_index(1, ascending=False) - json = dfd.to_json(date_format='iso') + json = dfd.to_json(date_format="iso") json Writing in ISO date format, with microseconds: .. ipython:: python - json = dfd.to_json(date_format='iso', date_unit='us') + json = dfd.to_json(date_format="iso", date_unit="us") json Epoch timestamps, in seconds: .. ipython:: python - json = dfd.to_json(date_format='epoch', date_unit='s') + json = dfd.to_json(date_format="epoch", date_unit="s") json Writing to a file, with a date index and a date column: @@ -1869,13 +1901,13 @@ Writing to a file, with a date index and a date column: .. 
ipython:: python dfj2 = dfj.copy() - dfj2['date'] = pd.Timestamp('20130101') - dfj2['ints'] = list(range(5)) - dfj2['bools'] = True - dfj2.index = pd.date_range('20130101', periods=5) - dfj2.to_json('test.json') + dfj2["date"] = pd.Timestamp("20130101") + dfj2["ints"] = list(range(5)) + dfj2["bools"] = True + dfj2.index = pd.date_range("20130101", periods=5) + dfj2.to_json("test.json") - with open('test.json') as fh: + with open("test.json") as fh: print(fh.read()) Fallback behavior @@ -2010,26 +2042,27 @@ Reading from a file: .. ipython:: python - pd.read_json('test.json') + pd.read_json("test.json") Don't convert any data (but still convert axes and dates): .. ipython:: python - pd.read_json('test.json', dtype=object).dtypes + pd.read_json("test.json", dtype=object).dtypes Specify dtypes for conversion: .. ipython:: python - pd.read_json('test.json', dtype={'A': 'float32', 'bools': 'int8'}).dtypes + pd.read_json("test.json", dtype={"A": "float32", "bools": "int8"}).dtypes Preserve string indices: .. ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame( + np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)] + ) si si.index si.columns @@ -2044,10 +2077,10 @@ Dates written in nanoseconds need to be read back in nanoseconds: .. ipython:: python - json = dfj2.to_json(date_unit='ns') + json = dfj2.to_json(date_unit="ns") # Try to parse timestamps as milliseconds -> Won't Work - dfju = pd.read_json(json, date_unit='ms') + dfju = pd.read_json(json, date_unit="ms") dfju # Let pandas detect the correct precision @@ -2055,7 +2088,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: dfju # Or specify that all timestamps are in nanoseconds - dfju = pd.read_json(json, date_unit='ns') + dfju = pd.read_json(json, date_unit="ns") dfju The Numpy parameter @@ -2077,7 +2110,7 @@ data: randfloats = np.random.uniform(-100, 1000, 10000) randfloats.shape = (1000, 10) - dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ')) + dffloats = pd.DataFrame(randfloats, columns=list("ABCDEFGHIJ")) jsonfloats = dffloats.to_json() @@ -2124,7 +2157,7 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - os.remove('test.json') + os.remove("test.json") .. _io.json_normalize: @@ -2136,38 +2169,54 @@ into a flat table. .. ipython:: python - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'Faye Raker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": "Faye Raker"}, + ] pd.json_normalize(data) .. 
ipython:: python - data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': {'governor': 'Rick Scott'}, - 'county': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': {'governor': 'John Kasich'}, - 'county': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] - - pd.json_normalize(data, 'county', ['state', 'shortname', ['info', 'governor']]) + data = [ + { + "state": "Florida", + "shortname": "FL", + "info": {"governor": "Rick Scott"}, + "county": [ + {"name": "Dade", "population": 12345}, + {"name": "Broward", "population": 40000}, + {"name": "Palm Beach", "population": 60000}, + ], + }, + { + "state": "Ohio", + "shortname": "OH", + "info": {"governor": "John Kasich"}, + "county": [ + {"name": "Summit", "population": 1234}, + {"name": "Cuyahoga", "population": 1337}, + ], + }, + ] + + pd.json_normalize(data, "county", ["state", "shortname", ["info", "governor"]]) The max_level parameter provides more control over which level to end normalization. With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict. .. ipython:: python - data = [{'CreatedBy': {'Name': 'User001'}, - 'Lookup': {'TextField': 'Some text', - 'UserField': {'Id': 'ID001', - 'Name': 'Name001'}}, - 'Image': {'a': 'b'} - }] + data = [ + { + "CreatedBy": {"Name": "User001"}, + "Lookup": { + "TextField": "Some text", + "UserField": {"Id": "ID001", "Name": "Name001"}, + }, + "Image": {"a": "b"}, + } + ] pd.json_normalize(data, max_level=1) .. _io.jsonl: @@ -2182,19 +2231,19 @@ For line-delimited json files, pandas can also return an iterator which reads in .. ipython:: python - jsonl = ''' + jsonl = """ {"a": 1, "b": 2} {"a": 3, "b": 4} - ''' + """ df = pd.read_json(jsonl, lines=True) df - df.to_json(orient='records', lines=True) + df.to_json(orient="records", lines=True) - # reader is an iterator that returns `chunksize` lines each iteration - reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) - reader - for chunk in reader: - print(chunk) + # reader is an iterator that returns ``chunksize`` lines each iteration + with pd.read_json(StringIO(jsonl), lines=True, chunksize=1) as reader: + reader + for chunk in reader: + print(chunk) .. _io.table_schema: @@ -2208,12 +2257,16 @@ a JSON string with two fields, ``schema`` and ``data``. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) + df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["a", "b", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=3), + }, + index=pd.Index(range(3), name="idx"), + ) df - df.to_json(orient='table', date_format="iso") + df.to_json(orient="table", date_format="iso") The ``schema`` field contains the ``fields`` key, which itself contains a list of column name to type pairs, including the ``Index`` or ``MultiIndex`` @@ -2230,7 +2283,7 @@ The full list of types supported are described in the Table Schema spec. This table shows the mapping from pandas types: =============== ================= -Pandas type Table Schema type +pandas type Table Schema type =============== ================= int64 integer float64 number @@ -2252,7 +2305,8 @@ A few notes on the generated table schema: .. 
ipython:: python from pandas.io.json import build_table_schema - s = pd.Series(pd.date_range('2016', periods=4)) + + s = pd.Series(pd.date_range("2016", periods=4)) build_table_schema(s) * datetimes with a timezone (before serializing), include an additional field @@ -2260,8 +2314,7 @@ A few notes on the generated table schema: .. ipython:: python - s_tz = pd.Series(pd.date_range('2016', periods=12, - tz='US/Central')) + s_tz = pd.Series(pd.date_range("2016", periods=12, tz="US/Central")) build_table_schema(s_tz) * Periods are converted to timestamps before serialization, and so have the @@ -2270,8 +2323,7 @@ A few notes on the generated table schema: .. ipython:: python - s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC', - periods=4)) + s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4)) build_table_schema(s_per) * Categoricals use the ``any`` type and an ``enum`` constraint listing @@ -2279,7 +2331,7 @@ A few notes on the generated table schema: .. ipython:: python - s_cat = pd.Series(pd.Categorical(['a', 'b', 'a'])) + s_cat = pd.Series(pd.Categorical(["a", "b", "a"])) build_table_schema(s_cat) * A ``primaryKey`` field, containing an array of labels, is included @@ -2295,8 +2347,7 @@ A few notes on the generated table schema: .. ipython:: python - s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'), - (0, 1)])) + s_multi = pd.Series(1, index=pd.MultiIndex.from_product([("a", "b"), (0, 1)])) build_table_schema(s_multi) * The default naming roughly follows these rules: @@ -2310,24 +2361,26 @@ A few notes on the generated table schema: then ``level_`` is used. -.. versionadded:: 0.23.0 - ``read_json`` also accepts ``orient='table'`` as an argument. This allows for the preservation of metadata such as dtypes and index names in a round-trippable manner. .. ipython:: python - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c']) - }, index=pd.Index(range(4), name='idx')) + df = pd.DataFrame( + { + "foo": [1, 2, 3, 4], + "bar": ["a", "b", "c", "d"], + "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "qux": pd.Categorical(["a", "b", "c", "c"]), + }, + index=pd.Index(range(4), name="idx"), + ) df df.dtypes - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') + df.to_json("test.json", orient="table") + new_df = pd.read_json("test.json", orient="table") new_df new_df.dtypes @@ -2339,17 +2392,17 @@ indicate missing values and the subsequent read cannot distinguish the intent. .. ipython:: python :okwarning: - df.index.name = 'index' - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') + df.index.name = "index" + df.to_json("test.json", orient="table") + new_df = pd.read_json("test.json", orient="table") print(new_df.index.name) .. ipython:: python :suppress: - os.remove('test.json') + os.remove("test.json") -.. _Table Schema: https://specs.frictionlessdata.io/json-table-schema/ +.. _Table Schema: https://specs.frictionlessdata.io/table-schema/ HTML ---- @@ -2377,7 +2430,7 @@ Read a URL with no options: .. ipython:: python - url = 'https://www.fdic.gov/bank/individual/failed/banklist.html' + url = "https://www.fdic.gov/bank/individual/failed/banklist.html" dfs = pd.read_html(url) dfs @@ -2392,11 +2445,11 @@ as a string: .. 
ipython:: python :suppress: - file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) + file_path = os.path.abspath(os.path.join("source", "_static", "banklist.html")) .. ipython:: python - with open(file_path, 'r') as f: + with open(file_path, "r") as f: dfs = pd.read_html(f.read()) dfs @@ -2404,7 +2457,7 @@ You can even pass in an instance of ``StringIO`` if you so desire: .. ipython:: python - with open(file_path, 'r') as f: + with open(file_path, "r") as f: sio = StringIO(f.read()) dfs = pd.read_html(sio) @@ -2423,7 +2476,7 @@ Read a URL and match a table that contains specific text: .. code-block:: python - match = 'Metcalf Bank' + match = "Metcalf Bank" df_list = pd.read_html(url, match=match) Specify a header row (by default ```` or ```` elements located within a @@ -2458,15 +2511,15 @@ Specify an HTML attribute: .. code-block:: python - dfs1 = pd.read_html(url, attrs={'id': 'table'}) - dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) + dfs1 = pd.read_html(url, attrs={"id": "table"}) + dfs2 = pd.read_html(url, attrs={"class": "sortable"}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True Specify values that should be converted to NaN: .. code-block:: python - dfs = pd.read_html(url, na_values=['No Acquirer']) + dfs = pd.read_html(url, na_values=["No Acquirer"]) Specify whether to keep the default set of NaN values: @@ -2481,22 +2534,26 @@ columns to strings. .. code-block:: python - url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' - dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, - converters={'MNC': str}) + url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" + dfs = pd.read_html( + url_mcc, + match="Telekom Albania", + header=0, + converters={"MNC": str}, + ) Use some combination of the above: .. code-block:: python - dfs = pd.read_html(url, match='Metcalf Bank', index_col=0) + dfs = pd.read_html(url, match="Metcalf Bank", index_col=0) Read in pandas ``to_html`` output (with some loss of floating point precision): .. code-block:: python df = pd.DataFrame(np.random.randn(2, 2)) - s = df.to_html(float_format='{0:.40g}'.format) + s = df.to_html(float_format="{0:.40g}".format) dfin = pd.read_html(s, index_col=0) The ``lxml`` backend will raise an error on a failed parse if that is the only @@ -2506,13 +2563,13 @@ for example, the function expects a sequence of strings. You may use: .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml"]) Or you could pass ``flavor='lxml'`` without a list: .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor="lxml") However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', 'bs4']`` then the parse will most likely succeed. Note that *as soon as a parse @@ -2520,7 +2577,7 @@ succeeds, the function will return*. .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) .. _io.html: @@ -2542,8 +2599,8 @@ in the method ``to_string`` described above. 
.. ipython:: python
:suppress: def write_html(df, filename, *args, **kwargs): - static = os.path.abspath(os.path.join('source', '_static')) - with open(os.path.join(static, filename + '.html'), 'w') as f: + static = os.path.abspath(os.path.join("source", "_static")) + with open(os.path.join(static, filename + ".html"), "w") as f: df.to_html(f, *args, **kwargs) .. ipython:: python @@ -2555,7 +2612,7 @@ in the method ``to_string`` described above. .. ipython:: python :suppress: - write_html(df, 'basic') + write_html(df, "basic") HTML: @@ -2571,7 +2628,7 @@ The ``columns`` argument will limit the columns shown: .. ipython:: python :suppress: - write_html(df, 'columns', columns=[0]) + write_html(df, "columns", columns=[0]) HTML: @@ -2583,12 +2640,12 @@ point values: .. ipython:: python - print(df.to_html(float_format='{0:.10f}'.format)) + print(df.to_html(float_format="{0:.10f}".format)) .. ipython:: python :suppress: - write_html(df, 'float_format', float_format='{0:.10f}'.format) + write_html(df, "float_format", float_format="{0:.10f}".format) HTML: @@ -2605,7 +2662,7 @@ off: .. ipython:: python :suppress: - write_html(df, 'nobold', bold_rows=False) + write_html(df, "nobold", bold_rows=False) .. raw:: html :file: ../_static/nobold.html @@ -2616,7 +2673,7 @@ table CSS classes. Note that these classes are *appended* to the existing .. ipython:: python - print(df.to_html(classes=['awesome_table_class', 'even_more_awesome_class'])) + print(df.to_html(classes=["awesome_table_class", "even_more_awesome_class"])) The ``render_links`` argument provides the ability to add hyperlinks to cells that contain URLs. @@ -2625,15 +2682,18 @@ that contain URLs. .. ipython:: python - url_df = pd.DataFrame({ - 'name': ['Python', 'Pandas'], - 'url': ['https://www.python.org/', 'https://pandas.pydata.org']}) + url_df = pd.DataFrame( + { + "name": ["Python", "pandas"], + "url": ["https://www.python.org/", "https://pandas.pydata.org"], + } + ) print(url_df.to_html(render_links=True)) .. ipython:: python :suppress: - write_html(url_df, 'render_links', render_links=True) + write_html(url_df, "render_links", render_links=True) HTML: @@ -2646,14 +2706,14 @@ Finally, the ``escape`` argument allows you to control whether the .. ipython:: python - df = pd.DataFrame({'a': list('&<>'), 'b': np.random.randn(3)}) + df = pd.DataFrame({"a": list("&<>"), "b": np.random.randn(3)}) .. ipython:: python :suppress: - write_html(df, 'escape') - write_html(df, 'noescape', escape=False) + write_html(df, "escape") + write_html(df, "noescape", escape=False) Escaped: @@ -2780,7 +2840,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', sheet_name='Sheet1') + pd.read_excel("path_to_file.xls", sheet_name="Sheet1") .. _io.excel.excelfile_class: @@ -2795,16 +2855,16 @@ read into memory only once. .. code-block:: python - xlsx = pd.ExcelFile('path_to_file.xls') - df = pd.read_excel(xlsx, 'Sheet1') + xlsx = pd.ExcelFile("path_to_file.xls") + df = pd.read_excel(xlsx, "Sheet1") The ``ExcelFile`` class can also be used as a context manager. .. code-block:: python - with pd.ExcelFile('path_to_file.xls') as xls: - df1 = pd.read_excel(xls, 'Sheet1') - df2 = pd.read_excel(xls, 'Sheet2') + with pd.ExcelFile("path_to_file.xls") as xls: + df1 = pd.read_excel(xls, "Sheet1") + df2 = pd.read_excel(xls, "Sheet2") The ``sheet_names`` property will generate a list of the sheet names in the file. 
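For example, a quick sketch of inspecting the available sheets before
deciding what to read (``path_to_file.xls`` is the hypothetical workbook
used throughout this section):

.. code-block:: python

   with pd.ExcelFile("path_to_file.xls") as xls:
       print(xls.sheet_names)  # e.g. ["Sheet1", "Sheet2"]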
@@ -2816,10 +2876,9 @@ different parameters: data = {} # For when Sheet1's format differs from Sheet2 - with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=1) + with pd.ExcelFile("path_to_file.xls") as xls: + data["Sheet1"] = pd.read_excel(xls, "Sheet1", index_col=None, na_values=["NA"]) + data["Sheet2"] = pd.read_excel(xls, "Sheet2", index_col=1) Note that if the same parsing parameters are used for all sheets, a list of sheet names can simply be passed to ``read_excel`` with no loss in performance. @@ -2828,15 +2887,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # using the ExcelFile class data = {} - with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=None, - na_values=['NA']) + with pd.ExcelFile("path_to_file.xls") as xls: + data["Sheet1"] = pd.read_excel(xls, "Sheet1", index_col=None, na_values=["NA"]) + data["Sheet2"] = pd.read_excel(xls, "Sheet2", index_col=None, na_values=["NA"]) # equivalent using the read_excel function - data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], - index_col=None, na_values=['NA']) + data = pd.read_excel( + "path_to_file.xls", ["Sheet1", "Sheet2"], index_col=None, na_values=["NA"] + ) ``ExcelFile`` can also be called with a ``xlrd.book.Book`` object as a parameter. This allows the user to control how the excel file is read. @@ -2846,10 +2904,11 @@ with ``on_demand=True``. .. code-block:: python import xlrd - xlrd_book = xlrd.open_workbook('path_to_file.xls', on_demand=True) + + xlrd_book = xlrd.open_workbook("path_to_file.xls", on_demand=True) with pd.ExcelFile(xlrd_book) as xls: - df1 = pd.read_excel(xls, 'Sheet1') - df2 = pd.read_excel(xls, 'Sheet2') + df1 = pd.read_excel(xls, "Sheet1") + df2 = pd.read_excel(xls, "Sheet2") .. _io.excel.specifying_sheets: @@ -2871,35 +2930,35 @@ Specifying sheets .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) Using the sheet index: .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + pd.read_excel("path_to_file.xls", 0, index_col=None, na_values=["NA"]) Using all default values: .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls') + pd.read_excel("path_to_file.xls") Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - pd.read_excel('path_to_file.xls', sheet_name=None) + pd.read_excel("path_to_file.xls", sheet_name=None) Using a list to get multiple sheets: .. code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - pd.read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) + pd.read_excel("path_to_file.xls", sheet_name=["Sheet1", 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. @@ -2920,10 +2979,12 @@ For example, to read in a ``MultiIndex`` index without names: .. 
ipython:: python - df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) + df = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([["a", "b"], ["c", "d"]]), + ) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df If the index has level names, they will parsed as well, using the same @@ -2931,9 +2992,9 @@ parameters. .. ipython:: python - df.index = df.index.set_names(['lvl1', 'lvl2']) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) + df.index = df.index.set_names(["lvl1", "lvl2"]) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df @@ -2942,16 +3003,15 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], - names=['c1', 'c2']) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) + df.columns = pd.MultiIndex.from_product([["a"], ["b", "d"]], names=["c1", "c2"]) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], header=[0, 1]) df .. ipython:: python :suppress: - os.remove('path_to_file.xlsx') + os.remove("path_to_file.xlsx") Parsing specific columns @@ -2961,30 +3021,23 @@ It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. ``read_excel`` takes a ``usecols`` keyword to allow you to specify a subset of columns to parse. -.. deprecated:: 0.24.0 +.. versionchanged:: 1.0.0 -Passing in an integer for ``usecols`` has been deprecated. Please pass in a list +Passing in an integer for ``usecols`` will no longer work. Please pass in a list of ints from 0 to ``usecols`` inclusive instead. -If ``usecols`` is an integer, then it is assumed to indicate the last column -to be parsed. +You can specify a comma-delimited set of Excel columns and ranges as a string: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=2) - -You can also specify a comma-delimited set of Excel columns and ranges as a string: - -.. code-block:: python - - pd.read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') + pd.read_excel("path_to_file.xls", "Sheet1", usecols="A,C:E") If ``usecols`` is a list of integers, then it is assumed to be the file column indices to be parsed. .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=[0, 2, 3]) Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. @@ -2996,7 +3049,7 @@ document header row(s). Those strings define which columns will be parsed: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=["foo", "bar"]) Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. @@ -3007,7 +3060,7 @@ the column names, returning names where the callable function evaluates to ``Tru .. 
code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=lambda x: x.isalpha()) Parsing dates +++++++++++++ @@ -3019,7 +3072,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + pd.read_excel("path_to_file.xls", "Sheet1", parse_dates=["date_strings"]) Cell converters @@ -3030,7 +3083,7 @@ option. For instance, to convert a column to boolean: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + pd.read_excel("path_to_file.xls", "Sheet1", converters={"MyBools": bool}) This options handles missing values and treats exceptions in the converters as missing data. Transformations are applied cell by cell rather than to the @@ -3045,19 +3098,19 @@ missing data to recover integer dtype: return int(x) if x else -1 - pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + pd.read_excel("path_to_file.xls", "Sheet1", converters={"MyInts": cfun}) Dtype specifications ++++++++++++++++++++ As an alternative to converters, the type for an entire column can -be specified using the `dtype` keyword, which takes a dictionary +be specified using the ``dtype`` keyword, which takes a dictionary mapping column names to types. To interpret data with no type inference, use the type ``str`` or ``object``. .. code-block:: python - pd.read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + pd.read_excel("path_to_file.xls", dtype={"MyInts": "int64", "MyText": str}) .. _io.excel_writer: @@ -3075,7 +3128,7 @@ written. For example: .. code-block:: python - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1") Files with a ``.xls`` extension will be written using ``xlwt`` and those with a ``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or @@ -3088,16 +3141,16 @@ row instead of the first. You can place it in the first row by setting the .. code-block:: python - df.to_excel('path_to_file.xlsx', index_label='label', merge_cells=False) + df.to_excel("path_to_file.xlsx", index_label="label", merge_cells=False) In order to write separate ``DataFrames`` to separate sheets in a single Excel file, one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python - with pd.ExcelWriter('path_to_file.xlsx') as writer: - df1.to_excel(writer, sheet_name='Sheet1') - df2.to_excel(writer, sheet_name='Sheet2') + with pd.ExcelWriter("path_to_file.xlsx") as writer: + df1.to_excel(writer, sheet_name="Sheet1") + df2.to_excel(writer, sheet_name="Sheet2") .. note:: @@ -3113,7 +3166,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. Writing Excel files to memory +++++++++++++++++++++++++++++ -Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or +pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or ``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python @@ -3123,8 +3176,8 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. 
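# (the workbook is written into the in-memory bio buffer rather than to disk)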
- writer = pd.ExcelWriter(bio, engine='xlsxwriter') - df.to_excel(writer, sheet_name='Sheet1') + writer = pd.ExcelWriter(bio, engine="xlsxwriter") + df.to_excel(writer, sheet_name="Sheet1") # Save the workbook writer.save() @@ -3147,7 +3200,14 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` Excel writer engines '''''''''''''''''''' -Pandas chooses an Excel writer via two methods: +.. deprecated:: 1.2.0 + + As the `xlwt `__ package is no longer + maintained, the ``xlwt`` engine will be removed from a future version + of pandas. This is the only engine in pandas that supports writing to + ``.xls`` files. + +pandas chooses an Excel writer via two methods: 1. the ``engine`` keyword argument 2. the filename extension (via the default specified in config options) @@ -3173,16 +3233,17 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: .. code-block:: python # By setting the 'engine' in the DataFrame 'to_excel()' methods. - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1", engine="xlsxwriter") # By setting the 'engine' in the ExcelWriter constructor. - writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter("path_to_file.xlsx", engine="xlsxwriter") # Or via pandas configuration. from pandas import options # noqa: E402 - options.io.excel.xlsx.writer = 'xlsxwriter' - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + options.io.excel.xlsx.writer = "xlsxwriter" + + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1") .. _io.excel.style: @@ -3213,7 +3274,7 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.ods', engine='odf') + pd.read_excel("path_to_file.ods", engine="odf") .. note:: @@ -3236,7 +3297,7 @@ in files and will return floats instead. .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xlsb', engine='pyxlsb') + pd.read_excel("path_to_file.xlsb", engine="pyxlsb") .. note:: @@ -3279,10 +3340,10 @@ applications (CTRL-V on many operating systems). Here we illustrate writing a .. code-block:: python - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [4, 5, 6], - ... 'C': ['p', 'q', 'r']}, - ... index=['x', 'y', 'z']) + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3], "B": [4, 5, 6], "C": ["p", "q", "r"]}, index=["x", "y", "z"] + ... ) + >>> df A B C x 1 4 p @@ -3312,7 +3373,7 @@ All pandas objects are equipped with ``to_pickle`` methods which use Python's .. ipython:: python df - df.to_pickle('foo.pkl') + df.to_pickle("foo.pkl") The ``read_pickle`` function in the ``pandas`` namespace can be used to load any pickled pandas object (or any other pickled object) from file: @@ -3320,12 +3381,12 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python - pd.read_pickle('foo.pkl') + pd.read_pickle("foo.pkl") .. ipython:: python :suppress: - os.remove('foo.pkl') + os.remove("foo.pkl") .. warning:: @@ -3359,10 +3420,13 @@ the underlying compression library. .. 
ipython:: python - df = pd.DataFrame({ - 'A': np.random.randn(1000), - 'B': 'foo', - 'C': pd.date_range('20130101', periods=1000, freq='s')}) + df = pd.DataFrame( + { + "A": np.random.randn(1000), + "B": "foo", + "C": pd.date_range("20130101", periods=1000, freq="s"), + } + ) df Using an explicit compression type: @@ -3397,10 +3461,7 @@ Passing options to the compression protocol in order to speed up compression: .. ipython:: python - df.to_pickle( - "data.pkl.gz", - compression={"method": "gzip", 'compresslevel': 1} - ) + df.to_pickle("data.pkl.gz", compression={"method": "gzip", "compresslevel": 1}) .. ipython:: python :suppress: @@ -3421,11 +3482,13 @@ Example pyarrow usage: .. code-block:: python - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({'A': [1, 2, 3]}) - >>> context = pa.default_serialization_context() - >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes() + import pandas as pd + import pyarrow as pa + + df = pd.DataFrame({"A": [1, 2, 3]}) + + context = pa.default_serialization_context() + df_bytestring = context.serialize(df).to_buffer().to_pybytes() For documentation on pyarrow, see `here `__. @@ -3441,20 +3504,21 @@ for some advanced strategies .. warning:: - pandas requires ``PyTables`` >= 3.0.0. - There is a indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index. - If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. - Stores created previously will need to be rewritten using the updated version. + pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle. Loading pickled data received from + untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. .. ipython:: python :suppress: :okexcept: - os.remove('store.h5') + os.remove("store.h5") .. ipython:: python - store = pd.HDFStore('store.h5') + store = pd.HDFStore("store.h5") print(store) Objects can be written to the file just like adding key-value pairs to a @@ -3462,15 +3526,14 @@ dict: .. ipython:: python - index = pd.date_range('1/1/2000', periods=8) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = pd.DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.date_range("1/1/2000", periods=8) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) # store.put('s', s) is an equivalent method - store['s'] = s + store["s"] = s - store['df'] = df + store["df"] = df store @@ -3479,7 +3542,7 @@ In a current or later Python session, you can retrieve stored objects: .. ipython:: python # store.get('df') is an equivalent method - store['df'] + store["df"] # dotted (attribute) access provides get as well store.df @@ -3489,7 +3552,7 @@ Deletion of the object specified by the key: .. ipython:: python # store.remove('df') is an equivalent method - del store['df'] + del store["df"] store @@ -3502,14 +3565,14 @@ Closing a Store and using a context manager: store.is_open # Working with, and automatically closing the store using a context manager - with pd.HDFStore('store.h5') as store: + with pd.HDFStore("store.h5") as store: store.keys() .. ipython:: python :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") @@ -3521,15 +3584,15 @@ similar to how ``read_csv`` and ``to_csv`` work. .. 
ipython:: python - df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) - df_tl.to_hdf('store_tl.h5', 'table', append=True) - pd.read_hdf('store_tl.h5', 'table', where=['index>2']) + df_tl = pd.DataFrame({"A": list(range(5)), "B": list(range(5))}) + df_tl.to_hdf("store_tl.h5", "table", append=True) + pd.read_hdf("store_tl.h5", "table", where=["index>2"]) .. ipython:: python :suppress: :okexcept: - os.remove('store_tl.h5') + os.remove("store_tl.h5") HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. @@ -3537,24 +3600,28 @@ HDFStore will by default not drop rows that are all missing. This behavior can b .. ipython:: python - df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], - 'col2': [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame( + { + "col1": [0, np.nan, 2], + "col2": [1, np.nan, np.nan], + } + ) df_with_missing - df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") - pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf("file.h5", "df_with_missing") - df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) - pd.read_hdf('file.h5', 'df_with_missing') + df_with_missing.to_hdf( + "file.h5", "df_with_missing", format="table", mode="w", dropna=True + ) + pd.read_hdf("file.h5", "df_with_missing") .. ipython:: python :suppress: - os.remove('file.h5') + os.remove("file.h5") .. _io.hdf5-fixed: @@ -3575,8 +3642,8 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. code-block:: python - >>> pd.DataFrame(np.random.randn(10, 2)).to_hdf('test_fixed.h5', 'df') - >>> pd.read_hdf('test_fixed.h5', 'df', where='index>5') + >>> pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", "df") + >>> pd.read_hdf("test_fixed.h5", "df", where="index>5") TypeError: cannot pass a where specification when reading a fixed format. this store must be selected in its entirety @@ -3600,21 +3667,21 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format. :suppress: :okexcept: - os.remove('store.h5') + os.remove("store.h5") .. ipython:: python - store = pd.HDFStore('store.h5') + store = pd.HDFStore("store.h5") df1 = df[0:4] df2 = df[4:] # append data (creates a table automatically) - store.append('df', df1) - store.append('df', df2) + store.append("df", df1) + store.append("df", df2) store # select the entire object - store.select('df') + store.select("df") # the type of stored data store.root.df._v_attrs.pandas_type @@ -3637,16 +3704,16 @@ everything in the sub-store and **below**, so be *careful*. .. 
ipython:: python - store.put('foo/bar/bah', df) - store.append('food/orange', df) - store.append('food/apple', df) + store.put("foo/bar/bah", df) + store.append("food/orange", df) + store.append("food/apple", df) store # a list of keys are returned store.keys() # remove all nodes under this level - store.remove('food') + store.remove("food") store @@ -3660,10 +3727,10 @@ will yield a tuple for each group key along with the relative keys of its conten for (path, subgroups, subkeys) in store.walk(): for subgroup in subgroups: - print('GROUP: {}/{}'.format(path, subgroup)) + print("GROUP: {}/{}".format(path, subgroup)) for subkey in subkeys: - key = '/'.join([path, subkey]) - print('KEY: {}'.format(key)) + key = "/".join([path, subkey]) + print("KEY: {}".format(key)) print(store.get(key)) @@ -3687,7 +3754,7 @@ will yield a tuple for each group key along with the relative keys of its conten .. ipython:: python - store['foo/bar/bah'] + store["foo/bar/bah"] .. _io.hdf5-types: @@ -3706,24 +3773,27 @@ Passing ``min_itemsize={`values`: size}`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools, datetime64`` are currently supported. For string columns, passing ``nan_rep = 'nan'`` to append will change the default -nan representation on disk (which converts to/from `np.nan`), this -defaults to `nan`. - -.. ipython:: python - - df_mixed = pd.DataFrame({'A': np.random.randn(8), - 'B': np.random.randn(8), - 'C': np.array(np.random.randn(8), dtype='float32'), - 'string': 'string', - 'int': 1, - 'bool': True, - 'datetime64': pd.Timestamp('20010102')}, - index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], - ['A', 'B', 'string', 'datetime64']] = np.nan - - store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) - df_mixed1 = store.select('df_mixed') +nan representation on disk (which converts to/from ``np.nan``), this +defaults to ``nan``. + +.. ipython:: python + + df_mixed = pd.DataFrame( + { + "A": np.random.randn(8), + "B": np.random.randn(8), + "C": np.array(np.random.randn(8), dtype="float32"), + "string": "string", + "int": 1, + "bool": True, + "datetime64": pd.Timestamp("20010102"), + }, + index=list(range(8)), + ) + df_mixed.loc[df_mixed.index[3:5], ["A", "B", "string", "datetime64"]] = np.nan + + store.append("df_mixed", df_mixed, min_itemsize={"values": 50}) + df_mixed1 = store.select("df_mixed") df_mixed1 df_mixed1.dtypes.value_counts() @@ -3738,20 +3808,19 @@ storing/selecting from homogeneous index ``DataFrames``. .. ipython:: python - index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) df_mi - store.append('df_mi', df_mi) - store.select('df_mi') + store.append("df_mi", df_mi) + store.select("df_mi") # the levels are automatically included as data columns - store.select('df_mi', 'foo=bar') + store.select("df_mi", "foo=bar") .. note:: The ``index`` keyword is reserved and cannot be use as a level name. @@ -3828,7 +3897,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: .. 
code-block:: python string = "HolyMoly'" - store.select('df', 'index == string') + store.select("df", "index == string") instead of this @@ -3845,7 +3914,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: .. code-block:: python - store.select('df', 'index == %r' % string) + store.select("df", "index == %r" % string) which will quote ``string``. @@ -3854,21 +3923,24 @@ Here are some examples: .. ipython:: python - dfq = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'), - index=pd.date_range('20130101', periods=10)) - store.append('dfq', dfq, format='table', data_columns=True) + dfq = pd.DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=pd.date_range("20130101", periods=10), + ) + store.append("dfq", dfq, format="table", data_columns=True) Use boolean expressions, with in-line function evaluation. .. ipython:: python - store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']") + store.select("dfq", "index>pd.Timestamp('20130104') & columns=['A', 'B']") Use inline column reference. .. ipython:: python - store.select('dfq', where="A>0 or C>0") + store.select("dfq", where="A>0 or C>0") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3876,7 +3948,7 @@ returned, this is equivalent to passing a .. ipython:: python - store.select('df', "columns=['A', 'B']") + store.select("df", "columns=['A', 'B']") ``start`` and ``stop`` parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. @@ -3902,14 +3974,20 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), - 'B': [pd.Timestamp('20130101') + timedelta(days=i, - seconds=10) - for i in range(10)]}) - dftd['C'] = dftd['A'] - dftd['B'] + + dftd = pd.DataFrame( + { + "A": pd.Timestamp("20130101"), + "B": [ + pd.Timestamp("20130101") + timedelta(days=i, seconds=10) + for i in range(10) + ], + } + ) + dftd["C"] = dftd["A"] - dftd["B"] dftd - store.append('dftd', dftd, data_columns=True) - store.select('dftd', "C<'-3.5D'") + store.append("dftd", dftd, data_columns=True) + store.select("dftd", "C<'-3.5D'") .. _io.query_multi: @@ -3921,7 +3999,7 @@ Selecting from a ``MultiIndex`` can be achieved by using the name of the level. .. ipython:: python df_mi.index.names - store.select('df_mi', "foo=baz and bar=two") + store.select("df_mi", "foo=baz and bar=two") If the ``MultiIndex`` levels names are ``None``, the levels are automatically made available via the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to select from. @@ -3932,8 +4010,7 @@ the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to s levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], ) - df_mi_2 = pd.DataFrame(np.random.randn(10, 3), - index=index, columns=["A", "B", "C"]) + df_mi_2 = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) df_mi_2 store.append("df_mi_2", df_mi_2) @@ -3964,7 +4041,7 @@ indexed dimension as the ``where``. 
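# optlevel (0-9) and kind ("ultralight" through "full") show how
# aggressively PyTables optimized this index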
i.optlevel, i.kind # change an index by passing new parameters - store.create_table_index('df', optlevel=9, kind='full') + store.create_table_index("df", optlevel=9, kind="full") i = store.root.df.table.cols.index.index i.optlevel, i.kind @@ -3972,20 +4049,20 @@ Oftentimes when appending large amounts of data to a store, it is useful to turn .. ipython:: python - df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) - df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) + df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) + df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) - st = pd.HDFStore('appends.h5', mode='w') - st.append('df', df_1, data_columns=['B'], index=False) - st.append('df', df_2, data_columns=['B'], index=False) - st.get_storer('df').table + st = pd.HDFStore("appends.h5", mode="w") + st.append("df", df_1, data_columns=["B"], index=False) + st.append("df", df_2, data_columns=["B"], index=False) + st.get_storer("df").table Then create the index when finished appending. .. ipython:: python - st.create_table_index('df', columns=['B'], optlevel=9, kind='full') - st.get_storer('df').table + st.create_table_index("df", columns=["B"], optlevel=9, kind="full") + st.get_storer("df").table st.close() @@ -3993,7 +4070,7 @@ Then create the index when finished appending. :suppress: :okexcept: - os.remove('appends.h5') + os.remove("appends.h5") See `here `__ for how to create a completely-sorted-index (CSI) on an existing store. @@ -4003,7 +4080,7 @@ Query via data columns ++++++++++++++++++++++ You can designate (and index) certain columns that you want to be able -to perform queries (other than the `indexable` columns, which you can +to perform queries (other than the ``indexable`` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to @@ -4012,29 +4089,29 @@ be ``data_columns``. .. ipython:: python df_dc = df.copy() - df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4:6], 'string'] = np.nan - df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' - df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 + df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc["string2"] = "cool" + df_dc.loc[df_dc.index[1:3], ["B", "C"]] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) - store.select('df_dc', where='B > 0') + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + store.select("df_dc", where="B > 0") # getting creative - store.select('df_dc', 'B > 0 & C > 0 & string == foo') + store.select("df_dc", "B > 0 & C > 0 & string == foo") # this is in-memory version of this type of selection - df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] # we have automagically created this index and the B/C/string/string2 # columns are stored separately as ``PyTables`` columns store.root.df_dc.table There is some performance degradation by making lots of columns into -`data columns`, so it is up to the user to designate these. In addition, +``data columns``, so it is up to the user to designate these. 
In addition, you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and create a new table!). @@ -4048,7 +4125,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python - for df in store.select('df', chunksize=3): + for df in store.select("df", chunksize=3): print(df) .. note:: @@ -4058,7 +4135,7 @@ The default is 50,000 rows returned in a chunk. .. code-block:: python - for df in pd.read_hdf('store.h5', 'df', chunksize=3): + for df in pd.read_hdf("store.h5", "df", chunksize=3): print(df) Note, that the chunksize keyword applies to the **source** rows. So if you @@ -4070,18 +4147,20 @@ chunks. .. ipython:: python - dfeq = pd.DataFrame({'number': np.arange(1, 11)}) + dfeq = pd.DataFrame({"number": np.arange(1, 11)}) dfeq - store.append('dfeq', dfeq, data_columns=['number']) + store.append("dfeq", dfeq, data_columns=["number"]) + def chunks(l, n): - return [l[i:i + n] for i in range(0, len(l), n)] + return [l[i: i + n] for i in range(0, len(l), n)] + evens = [2, 4, 6, 8, 10] - coordinates = store.select_as_coordinates('dfeq', 'number=evens') + coordinates = store.select_as_coordinates("dfeq", "number=evens") for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select("dfeq", where=c)) Advanced queries ++++++++++++++++ @@ -4096,8 +4175,8 @@ These do not currently accept the ``where`` selector. .. ipython:: python - store.select_column('df_dc', 'index') - store.select_column('df_dc', 'string') + store.select_column("df_dc", "index") + store.select_column("df_dc", "string") .. _io.hdf5-selecting_coordinates: @@ -4110,12 +4189,13 @@ Sometimes you want to get the coordinates (a.k.a the index locations) of your qu .. ipython:: python - df_coord = pd.DataFrame(np.random.randn(1000, 2), - index=pd.date_range('20000101', periods=1000)) - store.append('df_coord', df_coord) - c = store.select_as_coordinates('df_coord', 'index > 20020101') + df_coord = pd.DataFrame( + np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000) + ) + store.append("df_coord", df_coord) + c = store.select_as_coordinates("df_coord", "index > 20020101") c - store.select('df_coord', where=c) + store.select("df_coord", where=c) .. _io.hdf5-where_mask: @@ -4128,12 +4208,13 @@ a datetimeindex which are 5. .. ipython:: python - df_mask = pd.DataFrame(np.random.randn(1000, 2), - index=pd.date_range('20000101', periods=1000)) - store.append('df_mask', df_mask) - c = store.select_column('df_mask', 'index') + df_mask = pd.DataFrame( + np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000) + ) + store.append("df_mask", df_mask) + c = store.select_column("df_mask", "index") where = c[pd.DatetimeIndex(c).month == 5].index - store.select('df_mask', where=where) + store.select("df_mask", where=where) Storer object ^^^^^^^^^^^^^ @@ -4144,7 +4225,7 @@ of rows in an object. .. ipython:: python - store.get_storer('df_dc').nrows + store.get_storer("df_dc").nrows Multiple table queries @@ -4161,7 +4242,7 @@ having a very wide table, but enables more efficient queries. The ``append_to_multiple`` method splits a given single DataFrame into multiple tables according to ``d``, a dictionary that maps the -table names to a list of 'columns' you want in that table. If `None` +table names to a list of 'columns' you want in that table. If ``None`` is used in place of a list, that table will have the remaining unspecified columns of the given DataFrame. 
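For concreteness, the mapping ``d`` might look like the following sketch (the table names are hypothetical): the columns you expect to query go into one table, and ``None`` routes every remaining column to the other.

.. code-block:: python

   # hypothetical split: query columns in one table, the rest in another
   d = {"df1_mt": ["A", "B"], "df2_mt": None}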
The argument ``selector`` defines which table is the selector table (which you can make queries from). @@ -4177,24 +4258,30 @@ results. .. ipython:: python - df_mt = pd.DataFrame(np.random.randn(8, 6), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) - df_mt['foo'] = 'bar' - df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan + df_mt = pd.DataFrame( + np.random.randn(8, 6), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C", "D", "E", "F"], + ) + df_mt["foo"] = "bar" + df_mt.loc[df_mt.index[1], ("A", "B")] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, - df_mt, selector='df1_mt') + store.append_to_multiple( + {"df1_mt": ["A", "B"], "df2_mt": None}, df_mt, selector="df1_mt" + ) store # individual tables were created - store.select('df1_mt') - store.select('df2_mt') + store.select("df1_mt") + store.select("df2_mt") # as a multiple - store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector='df1_mt') + store.select_as_multiple( + ["df1_mt", "df2_mt"], + where=["A>0", "B>0"], + selector="df1_mt", + ) Delete from a table @@ -4303,14 +4390,15 @@ Enable compression for all objects within the file: .. code-block:: python - store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, - complib='blosc:blosclz') + store_compressed = pd.HDFStore( + "store_compressed.h5", complevel=9, complib="blosc:blosclz" + ) Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled: .. code-block:: python - store.append('df', df, complib='zlib', complevel=5) + store.append("df", df, complib="zlib", complevel=5) .. _io.hdf5-ptrepack: @@ -4399,13 +4487,14 @@ stored in a more efficient manner. .. ipython:: python - dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8)}) + dfcat = pd.DataFrame( + {"A": pd.Series(list("aabbcdba")).astype("category"), "B": np.random.randn(8)} + ) dfcat dfcat.dtypes - cstore = pd.HDFStore('cats.h5', mode='w') - cstore.append('dfcat', dfcat, format='table', data_columns=['A']) - result = cstore.select('dfcat', where="A in ['b', 'c']") + cstore = pd.HDFStore("cats.h5", mode="w") + cstore.append("dfcat", dfcat, format="table", data_columns=["A"]) + result = cstore.select("dfcat", where="A in ['b', 'c']") result result.dtypes @@ -4414,7 +4503,7 @@ stored in a more efficient manner. :okexcept: cstore.close() - os.remove('cats.h5') + os.remove("cats.h5") String columns @@ -4441,17 +4530,17 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. ipython:: python - dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) + dfs = pd.DataFrame({"A": "foo", "B": "bar"}, index=list(range(5))) dfs # A and B have a size of 30 - store.append('dfs', dfs, min_itemsize=30) - store.get_storer('dfs').table + store.append("dfs", dfs, min_itemsize=30) + store.get_storer("dfs").table # A is created as a data_column with a size of 30 # B is size is calculated - store.append('dfs2', dfs, min_itemsize={'A': 30}) - store.get_storer('dfs2').table + store.append("dfs2", dfs, min_itemsize={"A": 30}) + store.get_storer("dfs2").table **nan_rep** @@ -4460,15 +4549,15 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. 
ipython:: python - dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) + dfss = pd.DataFrame({"A": ["foo", "bar", "nan"]}) dfss - store.append('dfss', dfss) - store.select('dfss') + store.append("dfss", dfss) + store.select("dfss") # here you need to specify a different nan rep - store.append('dfss2', dfss, nan_rep='_nan_') - store.select('dfss2') + store.append("dfss2", dfss, nan_rep="_nan_") + store.select("dfss2") .. _io.external_compatibility: @@ -4487,21 +4576,25 @@ It is possible to write an ``HDFStore`` object that can easily be imported into .. ipython:: python - df_for_r = pd.DataFrame({"first": np.random.rand(100), - "second": np.random.rand(100), - "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + df_for_r = pd.DataFrame( + { + "first": np.random.rand(100), + "second": np.random.rand(100), + "class": np.random.randint(0, 2, (100,)), + }, + index=range(100), + ) df_for_r.head() - store_export = pd.HDFStore('export.h5') - store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns) + store_export = pd.HDFStore("export.h5") + store_export.append("df_for_r", df_for_r, data_columns=df_dc.columns) store_export .. ipython:: python :suppress: store_export.close() - os.remove('export.h5') + os.remove("export.h5") In R this file can be read into a ``data.frame`` object using the ``rhdf5`` library. The following example function reads the corresponding column names @@ -4588,7 +4681,7 @@ Performance :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") .. _io.feather: @@ -4618,21 +4711,26 @@ See the `Full Documentation `__. :suppress: import warnings + # This can be removed once building with pyarrow >=0.15.0 warnings.filterwarnings("ignore", "The Sparse", FutureWarning) .. ipython:: python - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'i': pd.date_range('20130101', periods=3, freq='ns')}) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, freq="ns"), + } + ) df df.dtypes @@ -4641,13 +4739,13 @@ Write to a feather file. .. ipython:: python - df.to_feather('example.feather') + df.to_feather("example.feather") Read from a feather file. .. ipython:: python - result = pd.read_feather('example.feather') + result = pd.read_feather("example.feather") result # we preserve dtypes @@ -4656,7 +4754,7 @@ Read from a feather file. .. ipython:: python :suppress: - os.remove('example.feather') + os.remove("example.feather") .. _io.parquet: @@ -4676,7 +4774,7 @@ Several caveats. * Duplicate column names and non-string columns names are not supported. * The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default - indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can + indexes. This extra column can cause problems for non-pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. 
* Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. @@ -4701,15 +4799,19 @@ See the documentation for `pyarrow `__ an .. ipython:: python - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'h': pd.Categorical(list('abc')), - 'i': pd.Categorical(list('abc'), ordered=True)}) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.Categorical(list("abc")), + "i": pd.Categorical(list("abc"), ordered=True), + } + ) df df.dtypes @@ -4719,15 +4821,15 @@ Write to a parquet file. .. ipython:: python :okwarning: - df.to_parquet('example_pa.parquet', engine='pyarrow') - df.to_parquet('example_fp.parquet', engine='fastparquet') + df.to_parquet("example_pa.parquet", engine="pyarrow") + df.to_parquet("example_fp.parquet", engine="fastparquet") Read from a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', engine='fastparquet') - result = pd.read_parquet('example_pa.parquet', engine='pyarrow') + result = pd.read_parquet("example_fp.parquet", engine="fastparquet") + result = pd.read_parquet("example_pa.parquet", engine="pyarrow") result.dtypes @@ -4735,18 +4837,24 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', - engine='fastparquet', columns=['a', 'b']) - result = pd.read_parquet('example_pa.parquet', - engine='pyarrow', columns=['a', 'b']) + result = pd.read_parquet( + "example_fp.parquet", + engine="fastparquet", + columns=["a", "b"], + ) + result = pd.read_parquet( + "example_pa.parquet", + engine="pyarrow", + columns=["a", "b"], + ) result.dtypes .. ipython:: python :suppress: - os.remove('example_pa.parquet') - os.remove('example_fp.parquet') + os.remove("example_pa.parquet") + os.remove("example_fp.parquet") Handling indexes @@ -4757,8 +4865,8 @@ more columns in the output file. Thus, this code: .. ipython:: python - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - df.to_parquet('test.parquet', engine='pyarrow') + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.to_parquet("test.parquet", engine="pyarrow") creates a parquet file with *three* columns if you use ``pyarrow`` for serialization: ``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the @@ -4773,7 +4881,7 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to .. ipython:: python - df.to_parquet('test.parquet', index=False) + df.to_parquet("test.parquet", index=False) This creates a parquet file with just the two expected columns, ``a`` and ``b``. If your ``DataFrame`` has a custom index, you won't get it back when you load @@ -4785,7 +4893,7 @@ underlying engine's default behavior. .. ipython:: python :suppress: - os.remove('test.parquet') + os.remove("test.parquet") Partitioning Parquet files @@ -4797,12 +4905,11 @@ Parquet supports partitioning of data based on the values of one or more columns .. 
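Reading a partitioned dataset back is the mirror image of the write shown next; a minimal sketch, assuming a ``test`` directory written with ``partition_cols=["a"]``:

.. code-block:: python

   # reassemble the partitions into a single frame; the partition column
   # is recovered from the directory names
   result = pd.read_parquet("test", engine="pyarrow")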
ipython:: python - df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(path='test', engine='pyarrow', - partition_cols=['a'], compression=None) + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}) + df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None) -The `path` specifies the parent directory to which data will be saved. -The `partition_cols` are the column names by which the dataset will be partitioned. +The ``path`` specifies the parent directory to which data will be saved. +The ``partition_cols`` are the column names by which the dataset will be partitioned. Columns are partitioned in the order they are given. The partition splits are determined by the unique values in the partition columns. The above example creates a partitioned dataset that may look like: @@ -4821,8 +4928,9 @@ The above example creates a partitioned dataset that may look like: :suppress: from shutil import rmtree + try: - rmtree('test') + rmtree("test") except OSError: pass @@ -4834,7 +4942,7 @@ ORC .. versionadded:: 1.0.0 Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization -for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the +for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. .. _io.sql: @@ -4890,15 +4998,16 @@ below and the SQLAlchemy `documentation / # where is relative: - engine = create_engine('sqlite:///foo.db') + engine = create_engine("sqlite:///foo.db") # or absolute, starting with a slash: - engine = create_engine('sqlite:////absolute/path/to/foo.db') + engine = create_engine("sqlite:////absolute/path/to/foo.db") For more information see the examples the SQLAlchemy `documentation `__ @@ -5215,21 +5332,25 @@ Use :func:`sqlalchemy.text` to specify query parameters in a backend-neutral way .. ipython:: python import sqlalchemy as sa - pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'), - engine, params={'col1': 'X'}) + + pd.read_sql( + sa.text("SELECT * FROM data where Col_1=:col1"), engine, params={"col1": "X"} + ) If you have an SQLAlchemy description of your database you can express where conditions using SQLAlchemy expressions .. ipython:: python metadata = sa.MetaData() - data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + data_table = sa.Table( + "data", + metadata, + sa.Column("index", sa.Integer), + sa.Column("Date", sa.DateTime), + sa.Column("Col_1", sa.String), + sa.Column("Col_2", sa.Float), + sa.Column("Col_3", sa.Boolean), + ) pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine) @@ -5238,8 +5359,9 @@ You can combine SQLAlchemy expressions with parameters passed to :func:`read_sql .. ipython:: python import datetime as dt - expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam('date')) - pd.read_sql(expr, engine, params={'date': dt.datetime(2010, 10, 18)}) + + expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam("date")) + pd.read_sql(expr, engine, params={"date": dt.datetime(2010, 10, 18)}) Sqlite fallback @@ -5254,13 +5376,14 @@ You can create connections like so: .. 
code-block:: python import sqlite3 - con = sqlite3.connect(':memory:') + + con = sqlite3.connect(":memory:") And then issue the following queries: .. code-block:: python - data.to_sql('data', con) + data.to_sql("data", con) pd.read_sql_query("SELECT * FROM data", con) @@ -5297,8 +5420,8 @@ into a .dta file. The format version of this file is always 115 (Stata 12). .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) - df.to_stata('stata.dta') + df = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.to_stata("stata.dta") *Stata* data files have limited data type support; only strings with 244 or fewer characters, ``int8``, ``int16``, ``int32``, ``float32`` @@ -5348,7 +5471,7 @@ be used to read the file incrementally. .. ipython:: python - pd.read_stata('stata.dta') + pd.read_stata("stata.dta") Specifying a ``chunksize`` yields a :class:`~pandas.io.stata.StataReader` instance that can be used to @@ -5357,9 +5480,9 @@ object can be used as an iterator. .. ipython:: python - reader = pd.read_stata('stata.dta', chunksize=3) - for df in reader: - print(df.shape) + with pd.read_stata("stata.dta", chunksize=3) as reader: + for df in reader: + print(df.shape) For more fine-grained control, use ``iterator=True`` and specify ``chunksize`` with each call to @@ -5367,9 +5490,9 @@ For more fine-grained control, use ``iterator=True`` and specify .. ipython:: python - reader = pd.read_stata('stata.dta', iterator=True) - chunk1 = reader.read(5) - chunk2 = reader.read(5) + with pd.read_stata("stata.dta", iterator=True) as reader: + chunk1 = reader.read(5) + chunk2 = reader.read(5) Currently the ``index`` is retrieved as a column. @@ -5399,7 +5522,7 @@ values will have ``object`` data type. .. ipython:: python :suppress: - os.remove('stata.dta') + os.remove("stata.dta") .. _io.stata-categorical: @@ -5453,7 +5576,7 @@ SAS formats ----------- The top-level function :func:`read_sas` can read (but not write) SAS -`xport` (.XPT) and (since *v0.18.0*) `SAS7BDAT` (.sas7bdat) format files. +XPORT (.xpt) and (since *v0.18.0*) SAS7BDAT (.sas7bdat) format files. SAS files only contain two value types: ASCII text and floating point values (usually 8 bytes but sometimes truncated). For xport files, @@ -5471,7 +5594,7 @@ Read a SAS7BDAT file: .. code-block:: python - df = pd.read_sas('sas_data.sas7bdat') + df = pd.read_sas("sas_data.sas7bdat") Obtain an iterator and read an XPORT file 100,000 lines at a time: @@ -5480,9 +5603,10 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: def do_something(chunk): pass - rdr = pd.read_sas('sas_xport.xpt', chunk=100000) - for chunk in rdr: - do_something(chunk) + + with pd.read_sas("sas_xport.xpt", chunk=100000) as rdr: + for chunk in rdr: + do_something(chunk) The specification_ for the xport file format is available from the SAS web site. @@ -5501,7 +5625,7 @@ SPSS formats .. versionadded:: 0.25.0 The top-level function :func:`read_spss` can read (but not write) SPSS -`sav` (.sav) and `zsav` (.zsav) format files. +SAV (.sav) and ZSAV (.zsav) format files. SPSS files contain column names. By default the whole file is read, categorical columns are converted into ``pd.Categorical``, @@ -5514,17 +5638,20 @@ Read an SPSS file: .. code-block:: python - df = pd.read_spss('spss_data.sav') + df = pd.read_spss("spss_data.sav") Extract a subset of columns contained in ``usecols`` from an SPSS file and avoid converting categorical columns into ``pd.Categorical``: .. 
code-block:: python - df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'], - convert_categoricals=False) + df = pd.read_spss( + "spss_data.sav", + usecols=["foo", "bar"], + convert_categoricals=False, + ) -More information about the `sav` and `zsav` file format is available here_. +More information about the SAV and ZSAV file formats is available here_. .. _here: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm @@ -5569,7 +5696,7 @@ ignored. dtypes: float64(1), int64(1) memory usage: 15.3 MB -Given the next test set: +The following test functions will be used below to compare the performance of several IO methods: .. code-block:: python @@ -5580,80 +5707,101 @@ Given the next test set: import os sz = 1000000 - df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({"A": np.random.randn(sz), "B": [1] * sz}) sz = 1000000 np.random.seed(42) - df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({"A": np.random.randn(sz), "B": [1] * sz}) + def test_sql_write(df): - if os.path.exists('test.sql'): - os.remove('test.sql') - sql_db = sqlite3.connect('test.sql') - df.to_sql(name='test_table', con=sql_db) + if os.path.exists("test.sql"): + os.remove("test.sql") + sql_db = sqlite3.connect("test.sql") + df.to_sql(name="test_table", con=sql_db) sql_db.close() + def test_sql_read(): - sql_db = sqlite3.connect('test.sql') + sql_db = sqlite3.connect("test.sql") pd.read_sql_query("select * from test_table", sql_db) sql_db.close() + def test_hdf_fixed_write(df): - df.to_hdf('test_fixed.hdf', 'test', mode='w') + df.to_hdf("test_fixed.hdf", "test", mode="w") + def test_hdf_fixed_read(): - pd.read_hdf('test_fixed.hdf', 'test') + pd.read_hdf("test_fixed.hdf", "test") + def test_hdf_fixed_write_compress(df): - df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') + df.to_hdf("test_fixed_compress.hdf", "test", mode="w", complib="blosc") + def test_hdf_fixed_read_compress(): - pd.read_hdf('test_fixed_compress.hdf', 'test') + pd.read_hdf("test_fixed_compress.hdf", "test") + def test_hdf_table_write(df): - df.to_hdf('test_table.hdf', 'test', mode='w', format='table') + df.to_hdf("test_table.hdf", "test", mode="w", format="table") + def test_hdf_table_read(): - pd.read_hdf('test_table.hdf', 'test') + pd.read_hdf("test_table.hdf", "test") + def test_hdf_table_write_compress(df): - df.to_hdf('test_table_compress.hdf', 'test', mode='w', - complib='blosc', format='table') + df.to_hdf( + "test_table_compress.hdf", "test", mode="w", complib="blosc", format="table" + ) + def test_hdf_table_read_compress(): - pd.read_hdf('test_table_compress.hdf', 'test') + pd.read_hdf("test_table_compress.hdf", "test") + def test_csv_write(df): - df.to_csv('test.csv', mode='w') + df.to_csv("test.csv", mode="w") + def test_csv_read(): - pd.read_csv('test.csv', index_col=0) + pd.read_csv("test.csv", index_col=0) + def test_feather_write(df): - df.to_feather('test.feather') + df.to_feather("test.feather") + def test_feather_read(): - pd.read_feather('test.feather') + pd.read_feather("test.feather") + def test_pickle_write(df): - df.to_pickle('test.pkl') + df.to_pickle("test.pkl") + def test_pickle_read(): - pd.read_pickle('test.pkl') + pd.read_pickle("test.pkl") + def test_pickle_write_compress(df): - df.to_pickle('test.pkl.compress', compression='xz') + df.to_pickle("test.pkl.compress", compression="xz") + def test_pickle_read_compress(): - pd.read_pickle('test.pkl.compress', compression='xz') + 
pd.read_pickle("test.pkl.compress", compression="xz") + def test_parquet_write(df): - df.to_parquet('test.parquet') + df.to_parquet("test.parquet") + def test_parquet_read(): - pd.read_parquet('test.parquet') + pd.read_parquet("test.parquet") -When writing, the top-three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. +When writing, the top three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. .. code-block:: ipython @@ -5687,7 +5835,7 @@ When writing, the top-three functions in terms of speed are ``test_feather_write In [13]: %timeit test_parquet_write(df) 67.6 ms ± 706 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) -When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and +When reading, the top three functions in terms of speed are ``test_feather_read``, ``test_pickle_read`` and ``test_hdf_fixed_read``. @@ -5724,8 +5872,7 @@ When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and 24.4 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) -For this test case ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk. -Space on disk (in bytes) +The files ``test.pkl.compress``, ``test.parquet`` and ``test.feather`` took the least space on disk (in bytes). .. code-block:: none diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 0639e4a7bb5e4..d8998a9a0a6e1 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -7,6 +7,7 @@ from matplotlib import pyplot as plt import pandas.util._doctools as doctools + p = doctools.TablePlotter() @@ -38,23 +39,35 @@ a simple example: .. 
ipython:: python - df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=[0, 1, 2, 3]) + df1 = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + }, + index=[0, 1, 2, 3], + ) - df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], - 'B': ['B4', 'B5', 'B6', 'B7'], - 'C': ['C4', 'C5', 'C6', 'C7'], - 'D': ['D4', 'D5', 'D6', 'D7']}, - index=[4, 5, 6, 7]) + df2 = pd.DataFrame( + { + "A": ["A4", "A5", "A6", "A7"], + "B": ["B4", "B5", "B6", "B7"], + "C": ["C4", "C5", "C6", "C7"], + "D": ["D4", "D5", "D6", "D7"], + }, + index=[4, 5, 6, 7], + ) - df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'], - 'B': ['B8', 'B9', 'B10', 'B11'], - 'C': ['C8', 'C9', 'C10', 'C11'], - 'D': ['D8', 'D9', 'D10', 'D11']}, - index=[8, 9, 10, 11]) + df3 = pd.DataFrame( + { + "A": ["A8", "A9", "A10", "A11"], + "B": ["B8", "B9", "B10", "B11"], + "C": ["C8", "C9", "C10", "C11"], + "D": ["D8", "D9", "D10", "D11"], + }, + index=[8, 9, 10, 11], + ) frames = [df1, df2, df3] result = pd.concat(frames) @@ -63,9 +76,8 @@ a simple example: :suppress: @savefig merging_concat_basic.png - p.plot(frames, result, - labels=['df1', 'df2', 'df3'], vertical=True); - plt.close('all'); + p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True); + plt.close("all"); Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat`` takes a list or dict of homogeneously-typed objects and concatenates them with @@ -73,11 +85,20 @@ some configurable handling of "what to do with the other axes": :: - pd.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, - levels=None, names=None, verify_integrity=False, copy=True) + pd.concat( + objs, + axis=0, + join="outer", + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + copy=True, + ) * ``objs`` : a sequence or mapping of Series or DataFrame objects. If a - dict is passed, the sorted keys will be used as the `keys` argument, unless + dict is passed, the sorted keys will be used as the ``keys`` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. @@ -109,15 +130,14 @@ with each of the pieces of the chopped up DataFrame. We can do this using the .. ipython:: python - result = pd.concat(frames, keys=['x', 'y', 'z']) + result = pd.concat(frames, keys=["x", "y", "z"]) .. ipython:: python :suppress: @savefig merging_concat_keys.png - p.plot(frames, result, - labels=['df1', 'df2', 'df3'], vertical=True) - plt.close('all'); + p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True) + plt.close("all"); As you can see (if you've read the rest of the documentation), the resulting object's index has a :ref:`hierarchical index `. This @@ -125,7 +145,7 @@ means that we can now select out each chunk by key: .. ipython:: python - result.loc['y'] + result.loc["y"] It's not a stretch to see how this can be very useful. More detail on this functionality below. @@ -141,6 +161,14 @@ functionality below. frames = [ process_your_file(f) for f in files ] result = pd.concat(frames) +.. note:: + + When concatenating DataFrames with named axes, pandas will attempt to preserve + these index/column names whenever possible. 
In the case where all inputs share a + common name, this name will be assigned to the result. When the input names do + not all agree, the result will be unnamed. The same is true for :class:`MultiIndex`, + but the logic is applied separately on a level-by-level basis. + Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -158,43 +186,36 @@ behavior: .. ipython:: python - df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], - 'D': ['D2', 'D3', 'D6', 'D7'], - 'F': ['F2', 'F3', 'F6', 'F7']}, - index=[2, 3, 6, 7]) - result = pd.concat([df1, df4], axis=1, sort=False) + df4 = pd.DataFrame( + { + "B": ["B2", "B3", "B6", "B7"], + "D": ["D2", "D3", "D6", "D7"], + "F": ["F2", "F3", "F6", "F7"], + }, + index=[2, 3, 6, 7], + ) + result = pd.concat([df1, df4], axis=1) .. ipython:: python :suppress: @savefig merging_concat_axis1.png - p.plot([df1, df4], result, - labels=['df1', 'df4'], vertical=False); - plt.close('all'); - -.. warning:: - - .. versionchanged:: 0.23.0 - - The default behavior with ``join='outer'`` is to sort the other axis - (columns in this case). In a future version of pandas, the default will - be to not sort. We specified ``sort=False`` to opt in to the new - behavior now. + p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); + plt.close("all"); Here is the same thing with ``join='inner'``: .. ipython:: python - result = pd.concat([df1, df4], axis=1, join='inner') + result = pd.concat([df1, df4], axis=1, join="inner") .. ipython:: python :suppress: @savefig merging_concat_axis1_inner.png - p.plot([df1, df4], result, - labels=['df1', 'df4'], vertical=False); - plt.close('all'); + p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); + plt.close("all"); Lastly, suppose we just wanted to reuse the *exact index* from the original DataFrame: @@ -213,9 +234,8 @@ Similarly, we could index before the concatenation: :suppress: @savefig merging_concat_axis1_join_axes.png - p.plot([df1, df4], result, - labels=['df1', 'df4'], vertical=False); - plt.close('all'); + p.plot([df1, df4], result, labels=["df1", "df4"], vertical=False); + plt.close("all"); .. _merging.concatenation: @@ -234,9 +254,8 @@ instance methods on ``Series`` and ``DataFrame``. These methods actually predate :suppress: @savefig merging_append1.png - p.plot([df1, df2], result, - labels=['df1', 'df2'], vertical=True); - plt.close('all'); + p.plot([df1, df2], result, labels=["df1", "df2"], vertical=True); + plt.close("all"); In the case of ``DataFrame``, the indexes must be disjoint but the columns do not need to be: @@ -249,9 +268,8 @@ need to be: :suppress: @savefig merging_append2.png - p.plot([df1, df4], result, - labels=['df1', 'df4'], vertical=True); - plt.close('all'); + p.plot([df1, df4], result, labels=["df1", "df4"], vertical=True); + plt.close("all"); ``append`` may take multiple objects to concatenate: @@ -263,9 +281,8 @@ need to be: :suppress: @savefig merging_append3.png - p.plot([df1, df2, df3], result, - labels=['df1', 'df2', 'df3'], vertical=True); - plt.close('all'); + p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True); + plt.close("all"); .. 
note:: @@ -289,9 +306,8 @@ do this, use the ``ignore_index`` argument: :suppress: @savefig merging_concat_ignore_index.png - p.plot([df1, df4], result, - labels=['df1', 'df4'], vertical=True); - plt.close('all'); + p.plot([df1, df4], result, labels=["df1", "df4"], vertical=True); + plt.close("all"); This is also a valid argument to :meth:`DataFrame.append`: @@ -303,9 +319,8 @@ This is also a valid argument to :meth:`DataFrame.append`: :suppress: @savefig merging_append_ignore_index.png - p.plot([df1, df4], result, - labels=['df1', 'df4'], vertical=True); - plt.close('all'); + p.plot([df1, df4], result, labels=["df1", "df4"], vertical=True); + plt.close("all"); .. _merging.mixed_ndims: @@ -318,16 +333,15 @@ the name of the ``Series``. .. ipython:: python - s1 = pd.Series(['X0', 'X1', 'X2', 'X3'], name='X') + s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X") result = pd.concat([df1, s1], axis=1) .. ipython:: python :suppress: @savefig merging_concat_mixed_ndim.png - p.plot([df1, s1], result, - labels=['df1', 's1'], vertical=False); - plt.close('all'); + p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False); + plt.close("all"); .. note:: @@ -340,16 +354,15 @@ If unnamed ``Series`` are passed they will be numbered consecutively. .. ipython:: python - s2 = pd.Series(['_0', '_1', '_2', '_3']) + s2 = pd.Series(["_0", "_1", "_2", "_3"]) result = pd.concat([df1, s2, s2, s2], axis=1) .. ipython:: python :suppress: @savefig merging_concat_unnamed_series.png - p.plot([df1, s2], result, - labels=['df1', 's2'], vertical=False); - plt.close('all'); + p.plot([df1, s2], result, labels=["df1", "s2"], vertical=False); + plt.close("all"); Passing ``ignore_index=True`` will drop all name references. @@ -361,9 +374,8 @@ Passing ``ignore_index=True`` will drop all name references. :suppress: @savefig merging_concat_series_ignore_index.png - p.plot([df1, s1], result, - labels=['df1', 's1'], vertical=False); - plt.close('all'); + p.plot([df1, s1], result, labels=["df1", "s1"], vertical=False); + plt.close("all"); More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -375,7 +387,7 @@ inherit the parent ``Series``' name, when these existed. .. ipython:: python - s3 = pd.Series([0, 1, 2, 3], name='foo') + s3 = pd.Series([0, 1, 2, 3], name="foo") s4 = pd.Series([0, 1, 2, 3]) s5 = pd.Series([0, 1, 4, 5]) @@ -385,49 +397,46 @@ Through the ``keys`` argument we can override the existing column names. .. ipython:: python - pd.concat([s3, s4, s5], axis=1, keys=['red', 'blue', 'yellow']) + pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"]) Let's consider a variation of the very first example presented: .. ipython:: python - result = pd.concat(frames, keys=['x', 'y', 'z']) + result = pd.concat(frames, keys=["x", "y", "z"]) .. ipython:: python :suppress: @savefig merging_concat_group_keys2.png - p.plot(frames, result, - labels=['df1', 'df2', 'df3'], vertical=True); - plt.close('all'); + p.plot(frames, result, labels=["df1", "df2", "df3"], vertical=True); + plt.close("all"); You can also pass a dict to ``concat`` in which case the dict keys will be used for the ``keys`` argument (unless other keys are specified): .. ipython:: python - pieces = {'x': df1, 'y': df2, 'z': df3} + pieces = {"x": df1, "y": df2, "z": df3} result = pd.concat(pieces) .. 
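The dict keys become the outermost level of the resulting index, and that level can be labelled directly; a small variation on the example above:

.. code-block:: python

   # name the level created from the dict keys
   result = pd.concat(pieces, names=["piece"])
   result.index.names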
ipython:: python :suppress: @savefig merging_concat_dict.png - p.plot([df1, df2, df3], result, - labels=['df1', 'df2', 'df3'], vertical=True); - plt.close('all'); + p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True); + plt.close("all"); .. ipython:: python - result = pd.concat(pieces, keys=['z', 'y']) + result = pd.concat(pieces, keys=["z", "y"]) .. ipython:: python :suppress: @savefig merging_concat_dict_keys.png - p.plot([df1, df2, df3], result, - labels=['df1', 'df2', 'df3'], vertical=True); - plt.close('all'); + p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True); + plt.close("all"); The MultiIndex created has levels that are constructed from the passed keys and the index of the ``DataFrame`` pieces: @@ -441,17 +450,16 @@ do so using the ``levels`` argument: .. ipython:: python - result = pd.concat(pieces, keys=['x', 'y', 'z'], - levels=[['z', 'y', 'x', 'w']], - names=['group_key']) + result = pd.concat( + pieces, keys=["x", "y", "z"], levels=[["z", "y", "x", "w"]], names=["group_key"] + ) .. ipython:: python :suppress: @savefig merging_concat_dict_keys_names.png - p.plot([df1, df2, df3], result, - labels=['df1', 'df2', 'df3'], vertical=True); - plt.close('all'); + p.plot([df1, df2, df3], result, labels=["df1", "df2", "df3"], vertical=True); + plt.close("all"); .. ipython:: python @@ -471,16 +479,15 @@ append a single row to a ``DataFrame`` by passing a ``Series`` or dict to .. ipython:: python - s2 = pd.Series(['X0', 'X1', 'X2', 'X3'], index=['A', 'B', 'C', 'D']) + s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"]) result = df1.append(s2, ignore_index=True) .. ipython:: python :suppress: @savefig merging_append_series_as_row.png - p.plot([df1, s2], result, - labels=['df1', 's2'], vertical=True); - plt.close('all'); + p.plot([df1, s2], result, labels=["df1", "s2"], vertical=True); + plt.close("all"); You should use ``ignore_index`` with this method to instruct DataFrame to discard its index. If you wish to preserve the index, you should construct an @@ -490,17 +497,15 @@ You can also pass a list of dicts or Series: .. ipython:: python - dicts = [{'A': 1, 'B': 2, 'C': 3, 'X': 4}, - {'A': 5, 'B': 6, 'C': 7, 'Y': 8}] + dicts = [{"A": 1, "B": 2, "C": 3, "X": 4}, {"A": 5, "B": 6, "C": 7, "Y": 8}] result = df1.append(dicts, ignore_index=True, sort=False) .. ipython:: python :suppress: @savefig merging_append_dits.png - p.plot([df1, pd.DataFrame(dicts)], result, - labels=['df1', 'dicts'], vertical=True); - plt.close('all'); + p.plot([df1, pd.DataFrame(dicts)], result, labels=["df1", "dicts"], vertical=True); + plt.close("all"); .. _merging.join: @@ -524,10 +529,21 @@ all standard database join operations between ``DataFrame`` or named ``Series`` :: - pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False, - validate=None) + pd.merge( + left, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=True, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ) * ``left``: A DataFrame or named Series object. * ``right``: Another DataFrame or named Series object. @@ -621,22 +637,29 @@ key combination: .. 
ipython:: python - left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3']}) + left = pd.DataFrame( + { + "key": ["K0", "K1", "K2", "K3"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) - right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}) - result = pd.merge(left, right, on='key') + right = pd.DataFrame( + { + "key": ["K0", "K1", "K2", "K3"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) + result = pd.merge(left, right, on="key") .. ipython:: python :suppress: @savefig merging_merge_on_key.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); Here is a more complicated example with multiple join keys. Only the keys appearing in ``left`` and ``right`` are present (the intersection), since @@ -644,25 +667,32 @@ appearing in ``left`` and ``right`` are present (the intersection), since .. ipython:: python - left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], - 'key2': ['K0', 'K1', 'K0', 'K1'], - 'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3']}) + left = pd.DataFrame( + { + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) - right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], - 'key2': ['K0', 'K0', 'K0', 'K0'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}) + right = pd.DataFrame( + { + "key1": ["K0", "K1", "K1", "K2"], + "key2": ["K0", "K0", "K0", "K0"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) - result = pd.merge(left, right, on=['key1', 'key2']) + result = pd.merge(left, right, on=["key1", "key2"]) .. ipython:: python :suppress: @savefig merging_merge_on_key_multiple.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); The ``how`` argument to ``merge`` specifies how to determine which keys are to be included in the resulting table. If a key combination **does not appear** in @@ -680,50 +710,46 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='left', on=['key1', 'key2']) + result = pd.merge(left, right, how="left", on=["key1", "key2"]) .. ipython:: python :suppress: @savefig merging_merge_on_key_left.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, how='right', on=['key1', 'key2']) + result = pd.merge(left, right, how="right", on=["key1", "key2"]) .. ipython:: python :suppress: @savefig merging_merge_on_key_right.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); + p.plot([left, right], result, labels=["left", "right"], vertical=False); .. ipython:: python - result = pd.merge(left, right, how='outer', on=['key1', 'key2']) + result = pd.merge(left, right, how="outer", on=["key1", "key2"]) .. 
ipython:: python :suppress: @savefig merging_merge_on_key_outer.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, how='inner', on=['key1', 'key2']) + result = pd.merge(left, right, how="inner", on=["key1", "key2"]) .. ipython:: python :suppress: @savefig merging_merge_on_key_inner.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); You can merge a mult-indexed Series and a DataFrame, if the names of the MultiIndex correspond to the columns from the DataFrame. Transform @@ -743,26 +769,25 @@ as shown in the following example. ) ser - pd.merge(df, ser.reset_index(), on=['Let', 'Num']) + pd.merge(df, ser.reset_index(), on=["Let", "Num"]) Here is another example with duplicate join keys in DataFrames: .. ipython:: python - left = pd.DataFrame({'A': [1, 2], 'B': [2, 2]}) + left = pd.DataFrame({"A": [1, 2], "B": [2, 2]}) - right = pd.DataFrame({'A': [4, 5, 6], 'B': [2, 2, 2]}) + right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) - result = pd.merge(left, right, on='B', how='outer') + result = pd.merge(left, right, on="B", how="outer") .. ipython:: python :suppress: @savefig merging_merge_on_key_dup.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. warning:: @@ -786,12 +811,12 @@ In the following example, there are duplicate values of ``B`` in the right .. ipython:: python - left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]}) - right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]}) + left = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) .. code-block:: ipython - In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one") + In [53]: result = pd.merge(left, right, on="B", how="outer", validate="one_to_one") ... MergeError: Merge keys are not unique in right dataset; not a one-to-one merge @@ -801,7 +826,7 @@ ensure there are no duplicates in the left DataFrame, one can use the .. ipython:: python - pd.merge(left, right, on='B', how='outer', validate="one_to_many") + pd.merge(left, right, on="B", how="outer", validate="one_to_many") .. _merging.indicator: @@ -823,15 +848,15 @@ that takes on values: .. ipython:: python - df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']}) - df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]}) - pd.merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]}) + df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) + pd.merge(df1, df2, on="col1", how="outer", indicator=True) The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python - pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') + pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column") .. _merging.dtypes: @@ -843,25 +868,25 @@ Merging will preserve the dtype of the join keys. .. 
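As a quick self-check of this guarantee, with throwaway frames (the names are illustrative):

.. code-block:: python

   check_left = pd.DataFrame({"key": [1, 2], "v": [1.5, 2.5]})
   check_right = pd.DataFrame({"key": [1, 2], "w": [10, 20]})
   # no missing keys are introduced, so the integer join key keeps its dtype
   assert pd.merge(check_left, check_right, on="key")["key"].dtype == "int64"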
ipython:: python - left = pd.DataFrame({'key': [1], 'v1': [10]}) + left = pd.DataFrame({"key": [1], "v1": [10]}) left - right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + right = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) right We are able to preserve the join keys: .. ipython:: python - pd.merge(left, right, how='outer') - pd.merge(left, right, how='outer').dtypes + pd.merge(left, right, how="outer") + pd.merge(left, right, how="outer").dtypes Of course if you have missing values that are introduced, then the resulting dtype will be upcast. .. ipython:: python - pd.merge(left, right, how='outer', on='key') - pd.merge(left, right, how='outer', on='key').dtypes + pd.merge(left, right, how="outer", on="key") + pd.merge(left, right, how="outer", on="key").dtypes Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals `. @@ -871,12 +896,12 @@ The left frame. from pandas.api.types import CategoricalDtype - X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) + X = pd.Series(np.random.choice(["foo", "bar"], size=(10,))) + X = X.astype(CategoricalDtype(categories=["foo", "bar"])) - left = pd.DataFrame({'X': X, - 'Y': np.random.choice(['one', 'two', 'three'], - size=(10,))}) + left = pd.DataFrame( + {"X": X, "Y": np.random.choice(["one", "two", "three"], size=(10,))} + ) left left.dtypes @@ -884,9 +909,12 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar'], - dtype=CategoricalDtype(['foo', 'bar'])), - 'Z': [1, 2]}) + right = pd.DataFrame( + { + "X": pd.Series(["foo", "bar"], dtype=CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } + ) right right.dtypes @@ -894,7 +922,7 @@ The merged result: .. ipython:: python - result = pd.merge(left, right, how='outer') + result = pd.merge(left, right, how="outer") result result.dtypes @@ -918,13 +946,13 @@ potentially differently-indexed ``DataFrames`` into a single result .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=['K0', 'K1', 'K2']) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"] + ) - right = pd.DataFrame({'C': ['C0', 'C2', 'C3'], - 'D': ['D0', 'D2', 'D3']}, - index=['K0', 'K2', 'K3']) + right = pd.DataFrame( + {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"] + ) result = left.join(right) @@ -932,35 +960,32 @@ potentially differently-indexed ``DataFrames`` into a single result :suppress: @savefig merging_join.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = left.join(right, how='outer') + result = left.join(right, how="outer") .. ipython:: python :suppress: @savefig merging_join_outer.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); The same as above, but with ``how='inner'``. .. ipython:: python - result = left.join(right, how='inner') + result = left.join(right, how="inner") .. 
ipython:: python :suppress: @savefig merging_join_inner.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); The data alignment here is on the indexes (row labels). This same behavior can be achieved using ``merge`` plus additional arguments instructing it to use the @@ -968,27 +993,25 @@ indexes: .. ipython:: python - result = pd.merge(left, right, left_index=True, right_index=True, how='outer') + result = pd.merge(left, right, left_index=True, right_index=True, how="outer") .. ipython:: python :suppress: @savefig merging_merge_index_outer.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, left_index=True, right_index=True, how='inner'); + result = pd.merge(left, right, left_index=True, right_index=True, how="inner") .. ipython:: python :suppress: @savefig merging_merge_index_inner.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); Joining key columns on an index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1001,8 +1024,9 @@ completely equivalent: :: left.join(right, on=key_or_keys) - pd.merge(left, right, left_on=key_or_keys, right_index=True, - how='left', sort=False) + pd.merge( + left, right, left_on=key_or_keys, right_index=True, how="left", sort=False + ) Obviously you can choose whichever form you find more convenient. For many-to-one joins (where one of the ``DataFrame``'s is already indexed by the @@ -1010,36 +1034,37 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key': ['K0', 'K1', 'K0', 'K1']}) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key": ["K0", "K1", "K0", "K1"], + } + ) - right = pd.DataFrame({'C': ['C0', 'C1'], - 'D': ['D0', 'D1']}, - index=['K0', 'K1']) + right = pd.DataFrame({"C": ["C0", "C1"], "D": ["D0", "D1"]}, index=["K0", "K1"]) - result = left.join(right, on='key') + result = left.join(right, on="key") .. ipython:: python :suppress: @savefig merging_join_key_columns.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, left_on='key', right_index=True, - how='left', sort=False); + result = pd.merge( + left, right, left_on="key", right_index=True, how="left", sort=False + ) .. ipython:: python :suppress: @savefig merging_merge_key_columns.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. _merging.multikey_join: @@ -1047,30 +1072,34 @@ To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: .. 
ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key1': ['K0', 'K0', 'K1', 'K2'], - 'key2': ['K0', 'K1', 'K0', 'K1']}) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + } + ) - index = pd.MultiIndex.from_tuples([('K0', 'K0'), ('K1', 'K0'), - ('K2', 'K0'), ('K2', 'K1')]) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=index) + index = pd.MultiIndex.from_tuples( + [("K0", "K0"), ("K1", "K0"), ("K2", "K0"), ("K2", "K1")] + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index + ) Now this can be joined by passing the two key column names: .. ipython:: python - result = left.join(right, on=['key1', 'key2']) + result = left.join(right, on=["key1", "key2"]) .. ipython:: python :suppress: @savefig merging_join_multikeys.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. _merging.df_inner_join: @@ -1081,15 +1110,14 @@ easily performed: .. ipython:: python - result = left.join(right, on=['key1', 'key2'], how='inner') + result = left.join(right, on=["key1", "key2"], how="inner") .. ipython:: python :suppress: @savefig merging_join_multikeys_inner.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); As you can see, this drops any rows where there was no match. @@ -1104,41 +1132,44 @@ a level name of the MultiIndexed frame. .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=pd.Index(['K0', 'K1', 'K2'], name='key')) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, + index=pd.Index(["K0", "K1", "K2"], name="key"), + ) - index = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=index) + index = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], + names=["key", "Y"], + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, + index=index, + ) + + result = left.join(right, how="inner") - result = left.join(right, how='inner') .. ipython:: python :suppress: @savefig merging_join_multiindex_inner.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); This is equivalent but less verbose and more memory efficient / faster than this. .. ipython:: python - result = pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner').set_index(['key','Y']) + result = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], how="inner" + ).set_index(["key","Y"]) .. ipython:: python :suppress: @savefig merging_merge_multiindex_alternative.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. 
_merging.join_with_two_multi_indexes: @@ -1151,55 +1182,55 @@ the left argument, as in this example: .. ipython:: python - leftindex = pd.MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], - names=['abc', 'xy', 'num']) - left = pd.DataFrame({'v1': range(12)}, index=leftindex) + leftindex = pd.MultiIndex.from_product( + [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"] + ) + left = pd.DataFrame({"v1": range(12)}, index=leftindex) left - rightindex = pd.MultiIndex.from_product([list('abc'), list('xy')], - names=['abc', 'xy']) - right = pd.DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) + rightindex = pd.MultiIndex.from_product( + [list("abc"), list("xy")], names=["abc", "xy"] + ) + right = pd.DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) right - left.join(right, on=['abc', 'xy'], how='inner') + left.join(right, on=["abc", "xy"], how="inner") If that condition is not satisfied, a join with two multi-indexes can be done using the following code. .. ipython:: python - leftindex = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=leftindex) + leftindex = pd.MultiIndex.from_tuples( + [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] + ) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex + ) - rightindex = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=rightindex) + rightindex = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=rightindex + ) - result = pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner').set_index(['key', 'X', 'Y']) + result = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], how="inner" + ).set_index(["key", "X", "Y"]) .. ipython:: python :suppress: @savefig merging_merge_two_multiindex.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. _merging.merge_on_columns_and_levels: Merging on a combination of columns and index levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.23 - Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters may refer to either column names or index level names. This enables merging ``DataFrame`` instances on a combination of index levels and columns without @@ -1207,29 +1238,36 @@ resetting indexes. .. 
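The reset-merge-set recipe above generalizes to any shared level name. A hypothetical helper (not part of pandas, shown only as a sketch) captures the pattern:

.. code-block:: python

    import pandas as pd

    def merge_on_level(left, right, level, how="inner"):
        # Move both MultiIndexes into columns, merge on the shared
        # level, then rebuild the combined index: left's levels first,
        # followed by right's levels that are not already present.
        merged = pd.merge(
            left.reset_index(), right.reset_index(), on=[level], how=how
        )
        levels = list(left.index.names) + [
            name for name in right.index.names if name not in left.index.names
        ]
        return merged.set_index(levels)

Applied to the ``left`` and ``right`` frames above, ``merge_on_level(left, right, "key")`` reproduces ``result``.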
ipython:: python - left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + left_index = pd.Index(["K0", "K0", "K1", "K2"], name="key1") - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key2': ['K0', 'K1', 'K0', 'K1']}, - index=left_index) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key2": ["K0", "K1", "K0", "K1"], + }, + index=left_index, + ) - right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + right_index = pd.Index(["K0", "K1", "K2", "K2"], name="key1") - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3'], - 'key2': ['K0', 'K0', 'K0', 'K1']}, - index=right_index) + right = pd.DataFrame( + { + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + "key2": ["K0", "K0", "K0", "K1"], + }, + index=right_index, + ) - result = left.merge(right, on=['key1', 'key2']) + result = left.merge(right, on=["key1", "key2"]) .. ipython:: python :suppress: @savefig merge_on_index_and_column.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. note:: @@ -1238,7 +1276,7 @@ resetting indexes. DataFrame. .. note:: - When DataFrames are merged using only some of the levels of a `MultiIndex`, + When DataFrames are merged using only some of the levels of a ``MultiIndex``, the extra levels will be dropped from the resulting merge. In order to preserve those levels, use ``reset_index`` on those level names to move those levels to columns prior to doing the merge. @@ -1258,47 +1296,44 @@ columns: .. ipython:: python - left = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'v': [1, 2, 3]}) - right = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'v': [4, 5, 6]}) + left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) + right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) - result = pd.merge(left, right, on='k') + result = pd.merge(left, right, on="k") .. ipython:: python :suppress: @savefig merging_merge_overlapped.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. ipython:: python - result = pd.merge(left, right, on='k', suffixes=('_l', '_r')) + result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) .. ipython:: python :suppress: @savefig merging_merge_overlapped_suffix.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); :meth:`DataFrame.join` has ``lsuffix`` and ``rsuffix`` arguments which behave similarly. .. ipython:: python - left = left.set_index('k') - right = right.set_index('k') - result = left.join(right, lsuffix='_l', rsuffix='_r') + left = left.set_index("k") + right = right.set_index("k") + result = left.join(right, lsuffix="_l", rsuffix="_r") .. ipython:: python :suppress: @savefig merging_merge_overlapped_multi_suffix.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=False); - plt.close('all'); + p.plot([left, right], result, labels=["left", "right"], vertical=False); + plt.close("all"); .. _merging.multiple_join: @@ -1310,16 +1345,20 @@ to join them together on their indexes. .. 
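The note above about ``MultiIndex`` levels matters in practice: levels that do not participate in the merge disappear from the result. Moving them to columns first, as the note suggests, keeps them around (a minimal sketch with hypothetical frames):

.. code-block:: python

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [("K0", "X0"), ("K1", "X1")], names=["key", "X"]
    )
    left = pd.DataFrame({"A": ["A0", "A1"]}, index=idx)
    right = pd.DataFrame(
        {"B": ["B0", "B1"]}, index=pd.Index(["K0", "K1"], name="key")
    )

    # Merging on "key" alone would drop the unused "X" level; resetting
    # the index first turns it into an ordinary column that survives.
    preserved = (
        left.reset_index()
        .merge(right.reset_index(), on="key")
        .set_index(["key", "X"])
    )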
ipython:: python - right2 = pd.DataFrame({'v': [7, 8, 9]}, index=['K1', 'K1', 'K2']) + right2 = pd.DataFrame({"v": [7, 8, 9]}, index=["K1", "K1", "K2"]) result = left.join([right, right2]) .. ipython:: python :suppress: @savefig merging_join_multi_df.png - p.plot([left, right, right2], result, - labels=['left', 'right', 'right2'], vertical=False); - plt.close('all'); + p.plot( + [left, right, right2], + result, + labels=["left", "right", "right2"], + vertical=False, + ); + plt.close("all"); .. _merging.combine_first.update: @@ -1332,10 +1371,10 @@ one object from values for matching indices in the other. Here is an example: .. ipython:: python - df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan], - [np.nan, 7., np.nan]]) - df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]], - index=[1, 2]) + df1 = pd.DataFrame( + [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]] + ) + df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) For this, use the :meth:`~DataFrame.combine_first` method: @@ -1347,9 +1386,8 @@ For this, use the :meth:`~DataFrame.combine_first` method: :suppress: @savefig merging_combine_first.png - p.plot([df1, df2], result, - labels=['df1', 'df2'], vertical=False); - plt.close('all'); + p.plot([df1, df2], result, labels=["df1", "df2"], vertical=False); + plt.close("all"); Note that this method only takes values from the right ``DataFrame`` if they are missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, @@ -1368,9 +1406,8 @@ alters non-NA values in place: :suppress: @savefig merging_update.png - p.plot([df1_copy, df2], df1, - labels=['df1', 'df2'], vertical=False); - plt.close('all'); + p.plot([df1_copy, df2], df1, labels=["df1", "df2"], vertical=False); + plt.close("all"); .. _merging.time_series: @@ -1388,14 +1425,13 @@ fill/interpolate missing data: .. ipython:: python - left = pd.DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) + left = pd.DataFrame( + {"k": ["K0", "K1", "K1", "K2"], "lv": [1, 2, 3, 4], "s": ["a", "b", "c", "d"]} + ) - right = pd.DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) + right = pd.DataFrame({"k": ["K1", "K2", "K4"], "rv": [1, 2, 3]}) - pd.merge_ordered(left, right, fill_method='ffill', left_by='s') + pd.merge_ordered(left, right, fill_method="ffill", left_by="s") .. _merging.merge_asof: @@ -1415,37 +1451,44 @@ merge them. .. 
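The distinction drawn above between :meth:`~DataFrame.combine_first` and :meth:`~DataFrame.update` is easiest to see side by side (a sketch reusing ``df1`` and ``df2`` from above):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame(
        [[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]
    )
    df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2])

    # combine_first returns a new frame: df1's non-NA values always win,
    # and df2 only fills the holes.
    combined = df1.combine_first(df2)
    assert combined.loc[1, 0] == -4.6   # df1's value kept, not df2's -42.6

    # update works in place and overwrites df1 wherever df2 is non-NA.
    df1.update(df2)
    assert df1.loc[1, 0] == -42.6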
ipython:: python - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) .. ipython:: python @@ -1456,18 +1499,13 @@ By default we are taking the asof of the quotes. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker') + pd.merge_asof(trades, quotes, on="time", by="ticker") We only asof within ``2ms`` between the quote time and the trade time. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('2ms')) + pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")) We only asof within ``10ms`` between the quote time and the trade time and we exclude exact matches on time. Note that though we exclude the exact matches @@ -1475,11 +1513,14 @@ exclude exact matches on time. Note that though we exclude the exact matches .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('10ms'), - allow_exact_matches=False) + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ) .. _merging.compare: @@ -1491,7 +1532,7 @@ compare two DataFrame or Series, respectively, and summarize their differences. This feature was added in :ref:`V1.1.0 `. -For example, you might want to compare two `DataFrame` and stack their differences +For example, you might want to compare two ``DataFrame`` and stack their differences side by side. .. ipython:: python @@ -1500,7 +1541,7 @@ side by side. 
{ "col1": ["a", "a", "b", "b", "a"], "col2": [1.0, 2.0, 3.0, np.nan, 5.0], - "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + "col3": [1.0, 2.0, 3.0, 4.0, 5.0], }, columns=["col1", "col2", "col3"], ) @@ -1509,8 +1550,8 @@ side by side. .. ipython:: python df2 = df.copy() - df2.loc[0, 'col1'] = 'c' - df2.loc[2, 'col3'] = 4.0 + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 df2 .. ipython:: python @@ -1527,7 +1568,7 @@ If you wish, you may choose to stack the differences on rows. df.compare(df2, align_axis=0) -If you wish to keep all original rows and columns, set `keep_shape` argument +If you wish to keep all original rows and columns, set ``keep_shape`` argument to ``True``. .. ipython:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e68a0598bb71..1621b37f31b23 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -38,12 +38,15 @@ arise and we wish to also consider that "missing" or "not available" or "NA". .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], - columns=['one', 'two', 'three']) - df['four'] = 'bar' - df['five'] = df['one'] > 0 + df = pd.DataFrame( + np.random.randn(5, 3), + index=["a", "c", "e", "f", "h"], + columns=["one", "two", "three"], + ) + df["four"] = "bar" + df["five"] = df["one"] > 0 df - df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']) + df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"]) df2 To make detecting missing values easier (and across different array dtypes), @@ -53,9 +56,9 @@ Series and DataFrame objects: .. ipython:: python - df2['one'] - pd.isna(df2['one']) - df2['four'].notna() + df2["one"] + pd.isna(df2["one"]) + df2["four"].notna() df2.isna() .. warning:: @@ -65,20 +68,20 @@ Series and DataFrame objects: .. ipython:: python - None == None # noqa: E711 + None == None # noqa: E711 np.nan == np.nan So as compared to above, a scalar equality comparison versus a ``None/np.nan`` doesn't provide useful information. .. ipython:: python - df2['one'] == np.nan + df2["one"] == np.nan Integer dtypes and missing data ------------------------------- Because ``NaN`` is a float, a column of integers with even one missing values -is cast to floating-point dtype (see :ref:`gotchas.intna` for more). Pandas +is cast to floating-point dtype (see :ref:`gotchas.intna` for more). pandas provides a nullable integer array, which can be used by explicitly requesting the dtype: @@ -101,9 +104,9 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``. .. ipython:: python df2 = df.copy() - df2['timestamp'] = pd.Timestamp('20120101') + df2["timestamp"] = pd.Timestamp("20120101") df2 - df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan + df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan df2 df2.dtypes.value_counts() @@ -146,9 +149,9 @@ objects. .. ipython:: python :suppress: - df = df2.loc[:, ['one', 'two', 'three']] - a = df2.loc[df2.index[:5], ['one', 'two']].fillna(method='pad') - b = df2.loc[df2.index[:5], ['one', 'two', 'three']] + df = df2.loc[:, ["one", "two", "three"]] + a = df2.loc[df2.index[:5], ["one", "two"]].fillna(method="pad") + b = df2.loc[df2.index[:5], ["one", "two", "three"]] .. ipython:: python @@ -168,7 +171,7 @@ account for missing data. For example: .. ipython:: python df - df['one'].sum() + df["one"].sum() df.mean(1) df.cumsum() df.cumsum(skipna=False) @@ -210,7 +213,7 @@ with R, for example: .. 
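Because ``np.nan`` never compares equal to anything, including itself, equality tests cannot locate missing values; ``isna``/``notna`` are the reliable tests. A minimal sketch:

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 3.0])

    # Equality comparisons against NaN are always False...
    assert not (np.nan == np.nan)
    (s == np.nan).any()      # False, even though s has a missing value

    # ...so use isna()/notna() instead.
    assert s.isna().sum() == 1
    assert s.notna().sum() == 2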
ipython:: python df - df.groupby('one').mean() + df.groupby("one").mean() See the groupby section :ref:`here ` for more information. @@ -234,7 +237,7 @@ of ways, which we illustrate: df2 df2.fillna(0) - df2['one'].fillna('missing') + df2["one"].fillna("missing") **Fill gaps forward or backward** @@ -244,14 +247,14 @@ can propagate non-NA values forward or backward: .. ipython:: python df - df.fillna(method='pad') + df.fillna(method="pad") .. _missing_data.fillna.limit: **Limit the amount of filling** If we only want consecutive gaps filled up to a certain number of data points, -we can use the `limit` keyword: +we can use the ``limit`` keyword: .. ipython:: python :suppress: @@ -261,7 +264,7 @@ we can use the `limit` keyword: .. ipython:: python df - df.fillna(method='pad', limit=1) + df.fillna(method="pad", limit=1) To remind you, these are the available filling methods: @@ -289,21 +292,21 @@ use case of this is to fill a DataFrame with the mean of that column. .. ipython:: python - dff = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC')) + dff = pd.DataFrame(np.random.randn(10, 3), columns=list("ABC")) dff.iloc[3:5, 0] = np.nan dff.iloc[4:6, 1] = np.nan dff.iloc[5:8, 2] = np.nan dff dff.fillna(dff.mean()) - dff.fillna(dff.mean()['B':'C']) + dff.fillna(dff.mean()["B":"C"]) Same result as above, but is aligning the 'fill' value which is a Series in this case. .. ipython:: python - dff.where(pd.notna(dff), dff.mean(), axis='columns') + dff.where(pd.notna(dff), dff.mean(), axis="columns") .. _missing_data.dropna: @@ -317,15 +320,15 @@ data. To do this, use :meth:`~DataFrame.dropna`: .. ipython:: python :suppress: - df['two'] = df['two'].fillna(0) - df['three'] = df['three'].fillna(0) + df["two"] = df["two"].fillna(0) + df["three"] = df["three"].fillna(0) .. ipython:: python df df.dropna(axis=0) df.dropna(axis=1) - df['one'].dropna() + df["one"].dropna() An equivalent :meth:`~Series.dropna` is available for Series. DataFrame.dropna has considerably more options than Series.dropna, which can be @@ -336,10 +339,6 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -.. versionadded:: 0.23.0 - - The ``limit_area`` keyword argument was added. - Both Series and DataFrame objects have :meth:`~DataFrame.interpolate` that, by default, performs linear interpolation at missing data points. @@ -347,7 +346,7 @@ that, by default, performs linear interpolation at missing data points. :suppress: np.random.seed(123456) - idx = pd.date_range('1/1/2000', periods=100, freq='BM') + idx = pd.date_range("1/1/2000", periods=100, freq="BM") ts = pd.Series(np.random.randn(100), index=idx) ts[1:5] = np.nan ts[20:30] = np.nan @@ -380,28 +379,32 @@ Index aware interpolation is available via the ``method`` keyword: ts2 ts2.interpolate() - ts2.interpolate(method='time') + ts2.interpolate(method="time") For a floating-point index, use ``method='values'``: .. ipython:: python :suppress: - idx = [0., 1., 10.] - ser = pd.Series([0., np.nan, 10.], idx) + idx = [0.0, 1.0, 10.0] + ser = pd.Series([0.0, np.nan, 10.0], idx) .. ipython:: python ser ser.interpolate() - ser.interpolate(method='values') + ser.interpolate(method="values") You can also interpolate with a DataFrame: .. 
ipython:: python - df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], - 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df = pd.DataFrame( + { + "A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], + "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4], + } + ) df df.interpolate() @@ -422,20 +425,20 @@ The appropriate interpolation method will depend on the type of data you are wor .. ipython:: python - df.interpolate(method='barycentric') + df.interpolate(method="barycentric") - df.interpolate(method='pchip') + df.interpolate(method="pchip") - df.interpolate(method='akima') + df.interpolate(method="akima") When interpolating via a polynomial or spline approximation, you must also specify the degree or order of the approximation: .. ipython:: python - df.interpolate(method='spline', order=2) + df.interpolate(method="spline", order=2) - df.interpolate(method='polynomial', order=2) + df.interpolate(method="polynomial", order=2) Compare several methods: @@ -443,10 +446,10 @@ Compare several methods: np.random.seed(2) - ser = pd.Series(np.arange(1, 10.1, .25) ** 2 + np.random.randn(37)) + ser = pd.Series(np.arange(1, 10.1, 0.25) ** 2 + np.random.randn(37)) missing = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29]) ser[missing] = np.nan - methods = ['linear', 'quadratic', 'cubic'] + methods = ["linear", "quadratic", "cubic"] df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods}) @savefig compare_interpolations.png @@ -463,8 +466,8 @@ at the new values. ser = pd.Series(np.sort(np.random.uniform(size=100))) # interpolate at new_index - new_index = ser.index | pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) - interp_s = ser.reindex(new_index).interpolate(method='pchip') + new_index = ser.index.union(pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75])) + interp_s = ser.reindex(new_index).interpolate(method="pchip") interp_s[49:51] .. _scipy: https://www.scipy.org @@ -482,8 +485,7 @@ filled since the last valid observation: .. ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, - np.nan, 13, np.nan, np.nan]) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) ser # fill all consecutive values in a forward direction @@ -498,28 +500,28 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use .. ipython:: python # fill one consecutive value backwards - ser.interpolate(limit=1, limit_direction='backward') + ser.interpolate(limit=1, limit_direction="backward") # fill one consecutive value in both directions - ser.interpolate(limit=1, limit_direction='both') + ser.interpolate(limit=1, limit_direction="both") # fill all consecutive values in both directions - ser.interpolate(limit_direction='both') + ser.interpolate(limit_direction="both") By default, ``NaN`` values are filled whether they are inside (surrounded by) -existing valid values, or outside existing valid values. Introduced in v0.23 -the ``limit_area`` parameter restricts filling to either inside or outside values. +existing valid values, or outside existing valid values. The ``limit_area`` +parameter restricts filling to either inside or outside values. .. 
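The reindex-then-interpolate idiom above is the general way to evaluate a series at new points, and it works with any ``method``, not just ``pchip`` (a sketch using index-value interpolation to avoid the SciPy dependency):

.. code-block:: python

    import pandas as pd

    ser = pd.Series([0.0, 10.0], index=[0.0, 1.0])

    # Take the union of the old and new points, reindex (which inserts
    # NaNs at the new points), then interpolate on the index values.
    new_index = ser.index.union(pd.Index([0.25, 0.5]))
    ser.reindex(new_index).interpolate(method="values")
    # 0.25 -> 2.5, 0.5 -> 5.0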
ipython:: python # fill one consecutive inside value in both directions - ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + ser.interpolate(limit_direction="both", limit_area="inside", limit=1) # fill all consecutive outside values backward - ser.interpolate(limit_direction='backward', limit_area='outside') + ser.interpolate(limit_direction="backward", limit_area="outside") # fill all consecutive outside values in both directions - ser.interpolate(limit_direction='both', limit_area='outside') + ser.interpolate(limit_direction="both", limit_area="outside") .. _missing_data.replace: @@ -535,7 +537,7 @@ value: .. ipython:: python - ser = pd.Series([0., 1., 2., 3., 4.]) + ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) ser.replace(0, 5) @@ -555,16 +557,16 @@ For a DataFrame, you can specify individual values by column: .. ipython:: python - df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + df = pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]}) - df.replace({'a': 0, 'b': 5}, 100) + df.replace({"a": 0, "b": 5}, 100) Instead of replacing with specified values, you can treat all given values as missing and interpolate over them: .. ipython:: python - ser.replace([1, 2, 3], method='pad') + ser.replace([1, 2, 3], method="pad") .. _missing_data.replace_expression: @@ -585,67 +587,67 @@ Replace the '.' with ``NaN`` (str -> str): .. ipython:: python - d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']} + d = {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} df = pd.DataFrame(d) - df.replace('.', np.nan) + df.replace(".", np.nan) Now do it with a regular expression that removes surrounding whitespace (regex -> regex): .. ipython:: python - df.replace(r'\s*\.\s*', np.nan, regex=True) + df.replace(r"\s*\.\s*", np.nan, regex=True) Replace a few different values (list -> list): .. ipython:: python - df.replace(['a', '.'], ['b', np.nan]) + df.replace(["a", "."], ["b", np.nan]) list of regex -> list of regex: .. ipython:: python - df.replace([r'\.', r'(a)'], ['dot', r'\1stuff'], regex=True) + df.replace([r"\.", r"(a)"], ["dot", r"\1stuff"], regex=True) Only search in column ``'b'`` (dict -> dict): .. ipython:: python - df.replace({'b': '.'}, {'b': np.nan}) + df.replace({"b": "."}, {"b": np.nan}) Same as the previous example, but use a regular expression for searching instead (dict of regex -> dict): .. ipython:: python - df.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) + df.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) You can pass nested dictionaries of regular expressions that use ``regex=True``: .. ipython:: python - df.replace({'b': {'b': r''}}, regex=True) + df.replace({"b": {"b": r""}}, regex=True) Alternatively, you can pass the nested dictionary like so: .. ipython:: python - df.replace(regex={'b': {r'\s*\.\s*': np.nan}}) + df.replace(regex={"b": {r"\s*\.\s*": np.nan}}) You can also use the group of a regular expression match when replacing (dict of regex -> dict of regex), this works for lists as well. .. ipython:: python - df.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + df.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) You can pass a list of regular expressions, of which those that match will be replaced with a scalar (list of regex -> regex). .. 
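``replace`` also accepts a single dict mapping each old value directly to its replacement, a compact alternative to the parallel-list form shown above (sketch):

.. code-block:: python

    import pandas as pd

    ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])

    # {old: new} pairs, applied in one pass.
    ser.replace({0: 5, 3: 30})
    # Equivalent to ser.replace([0, 3], [5, 30]).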
ipython:: python - df.replace([r'\s*\.\s*', r'a|b'], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -654,7 +656,7 @@ dictionary. The previous example, in this case, would then be: .. ipython:: python - df.replace(regex=[r'\s*\.\s*', r'a|b'], value=np.nan) + df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) This can be convenient if you do not want to pass ``regex=True`` every time you want to use a regular expression. @@ -680,7 +682,7 @@ Replacing more than one value is possible by passing a list. .. ipython:: python df00 = df.iloc[0, 0] - df.replace([1.5, df00], [np.nan, 'a']) + df.replace([1.5, df00], [np.nan, "a"]) df[1].dtype You can also operate on the DataFrame in place: @@ -689,32 +691,6 @@ You can also operate on the DataFrame in place: df.replace(1.5, np.nan, inplace=True) -.. warning:: - - When replacing multiple ``bool`` or ``datetime64`` objects, the first - argument to ``replace`` (``to_replace``) must match the type of the value - being replaced. For example, - - .. code-block:: python - - >>> s = pd.Series([True, False, True]) - >>> s.replace({'a string': 'new value', True: False}) # raises - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - will raise a ``TypeError`` because one of the ``dict`` keys is not of the - correct type for replacement. - - However, when replacing a *single* object such as, - - .. ipython:: python - - s = pd.Series([True, False, True]) - s.replace('a string', 'another string') - - the original ``NDFrame`` object will be returned untouched. We're working on - unifying this API, but for backwards compatibility reasons we cannot break - the latter behavior. See :issue:`6354` for more details. - Missing data casting rules and indexing --------------------------------------- @@ -762,7 +738,7 @@ However, these can be filled in using :meth:`~DataFrame.fillna` and it will work reindexed[crit.fillna(False)] reindexed[crit.fillna(True)] -Pandas provides a nullable integer dtype, but you must explicitly request it +pandas provides a nullable integer dtype, but you must explicitly request it when creating the series or column. Notice that we use a capital "I" in the ``dtype="Int64"``. @@ -962,7 +938,7 @@ the first 10 columns. .. ipython:: python - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") bb[bb.columns[:10]].dtypes .. ipython:: python diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 398336960e769..b8e75b0535823 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -17,6 +17,7 @@ You can get/set options directly as attributes of the top-level ``options`` attr .. ipython:: python import pandas as pd + pd.options.display.max_rows pd.options.display.max_rows = 999 pd.options.display.max_rows @@ -77,9 +78,9 @@ are available from the pandas namespace. To change an option, call .. ipython:: python - pd.get_option('mode.sim_interactive') - pd.set_option('mode.sim_interactive', True) - pd.get_option('mode.sim_interactive') + pd.get_option("mode.sim_interactive") + pd.set_option("mode.sim_interactive", True) + pd.get_option("mode.sim_interactive") **Note:** The option 'mode.sim_interactive' is mostly used for debugging purposes. 
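The getter/setter pair above composes with ``reset_option`` into a safe round trip, with the library default restored at the end (sketch):

.. code-block:: python

    import pandas as pd

    default = pd.get_option("display.max_rows")
    pd.set_option("display.max_rows", 999)
    assert pd.get_option("display.max_rows") == 999

    # reset_option restores the default, whatever it was.
    pd.reset_option("display.max_rows")
    assert pd.get_option("display.max_rows") == default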
@@ -109,7 +110,7 @@ It's also possible to reset multiple options at once (using a regex): ``option_context`` context manager has been exposed through the top-level API, allowing you to execute code with given option values. Option values -are restored automatically when you exit the `with` block: +are restored automatically when you exit the ``with`` block: .. ipython:: python @@ -123,20 +124,21 @@ are restored automatically when you exit the `with` block: Setting startup options in Python/IPython environment ----------------------------------------------------- -Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default ipython profile can be found at: +Using startup scripts for the Python/IPython environment to import pandas and set options makes working with pandas more efficient. To do this, create a .py or .ipy script in the startup directory of the desired profile. An example where the startup folder is in a default IPython profile can be found at: .. code-block:: none $IPYTHONDIR/profile_default/startup -More information can be found in the `ipython documentation +More information can be found in the `IPython documentation `__. An example startup script for pandas is displayed below: .. code-block:: python import pandas as pd - pd.set_option('display.max_rows', 999) - pd.set_option('precision', 5) + + pd.set_option("display.max_rows", 999) + pd.set_option("precision", 5) .. _options.frequently_used: @@ -151,27 +153,27 @@ lines are replaced by an ellipsis. .. ipython:: python df = pd.DataFrame(np.random.randn(7, 2)) - pd.set_option('max_rows', 7) + pd.set_option("max_rows", 7) df - pd.set_option('max_rows', 5) + pd.set_option("max_rows", 5) df - pd.reset_option('max_rows') + pd.reset_option("max_rows") Once the ``display.max_rows`` is exceeded, the ``display.min_rows`` options determines how many rows are shown in the truncated repr. .. ipython:: python - pd.set_option('max_rows', 8) - pd.set_option('min_rows', 4) + pd.set_option("max_rows", 8) + pd.set_option("min_rows", 4) # below max_rows -> all rows shown df = pd.DataFrame(np.random.randn(7, 2)) df # above max_rows -> only min_rows (4) rows shown df = pd.DataFrame(np.random.randn(9, 2)) df - pd.reset_option('max_rows') - pd.reset_option('min_rows') + pd.reset_option("max_rows") + pd.reset_option("min_rows") ``display.expand_frame_repr`` allows for the representation of dataframes to stretch across pages, wrapped over the full column vs row-wise. @@ -179,11 +181,11 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise. .. ipython:: python df = pd.DataFrame(np.random.randn(5, 10)) - pd.set_option('expand_frame_repr', True) + pd.set_option("expand_frame_repr", True) df - pd.set_option('expand_frame_repr', False) + pd.set_option("expand_frame_repr", False) df - pd.reset_option('expand_frame_repr') + pd.reset_option("expand_frame_repr") ``display.large_repr`` lets you select whether to display dataframes that exceed ``max_columns`` or ``max_rows`` as a truncated frame, or as a summary. @@ -191,26 +193,32 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise. .. 
ipython:: python df = pd.DataFrame(np.random.randn(10, 10)) - pd.set_option('max_rows', 5) - pd.set_option('large_repr', 'truncate') + pd.set_option("max_rows", 5) + pd.set_option("large_repr", "truncate") df - pd.set_option('large_repr', 'info') + pd.set_option("large_repr", "info") df - pd.reset_option('large_repr') - pd.reset_option('max_rows') + pd.reset_option("large_repr") + pd.reset_option("max_rows") ``display.max_colwidth`` sets the maximum width of columns. Cells of this length or longer will be truncated with an ellipsis. .. ipython:: python - df = pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'], - ['horse', 'cow', 'banana', 'apple']])) - pd.set_option('max_colwidth', 40) + df = pd.DataFrame( + np.array( + [ + ["foo", "bar", "bim", "uncomfortably long string"], + ["horse", "cow", "banana", "apple"], + ] + ) + ) + pd.set_option("max_colwidth", 40) df - pd.set_option('max_colwidth', 6) + pd.set_option("max_colwidth", 6) df - pd.reset_option('max_colwidth') + pd.reset_option("max_colwidth") ``display.max_info_columns`` sets a threshold for when by-column info will be given. @@ -218,11 +226,11 @@ will be given. .. ipython:: python df = pd.DataFrame(np.random.randn(10, 10)) - pd.set_option('max_info_columns', 11) + pd.set_option("max_info_columns", 11) df.info() - pd.set_option('max_info_columns', 5) + pd.set_option("max_info_columns", 5) df.info() - pd.reset_option('max_info_columns') + pd.reset_option("max_info_columns") ``display.max_info_rows``: ``df.info()`` will usually show null-counts for each column. For large frames this can be quite slow. ``max_info_rows`` and ``max_info_cols`` @@ -233,11 +241,11 @@ can specify the option ``df.info(null_counts=True)`` to override on showing a pa df = pd.DataFrame(np.random.choice([0, 1, np.nan], size=(10, 10))) df - pd.set_option('max_info_rows', 11) + pd.set_option("max_info_rows", 11) df.info() - pd.set_option('max_info_rows', 5) + pd.set_option("max_info_rows", 5) df.info() - pd.reset_option('max_info_rows') + pd.reset_option("max_info_rows") ``display.precision`` sets the output display precision in terms of decimal places. This is only a suggestion. @@ -245,9 +253,9 @@ This is only a suggestion. .. ipython:: python df = pd.DataFrame(np.random.randn(5, 5)) - pd.set_option('precision', 7) + pd.set_option("precision", 7) df - pd.set_option('precision', 4) + pd.set_option("precision", 4) df ``display.chop_threshold`` sets at what level pandas rounds to zero when @@ -257,26 +265,27 @@ precision at which the number is stored. .. ipython:: python df = pd.DataFrame(np.random.randn(6, 6)) - pd.set_option('chop_threshold', 0) + pd.set_option("chop_threshold", 0) df - pd.set_option('chop_threshold', .5) + pd.set_option("chop_threshold", 0.5) df - pd.reset_option('chop_threshold') + pd.reset_option("chop_threshold") ``display.colheader_justify`` controls the justification of the headers. The options are 'right', and 'left'. .. 
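Rather than setting and resetting each display option by hand as above, ``option_context`` scopes several of them at once and restores the previous values on exit (sketch):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 10))

    # Options are passed as alternating name/value pairs; all of them
    # revert automatically when the block ends.
    with pd.option_context("display.max_rows", 5, "display.precision", 2):
        print(df)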
ipython:: python - df = pd.DataFrame(np.array([np.random.randn(6), - np.random.randint(1, 9, 6) * .1, - np.zeros(6)]).T, - columns=['A', 'B', 'C'], dtype='float') - pd.set_option('colheader_justify', 'right') + df = pd.DataFrame( + np.array([np.random.randn(6), np.random.randint(1, 9, 6) * 0.1, np.zeros(6)]).T, + columns=["A", "B", "C"], + dtype="float", + ) + pd.set_option("colheader_justify", "right") df - pd.set_option('colheader_justify', 'left') + pd.set_option("colheader_justify", "left") df - pd.reset_option('colheader_justify') + pd.reset_option("colheader_justify") @@ -306,10 +315,10 @@ display.encoding UTF-8 Defaults to the detected en meant to be displayed on the console. display.expand_frame_repr True Whether to print out the full DataFrame repr for wide DataFrames across - multiple lines, `max_columns` is + multiple lines, ``max_columns`` is still respected, but the output will wrap-around across multiple "pages" - if its width exceeds `display.width`. + if its width exceeds ``display.width``. display.float_format None The callable should accept a floating point number and return a string with the desired format of the number. @@ -323,7 +332,7 @@ display.large_repr truncate For DataFrames exceeding ma (the behaviour in earlier versions of pandas). allowable settings, ['truncate', 'info'] display.latex.repr False Whether to produce a latex DataFrame - representation for jupyter frontends + representation for Jupyter frontends that support it. display.latex.escape True Escapes special characters in DataFrames, when using the to_latex method. @@ -371,11 +380,11 @@ display.max_rows 60 This sets the maximum numbe fully or just a truncated or summary repr. 'None' value means unlimited. display.min_rows 10 The numbers of rows to show in a truncated - repr (when `max_rows` is exceeded). Ignored - when `max_rows` is set to None or 0. When set - to None, follows the value of `max_rows`. + repr (when ``max_rows`` is exceeded). Ignored + when ``max_rows`` is set to None or 0. When set + to None, follows the value of ``max_rows``. display.max_seq_items 100 when pretty-printing a long sequence, - no more then `max_seq_items` will + no more then ``max_seq_items`` will be printed. If items are omitted, they will be denoted by the addition of "..." to the resulting string. @@ -404,7 +413,7 @@ display.show_dimensions truncate Whether to print out dimens frame is truncated (e.g. not display all rows and/or columns) display.width 80 Width of the display in characters. - In case python/IPython is running in + In case Python/IPython is running in a terminal this can be set to None and pandas will correctly auto-detect the width. Note that the IPython notebook, @@ -423,6 +432,16 @@ display.html.use_mathjax True When True, Jupyter notebook dollar symbol. io.excel.xls.writer xlwt The default Excel writer engine for 'xls' files. + + .. deprecated:: 1.2.0 + + As `xlwt `__ + package is no longer maintained, the ``xlwt`` + engine will be removed in a future version of + pandas. Since this is the only engine in pandas + that supports writing to ``.xls`` files, + this option will also be removed. + io.excel.xlsm.writer openpyxl The default Excel writer engine for 'xlsm' files. Available options: 'openpyxl' (the default). 
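Every option in the table above is also self-documenting at the prompt: ``describe_option`` prints the description plus the current and default values, and accepts a regex to cover a whole family of options (sketch):

.. code-block:: python

    import pandas as pd

    # One option...
    pd.describe_option("display.max_rows")
    # ...or everything matching a pattern, e.g. the unicode options.
    pd.describe_option("display.unicode")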
@@ -481,9 +500,9 @@ For instance: import numpy as np pd.set_eng_float_format(accuracy=3, use_eng_prefix=True) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - s / 1.e3 - s / 1.e6 + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + s / 1.0e3 + s / 1.0e6 .. ipython:: python :suppress: @@ -510,7 +529,7 @@ If a DataFrame or Series contains these characters, the default output mode may .. ipython:: python - df = pd.DataFrame({'国籍': ['UK', '日本'], '名前': ['Alice', 'しのぶ']}) + df = pd.DataFrame({"国籍": ["UK", "日本"], "名前": ["Alice", "しのぶ"]}) df .. image:: ../_static/option_unicode01.png @@ -521,7 +540,7 @@ times than the standard ``len`` function. .. ipython:: python - pd.set_option('display.unicode.east_asian_width', True) + pd.set_option("display.unicode.east_asian_width", True) df .. image:: ../_static/option_unicode02.png @@ -533,7 +552,7 @@ By default, an "Ambiguous" character's width, such as "¡" (inverted exclamation .. ipython:: python - df = pd.DataFrame({'a': ['xxx', '¡¡'], 'b': ['yyy', '¡¡']}) + df = pd.DataFrame({"a": ["xxx", "¡¡"], "b": ["yyy", "¡¡"]}) df .. image:: ../_static/option_unicode03.png @@ -545,7 +564,7 @@ However, setting this option incorrectly for your terminal will cause these char .. ipython:: python - pd.set_option('display.unicode.ambiguous_as_wide', True) + pd.set_option("display.unicode.ambiguous_as_wide", True) df .. image:: ../_static/option_unicode04.png @@ -553,8 +572,8 @@ However, setting this option incorrectly for your terminal will cause these char .. ipython:: python :suppress: - pd.set_option('display.unicode.east_asian_width', False) - pd.set_option('display.unicode.ambiguous_as_wide', False) + pd.set_option("display.unicode.east_asian_width", False) + pd.set_option("display.unicode.ambiguous_as_wide", False) .. _options.table_schema: @@ -567,7 +586,7 @@ by default. False by default, this can be enabled globally with the .. ipython:: python - pd.set_option('display.html.table_schema', True) + pd.set_option("display.html.table_schema", True) Only ``'display.max_rows'`` are serialized and published. @@ -575,4 +594,4 @@ Only ``'display.max_rows'`` are serialized and published. .. 
ipython:: python :suppress: - pd.reset_option('display.html.table_schema') + pd.reset_option("display.html.table_schema") diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index aa6bf44547040..77cf43b2e2b19 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -18,14 +18,18 @@ Reshaping by pivoting DataFrame objects import pandas._testing as tm + def unpivot(frame): N, K = frame.shape - data = {'value': frame.to_numpy().ravel('F'), - 'variable': np.asarray(frame.columns).repeat(N), - 'date': np.tile(np.asarray(frame.index), K)} - columns = ['date', 'variable', 'value'] + data = { + "value": frame.to_numpy().ravel("F"), + "variable": np.asarray(frame.columns).repeat(N), + "date": np.tile(np.asarray(frame.index), K), + } + columns = ["date", "variable", "value"] return pd.DataFrame(data, columns=columns) + df = unpivot(tm.makeTimeDataFrame(3)) Data is often stored in so-called "stacked" or "record" format: @@ -41,12 +45,15 @@ For the curious here is how the above ``DataFrame`` was created: import pandas._testing as tm + def unpivot(frame): N, K = frame.shape - data = {'value': frame.to_numpy().ravel('F'), - 'variable': np.asarray(frame.columns).repeat(N), - 'date': np.tile(np.asarray(frame.index), K)} - return pd.DataFrame(data, columns=['date', 'variable', 'value']) + data = { + "value": frame.to_numpy().ravel("F"), + "variable": np.asarray(frame.columns).repeat(N), + "date": np.tile(np.asarray(frame.index), K), + } + return pd.DataFrame(data, columns=["date", "variable", "value"]) df = unpivot(tm.makeTimeDataFrame(3)) @@ -55,7 +62,7 @@ To select out everything for variable ``A`` we could do: .. ipython:: python - df[df['variable'] == 'A'] + df[df["variable"] == "A"] But suppose we wish to do time series operations with the variables. A better representation would be where the ``columns`` are the unique variables and an @@ -65,7 +72,7 @@ top level function :func:`~pandas.pivot`): .. ipython:: python - df.pivot(index='date', columns='variable', values='value') + df.pivot(index="date", columns="variable", values="value") If the ``values`` argument is omitted, and the input ``DataFrame`` has more than one column of values which are not used as column or index inputs to ``pivot``, @@ -75,15 +82,15 @@ column: .. ipython:: python - df['value2'] = df['value'] * 2 - pivoted = df.pivot(index='date', columns='variable') + df["value2"] = df["value"] * 2 + pivoted = df.pivot(index="date", columns="variable") pivoted You can then select subsets from the pivoted ``DataFrame``: .. ipython:: python - pivoted['value2'] + pivoted["value2"] Note that this returns a view on the underlying data in the case where the data are homogeneously-typed. @@ -121,12 +128,16 @@ from the hierarchical indexing section: .. ipython:: python - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', - 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', - 'one', 'two', 'one', 'two']])) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"]) df2 = df[:4] df2 @@ -163,7 +174,7 @@ the level numbers: .. 
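``pivot`` is essentially an index-then-unstack operation; seeing the equivalence spelled out can demystify it (a sketch with a hypothetical tiny frame):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame(
        {
            "date": ["2020-01-01", "2020-01-01", "2020-01-02", "2020-01-02"],
            "variable": ["A", "B", "A", "B"],
            "value": [1.0, 2.0, 3.0, 4.0],
        }
    )

    # Both produce the same dates-by-variables table.
    via_pivot = df.pivot(index="date", columns="variable", values="value")
    via_unstack = df.set_index(["date", "variable"])["value"].unstack("variable")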
ipython:: python - stacked.unstack('second') + stacked.unstack("second") .. image:: ../_static/reshaping_unstack_0.png @@ -174,8 +185,8 @@ will result in a **sorted** copy of the original ``DataFrame`` or ``Series``: .. ipython:: python - index = pd.MultiIndex.from_product([[2, 1], ['a', 'b']]) - df = pd.DataFrame(np.random.randn(4), index=index, columns=['A']) + index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]]) + df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"]) df all(df.unstack().stack() == df.sort_index()) @@ -193,15 +204,19 @@ processed individually. .. ipython:: python - columns = pd.MultiIndex.from_tuples([ - ('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short')], - names=['exp', 'animal', 'hair_length'] + columns = pd.MultiIndex.from_tuples( + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], ) df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df - df.stack(level=['animal', 'hair_length']) + df.stack(level=["animal", "hair_length"]) The list of levels can contain either level names or level numbers (but not a mixture of the two). @@ -222,12 +237,18 @@ calling ``sort_index``, of course). Here is a more complex example: .. ipython:: python - columns = pd.MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), - ('B', 'cat'), ('A', 'dog')], - names=['exp', 'animal']) - index = pd.MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'), - ('one', 'two')], - names=['first', 'second']) + columns = pd.MultiIndex.from_tuples( + [ + ("A", "cat"), + ("B", "dog"), + ("B", "cat"), + ("A", "dog"), + ], + names=["exp", "animal"], + ) + index = pd.MultiIndex.from_product( + [("bar", "baz", "foo", "qux"), ("one", "two")], names=["first", "second"] + ) df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns) df2 = df.iloc[[0, 1, 2, 4, 5, 7]] df2 @@ -237,8 +258,8 @@ which level in the columns to stack: .. ipython:: python - df2.stack('exp') - df2.stack('animal') + df2.stack("exp") + df2.stack("animal") Unstacking can result in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with the default @@ -288,13 +309,17 @@ For instance, .. ipython:: python - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) cheese - cheese.melt(id_vars=['first', 'last']) - cheese.melt(id_vars=['first', 'last'], var_name='quantity') + cheese.melt(id_vars=["first", "last"]) + cheese.melt(id_vars=["first", "last"], var_name="quantity") When transforming a DataFrame using :func:`~pandas.melt`, the index will be ignored. The original index values can be kept around by setting the ``ignore_index`` parameter to ``False`` (default is ``True``). This will however duplicate them. @@ -302,15 +327,19 @@ When transforming a DataFrame using :func:`~pandas.melt`, the index will be igno .. 
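Where unstacking would introduce missing values, ``unstack`` takes a ``fill_value`` argument to substitute something else (a minimal sketch):

.. code-block:: python

    import pandas as pd

    index = pd.MultiIndex.from_tuples(
        [("one", "a"), ("one", "b"), ("two", "a")], names=["first", "second"]
    )
    s = pd.Series([1.0, 2.0, 3.0], index=index)

    # ("two", "b") has no observation, so unstacking yields NaN there...
    s.unstack()
    # ...unless a fill_value is supplied.
    s.unstack(fill_value=0.0)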
ipython:: python - index = pd.MultiIndex.from_tuples([('person', 'A'), ('person', 'B')]) - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}, - index=index) + index = pd.MultiIndex.from_tuples([("person", "A"), ("person", "B")]) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + }, + index=index, + ) cheese - cheese.melt(id_vars=['first', 'last']) - cheese.melt(id_vars=['first', 'last'], ignore_index=False) + cheese.melt(id_vars=["first", "last"]) + cheese.melt(id_vars=["first", "last"], ignore_index=False) Another way to transform is to use the :func:`~pandas.wide_to_long` panel data convenience function. It is less flexible than :func:`~pandas.melt`, but more @@ -318,12 +347,15 @@ user-friendly. .. ipython:: python - dft = pd.DataFrame({"A1970": {0: "a", 1: "b", 2: "c"}, - "A1980": {0: "d", 1: "e", 2: "f"}, - "B1970": {0: 2.5, 1: 1.2, 2: .7}, - "B1980": {0: 3.2, 1: 1.3, 2: .1}, - "X": dict(zip(range(3), np.random.randn(3))) - }) + dft = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), np.random.randn(3))), + } + ) dft["id"] = dft.index dft pd.wide_to_long(dft, ["A", "B"], i="id", j="year") @@ -380,23 +412,32 @@ Consider a data set like this: .. ipython:: python import datetime - df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, - 'B': ['A', 'B', 'C'] * 8, - 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, - 'D': np.random.randn(24), - 'E': np.random.randn(24), - 'F': [datetime.datetime(2013, i, 1) for i in range(1, 13)] - + [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) + + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + "F": [datetime.datetime(2013, i, 1) for i in range(1, 13)] + + [datetime.datetime(2013, i, 15) for i in range(1, 13)], + } + ) df We can produce pivot tables from this data very easily: .. ipython:: python - pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) - pd.pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) - pd.pivot_table(df, values=['D', 'E'], index=['B'], columns=['A', 'C'], - aggfunc=np.sum) + pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) + pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum) + pd.pivot_table( + df, values=["D", "E"], + index=["B"], + columns=["A", "C"], + aggfunc=np.sum, + ) The result object is a ``DataFrame`` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -405,22 +446,21 @@ hierarchy in the columns: .. ipython:: python - pd.pivot_table(df, index=['A', 'B'], columns=['C']) + pd.pivot_table(df, index=["A", "B"], columns=["C"]) Also, you can use ``Grouper`` for ``index`` and ``columns`` keywords. For detail of ``Grouper``, see :ref:`Grouping with a Grouper specification `. .. ipython:: python - pd.pivot_table(df, values='D', index=pd.Grouper(freq='M', key='F'), - columns='C') + pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C") You can render a nice output of the table omitting the missing values by calling ``to_string`` if you wish: .. 
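``aggfunc`` is not limited to a single function or a list: a dict maps each value column to its own aggregation (a sketch reusing the shape of the frame above, minus the date column):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "A": ["one", "one", "two", "three"] * 6,
            "B": ["A", "B", "C"] * 8,
            "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
            "D": np.random.randn(24),
            "E": np.random.randn(24),
        }
    )

    # Mean of D but sum of E, in one pivot_table call.
    pd.pivot_table(
        df,
        values=["D", "E"],
        index=["B"],
        columns=["A"],
        aggfunc={"D": np.mean, "E": np.sum},
    )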
ipython:: python - table = pd.pivot_table(df, index=['A', 'B'], columns=['C']) - print(table.to_string(na_rep='')) + table = pd.pivot_table(df, index=["A", "B"], columns=["C"]) + print(table.to_string(na_rep="")) Note that ``pivot_table`` is also available as an instance method on DataFrame, i.e. :meth:`DataFrame.pivot_table`. @@ -436,7 +476,7 @@ rows and columns: .. ipython:: python - df.pivot_table(index=['A', 'B'], columns='C', margins=True, aggfunc=np.std) + df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std) .. _reshaping.crosstabulations: @@ -470,30 +510,31 @@ For example: .. ipython:: python - foo, bar, dull, shiny, one, two = 'foo', 'bar', 'dull', 'shiny', 'one', 'two' + foo, bar, dull, shiny, one, two = "foo", "bar", "dull", "shiny", "one", "two" a = np.array([foo, foo, bar, bar, foo, foo], dtype=object) b = np.array([one, one, two, one, two, one], dtype=object) c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) - pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) If ``crosstab`` receives only two Series, it will provide a frequency table. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 2, 2, 2], 'B': [3, 3, 4, 4, 4], - 'C': [1, 1, np.nan, 1, 1]}) + df = pd.DataFrame( + {"A": [1, 2, 2, 2, 2], "B": [3, 3, 4, 4, 4], "C": [1, 1, np.nan, 1, 1]} + ) df - pd.crosstab(df['A'], df['B']) + pd.crosstab(df["A"], df["B"]) ``crosstab`` can also be implemented to ``Categorical`` data. .. ipython:: python - foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) - bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) + bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"]) pd.crosstab(foo, bar) If you want to include **all** of data categories even if the actual data does @@ -513,13 +554,13 @@ using the ``normalize`` argument: .. ipython:: python - pd.crosstab(df['A'], df['B'], normalize=True) + pd.crosstab(df["A"], df["B"], normalize=True) ``normalize`` can also normalize values within each row or within each column: .. ipython:: python - pd.crosstab(df['A'], df['B'], normalize='columns') + pd.crosstab(df["A"], df["B"], normalize="columns") ``crosstab`` can also be passed a third ``Series`` and an aggregation function (``aggfunc``) that will be applied to the values of the third ``Series`` within @@ -527,7 +568,7 @@ each group defined by the first two ``Series``: .. ipython:: python - pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum) + pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc=np.sum) Adding margins ~~~~~~~~~~~~~~ @@ -536,8 +577,9 @@ Finally, one can also add margins or normalize this output. .. ipython:: python - pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True, - margins=True) + pd.crosstab( + df["A"], df["B"], values=df["C"], aggfunc=np.sum, normalize=True, margins=True + ) .. _reshaping.tile: .. _reshaping.tile.cut: @@ -581,19 +623,19 @@ values, can derive a ``DataFrame`` containing ``k`` columns of 1s and 0s using .. ipython:: python - df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)}) + df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)}) - pd.get_dummies(df['key']) + pd.get_dummies(df["key"]) Sometimes it's useful to prefix the column names, for example when merging the result with the original ``DataFrame``: .. 
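Besides ``True`` and ``"columns"``, ``normalize`` also accepts ``"index"``, making each row of the crosstab sum to one (sketch, reusing the small frequency-table frame from above):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"A": [1, 2, 2, 2, 2], "B": [3, 3, 4, 4, 4], "C": [1, 1, np.nan, 1, 1]}
    )

    # Row-wise proportions instead of column-wise ones.
    pd.crosstab(df["A"], df["B"], normalize="index")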
ipython:: python - dummies = pd.get_dummies(df['key'], prefix='key') + dummies = pd.get_dummies(df["key"], prefix="key") dummies - df[['data1']].join(dummies) + df[["data1"]].join(dummies) This function is often used along with discretization functions like ``cut``: @@ -609,14 +651,13 @@ This function is often used along with discretization functions like ``cut``: See also :func:`Series.str.get_dummies `. :func:`get_dummies` also accepts a ``DataFrame``. By default all categorical -variables (categorical in the statistical sense, those with `object` or -`categorical` dtype) are encoded as dummy variables. +variables (categorical in the statistical sense, those with ``object`` or +``categorical`` dtype) are encoded as dummy variables. .. ipython:: python - df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], - 'C': [1, 2, 3]}) + df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]}) pd.get_dummies(df) All non-object columns are included untouched in the output. You can control @@ -624,7 +665,7 @@ the columns that are encoded with the ``columns`` keyword. .. ipython:: python - pd.get_dummies(df, columns=['A']) + pd.get_dummies(df, columns=["A"]) Notice that the ``B`` column is still included in the output, it just hasn't been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't @@ -641,11 +682,11 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: .. ipython:: python - simple = pd.get_dummies(df, prefix='new_prefix') + simple = pd.get_dummies(df, prefix="new_prefix") simple - from_list = pd.get_dummies(df, prefix=['from_A', 'from_B']) + from_list = pd.get_dummies(df, prefix=["from_A", "from_B"]) from_list - from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'}) + from_dict = pd.get_dummies(df, prefix={"B": "from_B", "A": "from_A"}) from_dict Sometimes it will be useful to only keep k-1 levels of a categorical @@ -654,7 +695,7 @@ You can switch to this mode by turn on ``drop_first``. .. ipython:: python - s = pd.Series(list('abcaa')) + s = pd.Series(list("abcaa")) pd.get_dummies(s) @@ -664,7 +705,7 @@ When a column contains only one level, it will be omitted in the result. .. ipython:: python - df = pd.DataFrame({'A': list('aaaaa'), 'B': list('ababc')}) + df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")}) pd.get_dummies(df) @@ -675,12 +716,10 @@ To choose another dtype, use the ``dtype`` argument: .. ipython:: python - df = pd.DataFrame({'A': list('abc'), 'B': [1.1, 2.2, 3.3]}) + df = pd.DataFrame({"A": list("abc"), "B": [1.1, 2.2, 3.3]}) pd.get_dummies(df, dtype=bool).dtypes -.. versionadded:: 0.23.0 - .. _reshaping.factorize: @@ -691,7 +730,7 @@ To encode 1-d values as an enumerated type use :func:`~pandas.factorize`: .. ipython:: python - x = pd.Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) + x = pd.Series(["A", "A", np.nan, "B", 3.14, np.inf]) x labels, uniques = pd.factorize(x) labels @@ -735,11 +774,12 @@ DataFrame will be pivoted in the answers below. 
np.random.seed([3, 1415]) n = 20 - cols = np.array(['key', 'row', 'item', 'col']) - df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) - // [2, 1, 2, 1]).astype(str)) + cols = np.array(["key", "row", "item", "col"]) + df = cols + pd.DataFrame( + (np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str) + ) df.columns = cols - df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val')) + df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix("val")) df @@ -764,8 +804,7 @@ This solution uses :func:`~pandas.pivot_table`. Also note that .. ipython:: python - df.pivot_table( - values='val0', index='row', columns='col', aggfunc='mean') + df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean") Note that we can also replace the missing values by using the ``fill_value`` parameter. @@ -773,7 +812,12 @@ parameter. .. ipython:: python df.pivot_table( - values='val0', index='row', columns='col', aggfunc='mean', fill_value=0) + values="val0", + index="row", + columns="col", + aggfunc="mean", + fill_value=0, + ) Also note that we can pass in other aggregation functions as well. For example, we can also pass in ``sum``. @@ -781,7 +825,12 @@ we can also pass in ``sum``. .. ipython:: python df.pivot_table( - values='val0', index='row', columns='col', aggfunc='sum', fill_value=0) + values="val0", + index="row", + columns="col", + aggfunc="sum", + fill_value=0, + ) Another aggregation we can do is calculate the frequency with which the columns and rows occur together a.k.a. "cross tabulation". To do this, we can pass @@ -789,7 +838,7 @@ and rows occur together a.k.a. "cross tabulation". To do this, we can pass .. ipython:: python - df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') + df.pivot_table(index="row", columns="col", fill_value=0, aggfunc="size") Pivoting with multiple aggregations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -800,7 +849,11 @@ We can also perform multiple aggregations. For example, to perform both a .. ipython:: python df.pivot_table( - values='val0', index='row', columns='col', aggfunc=['mean', 'sum']) + values="val0", + index="row", + columns="col", + aggfunc=["mean", "sum"], + ) Note that to aggregate over multiple value columns, we can pass in a list to the ``values`` parameter. @@ -808,7 +861,11 @@ Note to aggregate over multiple value columns, we can pass in a list to the .. ipython:: python df.pivot_table( - values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean']) + values=["val0", "val1"], + index="row", + columns="col", + aggfunc=["mean"], + ) Note that to subdivide over multiple columns, we can pass in a list to the ``columns`` parameter. @@ -816,7 +873,11 @@ Note to subdivide over multiple columns we can pass in a list to the .. ipython:: python df.pivot_table( - values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) + values=["val0"], + index="row", + columns=["item", "col"], + aggfunc=["mean"], + ) .. _reshaping.explode: @@ -829,28 +890,28 @@ Sometimes the values in a column are list-like. .. ipython:: python - keys = ['panda1', 'panda2', 'panda3'] - values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']] - df = pd.DataFrame({'keys': keys, 'values': values}) + keys = ["panda1", "panda2", "panda3"] + values = [["eats", "shoots"], ["shoots", "leaves"], ["eats", "leaves"]] + df = pd.DataFrame({"keys": keys, "values": values}) df We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. 
This will replicate the index values from the original row: .. ipython:: python - df['values'].explode() + df["values"].explode() You can also explode the column in the ``DataFrame``. .. ipython:: python - df.explode('values') + df.explode("values") :meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. .. ipython:: python - s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']]) + s = pd.Series([[1, 2, 3], "foo", [], ["a", "b"]]) s s.explode() @@ -858,12 +919,11 @@ Here is a typical usecase. You have comma separated strings in a column and want .. ipython:: python - df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, - {'var1': 'd,e,f', 'var2': 2}]) + df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}]) df Creating a long form DataFrame is now straightforward using explode and chained operations. .. ipython:: python - df.assign(var1=df.var1.str.split(',')).explode('var1') + df.assign(var1=df.var1.str.split(",")).explode("var1") diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index cddc3cb2600fd..7f2419bc7f19d 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -4,7 +4,7 @@ Scaling to large datasets ************************* -Pandas provides data structures for in-memory analytics, which makes using pandas +pandas provides data structures for in-memory analytics, which makes using pandas to analyze datasets that are larger than memory somewhat tricky. Even datasets that are a sizable fraction of memory become unwieldy, as some pandas operations need to make intermediate copies. @@ -13,7 +13,7 @@ This document provides a few recommendations for scaling your analysis to larger It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis for datasets that fit in memory. -But first, it's worth considering *not using pandas*. Pandas isn't the right +But first, it's worth considering *not using pandas*. pandas isn't the right tool for all situations. If you're working with very large datasets and a tool like PostgreSQL fits your needs, then you should probably be using that. Assuming you want or need the expressiveness and power of pandas, let's carry on. @@ -72,7 +72,7 @@ Option 1 loads in all the data and then filters to what we need. .. ipython:: python - columns = ['id_0', 'name_0', 'x_0', 'y_0'] + columns = ["id_0", "name_0", "x_0", "y_0"] pd.read_parquet("timeseries_wide.parquet")[columns] @@ -123,7 +123,7 @@ space-efficient integers to know which specific name is used in each row. .. ipython:: python ts2 = ts.copy() - ts2['name'] = ts2['name'].astype('category') + ts2["name"] = ts2["name"].astype("category") ts2.memory_usage(deep=True) We can go a bit further and downcast the numeric columns to their smallest types @@ -131,8 +131,8 @@ using :func:`pandas.to_numeric`. .. ipython:: python - ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned') - ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float') + ts2["id"] = pd.to_numeric(ts2["id"], downcast="unsigned") + ts2[["x", "y"]] = ts2[["x", "y"]].apply(pd.to_numeric, downcast="float") ts2.dtypes .. ipython:: python @@ -141,8 +141,7 @@ using :func:`pandas.to_numeric`. .. 
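ipython:: python

    # An editorial aside, not from the changeset: ``downcast`` picks the
    # smallest dtype that can hold the values, e.g. small non-negative
    # integers fit in an unsigned 8-bit type
    pd.to_numeric(pd.Series([1, 2, 3]), downcast="unsigned").dtype

Comparing the total footprint before and after shows the overall saving: .. 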
ipython:: python - reduction = (ts2.memory_usage(deep=True).sum() - / ts.memory_usage(deep=True).sum()) + reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum() print(f"{reduction:0.2f}") In all, we've reduced the in-memory footprint of this dataset to 1/5 of its @@ -174,13 +173,13 @@ files. Each file in the directory represents a different year of the entire data import pathlib N = 12 - starts = [f'20{i:>02d}-01-01' for i in range(N)] - ends = [f'20{i:>02d}-12-13' for i in range(N)] + starts = [f"20{i:>02d}-01-01" for i in range(N)] + ends = [f"20{i:>02d}-12-13" for i in range(N)] pathlib.Path("data/timeseries").mkdir(exist_ok=True) for i, (start, end) in enumerate(zip(starts, ends)): - ts = _make_timeseries(start=start, end=end, freq='1T', seed=i) + ts = _make_timeseries(start=start, end=end, freq="1T", seed=i) ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet") @@ -214,8 +213,8 @@ work for arbitrary-sized datasets. for path in files: # Only one dataframe is in memory at a time... df = pd.read_parquet(path) - # ... plus a small Series `counts`, which is updated. - counts = counts.add(df['name'].value_counts(), fill_value=0) + # ... plus a small Series ``counts``, which is updated. + counts = counts.add(df["name"].value_counts(), fill_value=0) counts.astype(int) Some readers, like :meth:`pandas.read_csv`, offer parameters to control the @@ -231,7 +230,7 @@ different library that implements these out-of-core algorithms for you. Use other libraries ------------------- -Pandas is just one library offering a DataFrame API. Because of its popularity, +pandas is just one library offering a DataFrame API. Because of its popularity, pandas' API has become something of a standard that other libraries implement. The pandas documentation maintains a list of libraries implementing a DataFrame API in :ref:`our ecosystem page `. @@ -260,7 +259,7 @@ Inspecting the ``ddf`` object, we see a few things * There are new attributes like ``.npartitions`` and ``.divisions`` The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many **Pandas** DataFrames. A single method call on a +DataFrame is made up of many pandas DataFrames. A single method call on a Dask DataFrame ends up making many pandas method calls, and Dask knows how to coordinate everything to get the result. @@ -278,8 +277,8 @@ Rather than executing immediately, doing operations build up a **task graph**. .. ipython:: python ddf - ddf['name'] - ddf['name'].value_counts() + ddf["name"] + ddf["name"].value_counts() Each of these calls is instant because the result isn't being computed yet. We're just building up a list of computation to do when someone needs the @@ -291,7 +290,7 @@ To get the actual result you can call ``.compute()``. .. ipython:: python - %time ddf['name'].value_counts().compute() + %time ddf["name"].value_counts().compute() At that point, you get back the same thing you'd get with pandas, in this case a concrete pandas Series with the count of each ``name``. @@ -324,7 +323,7 @@ a familiar groupby aggregation. .. ipython:: python - %time ddf.groupby('name')[['x', 'y']].mean().compute().head() + %time ddf.groupby("name")[["x", "y"]].mean().compute().head() The grouping and aggregation is done out-of-core and in parallel. @@ -336,8 +335,8 @@ we need to supply the divisions manually. .. 
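ipython:: python

    # An editorial sketch using Dask's ``known_divisions`` attribute: it
    # reports whether partition boundaries are set and usable for fast
    # index-based lookups
    ddf.known_divisions

Since each file covers one year of the data, the partition boundaries are known and can be supplied by hand: .. 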
ipython:: python N = 12 - starts = [f'20{i:>02d}-01-01' for i in range(N)] - ends = [f'20{i:>02d}-12-13' for i in range(N)] + starts = [f"20{i:>02d}-01-01" for i in range(N)] + ends = [f"20{i:>02d}-12-13" for i in range(N)] divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) ddf.divisions = divisions @@ -347,9 +346,9 @@ Now we can do things like fast random access with ``.loc``. .. ipython:: python - ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute() + ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() -Dask knows to just look in the 3rd partition for selecting values in `2002`. It +Dask knows to just look in the 3rd partition for selecting values in 2002. It doesn't need to look at any other data. Many workflows involve a large amount of data and processing it in a way that @@ -362,7 +361,7 @@ out of memory. At that point it's just a regular pandas object. :okwarning: @savefig dask_resample.png - ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot() + ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() These Dask examples have all been done using multiple processes on a single machine. Dask can be `deployed on a cluster diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index ca8e9a2f313f6..e4eea57c43dbb 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -6,7 +6,7 @@ Sparse data structures ********************** -Pandas provides data structures for efficiently storing sparse data. +pandas provides data structures for efficiently storing sparse data. These are not necessarily sparse in the typical "mostly 0" sense. Rather, you can view these objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value can be chosen, including 0) is omitted. The compressed values are not actually stored in the array. @@ -87,14 +87,15 @@ The :attr:`SparseArray.dtype` property stores two pieces of information sparr.dtype -A :class:`SparseDtype` may be constructed by passing each of these +A :class:`SparseDtype` may be constructed by passing only a dtype .. ipython:: python pd.SparseDtype(np.dtype('datetime64[ns]')) -The default fill value for a given NumPy dtype is the "missing" value for that dtype, -though it may be overridden. +in which case a default fill value will be used (for NumPy dtypes this is often the +"missing" value for that dtype). To override this default, an explicit fill value may be +passed instead .. ipython:: python @@ -115,7 +116,7 @@ Sparse accessor .. versionadded:: 0.24.0 -Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` +pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` for categorical data, and ``.dt`` for datetime-like data. This namespace provides attributes and methods that are specific to sparse data. @@ -178,7 +179,7 @@ sparse values instead. rather than a SparseSeries or SparseDataFrame. This section provides some guidance on migrating your code to the new style. As a reminder, -you can use the python warnings module to control warnings. But we recommend modifying +you can use the Python warnings module to control warnings. But we recommend modifying your code, rather than ignoring the warning. **Construction** @@ -302,14 +303,17 @@ The method requires a ``MultiIndex`` with two or more levels. .. 
ipython:: python s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) - s + s.index = pd.MultiIndex.from_tuples( + [ + (1, 2, "a", 0), + (1, 2, "a", 1), + (1, 1, "b", 0), + (1, 1, "b", 1), + (2, 1, "b", 0), + (2, 1, "b", 1), + ], + names=["A", "B", "C", "D"], + ) ss = s.astype('Sparse') ss @@ -317,9 +321,10 @@ In the example below, we transform the ``Series`` to a sparse representation of .. ipython:: python - A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) + A, rows, columns = ss.sparse.to_coo( + row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True + ) + A A.todense() @@ -330,9 +335,9 @@ Specifying different row and column labels (and not sorting them) yields a diffe .. ipython:: python - A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B', 'C'], - column_levels=['D'], - sort_labels=False) + A, rows, columns = ss.sparse.to_coo( + row_levels=["A", "B", "C"], column_levels=["D"], sort_labels=False + ) A A.todense() @@ -344,8 +349,7 @@ A convenience method :meth:`Series.sparse.from_coo` is implemented for creating .. ipython:: python from scipy import sparse - A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) + A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)) A A.todense() diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index fd8dda4fe365e..24f344488d1ca 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -141,7 +141,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this case, the cell's style depends only on it's own value.\n", + "In this case, the cell's style depends only on its own value.\n", "That means we should use the `Styler.applymap` method which works elementwise." ] }, @@ -793,7 +793,8 @@ "source": [ "The next option you have are \"table styles\".\n", "These are styles that apply to the table as a whole, but don't look at the data.\n", - "Certain sytlings, including pseudo-selectors like `:hover` can only be used this way." + "Certain stylings, including pseudo-selectors like `:hover` can only be used this way.\n", + "These can also be used to set specific row or column based class selectors, as will be shown." ] }, { @@ -831,9 +832,32 @@ "The value for `props` should be a list of tuples of `('attribute', 'value')`.\n", "\n", "`table_styles` are extremely flexible, but not as fun to type out by hand.\n", - "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." + "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here.\n", + "\n", + "`table_styles` can be used to add column and row based class descriptors. For large tables this can increase performance by avoiding repetitive individual css for each cell, and it can also simplify style construction in some cases.\n", + "If `table_styles` is given as a dictionary each key should be a specified column or index value and this will map to specific class CSS selectors of the given column or row.\n", + "\n", + "Note that `Styler.set_table_styles` will overwrite existing styles but can be chained by setting the `overwrite` argument to `False`." 
] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "html = html.set_table_styles({\n", + " 'B': [dict(selector='', props=[('color', 'green')])],\n", + " 'C': [dict(selector='td', props=[('color', 'red')])], \n", + " }, overwrite=False)\n", + "html" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, { "cell_type": "markdown", "metadata": {}, @@ -922,10 +946,12 @@ "- DataFrame only `(use Series.to_frame().style)`\n", "- The index and columns must be unique\n", "- No large repr, and performance isn't great; this is intended for summary DataFrames\n", - "- You can only style the *values*, not the index or columns\n", + "- You can only style the *values*, not the index or columns (except with `table_styles` above)\n", "- You can only apply styles, you can't insert new HTML entities\n", "\n", - "Some of these will be addressed in the future.\n" + "Some of these will be addressed in the future.\n", + "Performance can suffer when adding styles to each cell in a large DataFrame.\n", + "It is recommended to apply table or column based styles where possible to limit overall HTML length, as well as setting a shorter UUID to avoid unnecessary repeated data transmission. \n" ] }, { diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 3408b98b3179d..9b1c9b8d04270 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -46,20 +46,20 @@ infer a list of strings to .. ipython:: python - pd.Series(['a', 'b', 'c']) + pd.Series(["a", "b", "c"]) To explicitly request ``string`` dtype, specify the ``dtype`` .. ipython:: python - pd.Series(['a', 'b', 'c'], dtype="string") - pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) + pd.Series(["a", "b", "c"], dtype="string") + pd.Series(["a", "b", "c"], dtype=pd.StringDtype()) Or ``astype`` after the ``Series`` or ``DataFrame`` is created .. ipython:: python - s = pd.Series(['a', 'b', 'c']) + s = pd.Series(["a", "b", "c"]) s s.astype("string") @@ -71,7 +71,7 @@ it will be converted to ``string`` dtype: .. ipython:: python - s = pd.Series(['a', 2, np.nan], dtype="string") + s = pd.Series(["a", 2, np.nan], dtype="string") s type(s[1]) @@ -147,15 +147,16 @@ the equivalent (scalar) built-in string methods: .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str.lower() s.str.upper() s.str.len() .. ipython:: python - idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) + idx = pd.Index([" jack", "jill ", " jesse ", "frank"]) idx.str.strip() idx.str.lstrip() idx.str.rstrip() @@ -166,8 +167,9 @@ leading or trailing whitespace: .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 2), - columns=[' Column A ', ' Column B '], index=range(3)) + df = pd.DataFrame( + np.random.randn(3, 2), columns=[" Column A ", " Column B "], index=range(3) + ) df Since ``df.columns`` is an Index object, we can use the ``.str`` accessor @@ -183,7 +185,7 @@ and replacing any remaining whitespaces with underscores: .. ipython:: python - df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') + df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_") df .. note:: @@ -221,21 +223,21 @@ Methods like ``split`` return a Series of lists: .. 
ipython:: python - s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string") - s2.str.split('_') + s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string") + s2.str.split("_") Elements in the split lists can be accessed using ``get`` or ``[]`` notation: .. ipython:: python - s2.str.split('_').str.get(1) - s2.str.split('_').str[1] + s2.str.split("_").str.get(1) + s2.str.split("_").str[1] It is easy to expand this to return a DataFrame using ``expand``. .. ipython:: python - s2.str.split('_', expand=True) + s2.str.split("_", expand=True) When original ``Series`` has :class:`StringDtype`, the output columns will all be :class:`StringDtype` as well. @@ -244,56 +246,47 @@ It is also possible to limit the number of splits: .. ipython:: python - s2.str.split('_', expand=True, n=1) + s2.str.split("_", expand=True, n=1) ``rsplit`` is similar to ``split`` except it works in the reverse direction, i.e., from the end of the string to the beginning of the string: .. ipython:: python - s2.str.rsplit('_', expand=True, n=1) + s2.str.rsplit("_", expand=True, n=1) -``replace`` by default replaces `regular expressions +``replace`` optionally uses `regular expressions `__: .. ipython:: python - s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', - '', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s3 = pd.Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], + dtype="string", + ) s3 - s3.str.replace('^.a|dog', 'XX-XX ', case=False) + s3.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) -Some caution must be taken to keep regular expressions in mind! For example, the -following code will cause trouble because of the regular expression meaning of -`$`: - -.. ipython:: python - - # Consider the following badly formatted financial data - dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string") - - # This does what you'd naively expect: - dollars.str.replace('$', '') - - # But this doesn't: - dollars.str.replace('-$', '-') +.. warning:: - # We need to escape the special character (for >1 len patterns) - dollars.str.replace(r'-\$', '-') + Some caution must be taken when dealing with regular expressions! The current behavior + is to treat single character patterns as literal strings, even when ``regex`` is set + to ``True``. This behavior is deprecated and will be removed in a future version so + that the ``regex`` keyword is always respected. -.. versionadded:: 0.23.0 +.. versionchanged:: 1.2.0 -If you do want literal replacement of a string (equivalent to -:meth:`str.replace`), you can set the optional ``regex`` parameter to -``False``, rather than escaping each character. In this case both ``pat`` -and ``repl`` must be strings: +If you want literal replacement of a string (equivalent to :meth:`str.replace`), you +can set the optional ``regex`` parameter to ``False``, rather than escaping each +character. In this case both ``pat`` and ``repl`` must be strings: .. ipython:: python + dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string") + # These lines are equivalent - dollars.str.replace(r'-\$', '-') - dollars.str.replace('-$', '-', regex=False) + dollars.str.replace(r"-\$", "-", regex=True) + dollars.str.replace("-$", "-", regex=False) The ``replace`` method can also take a callable as replacement. It is called on every ``pat`` using :func:`re.sub`. The callable should expect one @@ -302,22 +295,29 @@ positional argument (a regex object) and return a string. .. 
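ipython:: python

    # A minimal editorial sketch, not from the changeset: upper-case every
    # match by handing ``replace`` a callable
    pd.Series(["ab", "cd"], dtype="string").str.replace(
        r"[a-z]", lambda m: m.group(0).upper(), regex=True
    )

Two fuller examples: .. 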
ipython:: python # Reverse every lowercase alphabetic word - pat = r'[a-z]+' + pat = r"[a-z]+" + def repl(m): return m.group(0)[::-1] - pd.Series(['foo 123', 'bar baz', np.nan], - dtype="string").str.replace(pat, repl) + + pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace( + pat, repl, regex=True + ) + # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" + def repl(m): - return m.group('two').swapcase() + return m.group("two").swapcase() + - pd.Series(['Foo Bar Baz', np.nan], - dtype="string").str.replace(pat, repl) + pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace( + pat, repl, regex=True + ) The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the @@ -326,8 +326,9 @@ compiled regular expression object. .. ipython:: python import re - regex_pat = re.compile(r'^.a|dog', flags=re.IGNORECASE) - s3.str.replace(regex_pat, 'XX-XX ') + + regex_pat = re.compile(r"^.a|dog", flags=re.IGNORECASE) + s3.str.replace(regex_pat, "XX-XX ", regex=True) Including a ``flags`` argument when calling ``replace`` with a compiled regular expression object will raise a ``ValueError``. @@ -354,8 +355,8 @@ The content of a ``Series`` (or ``Index``) can be concatenated: .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], dtype="string") - s.str.cat(sep=',') + s = pd.Series(["a", "b", "c", "d"], dtype="string") + s.str.cat(sep=",") If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: @@ -367,9 +368,9 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re .. ipython:: python - t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string") - t.str.cat(sep=',') - t.str.cat(sep=',', na_rep='-') + t = pd.Series(["a", "b", np.nan, "d"], dtype="string") + t.str.cat(sep=",") + t.str.cat(sep=",", na_rep="-") Concatenating a Series and something list-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -378,20 +379,18 @@ The first argument to :meth:`~Series.str.cat` can be a list-like object, provide .. ipython:: python - s.str.cat(['A', 'B', 'C', 'D']) + s.str.cat(["A", "B", "C", "D"]) Missing values on either side will result in missing values in the result as well, *unless* ``na_rep`` is specified: .. ipython:: python s.str.cat(t) - s.str.cat(t, na_rep='-') + s.str.cat(t, na_rep="-") Concatenating a Series and something array-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.23.0 - The parameter ``others`` can also be two-dimensional. In this case, the number or rows must match the lengths of the calling ``Series`` (or ``Index``). .. ipython:: python @@ -399,25 +398,22 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o d = pd.concat([t, s], axis=1) s d - s.str.cat(d, na_rep='-') + s.str.cat(d, na_rep="-") Concatenating a Series and an indexed object into a Series, with alignment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.23.0 - For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the indexes before concatenation by setting the ``join``-keyword. .. ipython:: python :okwarning: - u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], - dtype="string") + u = pd.Series(["b", "d", "a", "c"], index=[1, 3, 0, 2], dtype="string") s u s.str.cat(u) - s.str.cat(u, join='left') + s.str.cat(u, join="left") .. 
warning:: @@ -429,12 +425,11 @@ In particular, alignment also means that the different lengths do not need to co .. ipython:: python - v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4], - dtype="string") + v = pd.Series(["z", "a", "b", "d", "e"], index=[-1, 0, 1, 3, 4], dtype="string") s v - s.str.cat(v, join='left', na_rep='-') - s.str.cat(v, join='outer', na_rep='-') + s.str.cat(v, join="left", na_rep="-") + s.str.cat(v, join="outer", na_rep="-") The same alignment can be used when ``others`` is a ``DataFrame``: @@ -443,7 +438,7 @@ The same alignment can be used when ``others`` is a ``DataFrame``: f = d.loc[[3, 2, 1, 0], :] s f - s.str.cat(f, join='left', na_rep='-') + s.str.cat(f, join="left", na_rep="-") Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -455,7 +450,7 @@ can be combined in a list-like container (including iterators, ``dict``-views, e s u - s.str.cat([u, u.to_numpy()], join='left') + s.str.cat([u, u.to_numpy()], join="left") All elements without an index (e.g. ``np.ndarray``) within the passed list-like must match in length to the calling ``Series`` (or ``Index``), but ``Series`` and ``Index`` may have arbitrary length (as long as alignment is not disabled with ``join=None``): @@ -463,7 +458,7 @@ but ``Series`` and ``Index`` may have arbitrary length (as long as alignment is .. ipython:: python v - s.str.cat([v, u, u.to_numpy()], join='outer', na_rep='-') + s.str.cat([v, u, u.to_numpy()], join="outer", na_rep="-") If using ``join='right'`` on a list-like of ``others`` that contains different indexes, the union of these indexes will be used as the basis for the final concatenation: @@ -472,7 +467,7 @@ the union of these indexes will be used as the basis for the final concatenation u.loc[[3]] v.loc[[-1, 0]] - s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join='right', na_rep='-') + s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join="right", na_rep="-") Indexing with ``.str`` ---------------------- @@ -485,9 +480,9 @@ of the string, the result will be a ``NaN``. .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, - 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str[0] s.str[1] @@ -518,8 +513,10 @@ DataFrame with one column per group. .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'([ab])(\d)', expand=False) + pd.Series( + ["a1", "b2", "c3"], + dtype="string", + ).str.extract(r"([ab])(\d)", expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -532,16 +529,18 @@ Named groups like .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'(?P[ab])(?P\d)', - expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract( + r"(?P[ab])(?P\d)", expand=False + ) and optional groups like .. ipython:: python - pd.Series(['a1', 'b2', '3'], - dtype="string").str.extract(r'([ab])?(\d)', expand=False) + pd.Series( + ["a1", "b2", "3"], + dtype="string", + ).str.extract(r"([ab])?(\d)", expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -552,23 +551,20 @@ with one column if ``expand=True``. .. 
ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=True) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=True) It returns a Series if ``expand=False``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. .. ipython:: python - s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], - dtype="string") + s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], dtype="string") s s.index.str.extract("(?P[a-zA-Z])", expand=True) @@ -613,10 +609,9 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], - dtype="string") + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], dtype="string") s - two_groups = '(?P[a-z])(?P[0-9])' + two_groups = "(?P[a-z])(?P[0-9])" s.str.extract(two_groups, expand=True) the ``extractall`` method returns every match. The result of @@ -632,7 +627,7 @@ When each subject string in the Series has exactly one match, .. ipython:: python - s = pd.Series(['a3', 'b3', 'c2'], dtype="string") + s = pd.Series(["a3", "b3", "c2"], dtype="string") s then ``extractall(pat).xs(0, level='match')`` gives the same result as @@ -663,23 +658,29 @@ You can check whether elements contain a pattern: .. ipython:: python - pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.contains(pattern) + pattern = r"[0-9][a-z]" + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.contains(pattern) Or whether elements match a pattern: .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.match(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.match(pattern) .. versionadded:: 1.1.0 .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.fullmatch(pattern) + pd.Series( + ["1", "2", "3a", "3b", "03c", "4dx"], + dtype="string", + ).str.fullmatch(pattern) .. note:: @@ -701,9 +702,10 @@ True or False: .. ipython:: python - s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") - s4.str.contains('A', na=False) + s4 = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) + s4.str.contains("A", na=False) .. _text.indicator: @@ -715,15 +717,15 @@ For example if they are separated by a ``'|'``: .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string") - s.str.get_dummies(sep='|') + s = pd.Series(["a", "a|b", np.nan, "a|c"], dtype="string") + s.str.get_dummies(sep="|") String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. .. ipython:: python - idx = pd.Index(['a', 'a|b', np.nan, 'a|c']) - idx.str.get_dummies(sep='|') + idx = pd.Index(["a", "a|b", np.nan, "a|c"]) + idx.str.get_dummies(sep="|") See also :func:`~pandas.get_dummies`. diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 3439a0a4c13c7..0b4ddaaa8a42a 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -18,44 +18,40 @@ parsing, and attributes. 
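As a quick orientation before the details, a single editorial sketch of the scalar in arithmetic:

.. ipython:: python

    pd.Timestamp("2021-01-01 09:00") + pd.Timedelta("90 min")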
Parsing ------- -You can construct a ``Timedelta`` scalar through various arguments: +You can construct a ``Timedelta`` scalar through various arguments, including `ISO 8601 Duration`_ strings. .. ipython:: python import datetime # strings - pd.Timedelta('1 days') - pd.Timedelta('1 days 00:00:00') - pd.Timedelta('1 days 2 hours') - pd.Timedelta('-1 days 2 min 3us') + pd.Timedelta("1 days") + pd.Timedelta("1 days 00:00:00") + pd.Timedelta("1 days 2 hours") + pd.Timedelta("-1 days 2 min 3us") # like datetime.timedelta # note: these MUST be specified as keyword arguments pd.Timedelta(days=1, seconds=1) # integers with a unit - pd.Timedelta(1, unit='d') + pd.Timedelta(1, unit="d") # from a datetime.timedelta/np.timedelta64 pd.Timedelta(datetime.timedelta(days=1, seconds=1)) - pd.Timedelta(np.timedelta64(1, 'ms')) + pd.Timedelta(np.timedelta64(1, "ms")) # negative Timedeltas have this string repr # to be more consistent with datetime.timedelta conventions - pd.Timedelta('-1us') + pd.Timedelta("-1us") # a NaT - pd.Timedelta('nan') - pd.Timedelta('nat') + pd.Timedelta("nan") + pd.Timedelta("nat") # ISO 8601 Duration strings - pd.Timedelta('P0DT0H1M0S') - pd.Timedelta('P0DT0H0M0.000000123S') - -.. versionadded:: 0.23.0 - - Added constructor for `ISO 8601 Duration`_ strings + pd.Timedelta("P0DT0H1M0S") + pd.Timedelta("P0DT0H0M0.000000123S") :ref:`DateOffsets` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction. @@ -67,8 +63,9 @@ Further, operations among the scalars yield another scalar ``Timedelta``. .. ipython:: python - pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) +\ - pd.Timedelta('00:00:00.000123') + pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) + pd.Timedelta( + "00:00:00.000123" + ) to_timedelta ~~~~~~~~~~~~ @@ -82,28 +79,28 @@ You can parse a single string to a Timedelta: .. ipython:: python - pd.to_timedelta('1 days 06:05:01.00003') - pd.to_timedelta('15.5us') + pd.to_timedelta("1 days 06:05:01.00003") + pd.to_timedelta("15.5us") or a list/array of strings: .. ipython:: python - pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"]) The ``unit`` keyword argument specifies the unit of the Timedelta: .. ipython:: python - pd.to_timedelta(np.arange(5), unit='s') - pd.to_timedelta(np.arange(5), unit='d') + pd.to_timedelta(np.arange(5), unit="s") + pd.to_timedelta(np.arange(5), unit="d") .. _timedeltas.limitations: Timedelta limitations ~~~~~~~~~~~~~~~~~~~~~ -Pandas represents ``Timedeltas`` in nanosecond resolution using +pandas represents ``Timedeltas`` in nanosecond resolution using 64 bit integers. As such, the 64 bit integer limits determine the ``Timedelta`` limits. @@ -122,11 +119,11 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. .. ipython:: python - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")) td = pd.Series([pd.Timedelta(days=i) for i in range(3)]) - df = pd.DataFrame({'A': s, 'B': td}) + df = pd.DataFrame({"A": s, "B": td}) df - df['C'] = df['A'] + df['B'] + df["C"] = df["A"] + df["B"] df df.dtypes @@ -169,10 +166,10 @@ Operands can also appear in a reversed order (a singular object operated with a .. 
ipython:: python - A = s - pd.Timestamp('20120101') - pd.Timedelta('00:05:05') - B = s - pd.Series(pd.date_range('2012-1-2', periods=3, freq='D')) + A = s - pd.Timestamp("20120101") - pd.Timedelta("00:05:05") + B = s - pd.Series(pd.date_range("2012-1-2", periods=3, freq="D")) - df = pd.DataFrame({'A': A, 'B': B}) + df = pd.DataFrame({"A": A, "B": B}) df df.min() @@ -196,17 +193,17 @@ You can fillna on timedeltas, passing a timedelta to get a particular value. .. ipython:: python y.fillna(pd.Timedelta(0)) - y.fillna(pd.Timedelta(10, unit='s')) - y.fillna(pd.Timedelta('-1 days, 00:00:05')) + y.fillna(pd.Timedelta(10, unit="s")) + y.fillna(pd.Timedelta("-1 days, 00:00:05")) You can also negate, multiply and use ``abs`` on ``Timedeltas``: .. ipython:: python - td1 = pd.Timedelta('-1 days 2 hours 3 seconds') + td1 = pd.Timedelta("-1 days 2 hours 3 seconds") td1 -1 * td1 - - td1 + -td1 abs(td1) .. _timedeltas.timedeltas_reductions: @@ -219,12 +216,13 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob .. ipython:: python - y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat', - '-1 days +00:00:05', '1 days'])) + y2 = pd.Series( + pd.to_timedelta(["-1 days +00:00:05", "nat", "-1 days +00:00:05", "1 days"]) + ) y2 y2.mean() y2.median() - y2.quantile(.1) + y2.quantile(0.1) y2.sum() .. _timedeltas.timedeltas_convert: @@ -238,8 +236,8 @@ Note that division by the NumPy scalar is true division, while astyping is equiv .. ipython:: python - december = pd.Series(pd.date_range('20121201', periods=4)) - january = pd.Series(pd.date_range('20130101', periods=4)) + december = pd.Series(pd.date_range("20121201", periods=4)) + january = pd.Series(pd.date_range("20130101", periods=4)) td = january - december td[2] += datetime.timedelta(minutes=5, seconds=3) @@ -247,15 +245,15 @@ Note that division by the NumPy scalar is true division, while astyping is equiv td # to days - td / np.timedelta64(1, 'D') - td.astype('timedelta64[D]') + td / np.timedelta64(1, "D") + td.astype("timedelta64[D]") # to seconds - td / np.timedelta64(1, 's') - td.astype('timedelta64[s]') + td / np.timedelta64(1, "s") + td.astype("timedelta64[s]") # to months (these are constant months) - td / np.timedelta64(1, 'M') + td / np.timedelta64(1, "M") Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series yields another ``timedelta64[ns]`` dtypes Series. @@ -309,7 +307,7 @@ You can access the value of the fields for a scalar ``Timedelta`` directly. .. ipython:: python - tds = pd.Timedelta('31 days 5 min 3 sec') + tds = pd.Timedelta("31 days 5 min 3 sec") tds.days tds.seconds (-tds).seconds @@ -329,9 +327,9 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the .. ipython:: python - pd.Timedelta(days=6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12).isoformat() + pd.Timedelta( + days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10, nanoseconds=12 + ).isoformat() .. _ISO 8601 Duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -348,15 +346,21 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss .. ipython:: python - pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2, 'D'), - datetime.timedelta(days=2, seconds=2)]) + pd.TimedeltaIndex( + [ + "1 days", + "1 days, 00:00:05", + np.timedelta64(2, "D"), + datetime.timedelta(days=2, seconds=2), + ] + ) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: .. 
ipython:: python - pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer') + pd.TimedeltaIndex(["0 days", "10 days", "20 days"], freq="infer") Generating ranges of time deltas ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -367,27 +371,25 @@ calendar day: .. ipython:: python - pd.timedelta_range(start='1 days', periods=5) + pd.timedelta_range(start="1 days", periods=5) Various combinations of ``start``, ``end``, and ``periods`` can be used with ``timedelta_range``: .. ipython:: python - pd.timedelta_range(start='1 days', end='5 days') + pd.timedelta_range(start="1 days", end="5 days") - pd.timedelta_range(end='10 days', periods=4) + pd.timedelta_range(end="10 days", periods=4) The ``freq`` parameter can be passed a variety of :ref:`frequency aliases `: .. ipython:: python - pd.timedelta_range(start='1 days', end='2 days', freq='30T') - - pd.timedelta_range(start='1 days', periods=5, freq='2D5H') + pd.timedelta_range(start="1 days", end="2 days", freq="30T") + pd.timedelta_range(start="1 days", periods=5, freq="2D5H") -.. versionadded:: 0.23.0 Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced timedeltas from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the resulting ``TimedeltaIndex``: .. ipython:: python - pd.timedelta_range('0 days', '4 days', periods=5) + pd.timedelta_range("0 days", "4 days", periods=5) - pd.timedelta_range('0 days', '4 days', periods=10) + pd.timedelta_range("0 days", "4 days", periods=10) Using the TimedeltaIndex ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -407,23 +409,25 @@ Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodI .. ipython:: python - s = pd.Series(np.arange(100), - index=pd.timedelta_range('1 days', periods=100, freq='h')) + s = pd.Series( + np.arange(100), + index=pd.timedelta_range("1 days", periods=100, freq="h"), + ) s Selections work similarly, with coercion on string-likes and slices: .. ipython:: python - s['1 day':'2 day'] - s['1 day 01:00:00'] - s[pd.Timedelta('1 day 1h')] + s["1 day":"2 day"] + s["1 day 01:00:00"] + s[pd.Timedelta("1 day 1h")] Furthermore, you can use partial string selection and the range will be inferred: .. ipython:: python - s['1 day':'1 day 5 hours'] + s["1 day":"1 day 5 hours"] Operations ~~~~~~~~~~ @@ -432,9 +436,9 @@ Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow cert .. ipython:: python - tdi = pd.TimedeltaIndex(['1 days', pd.NaT, '2 days']) + tdi = pd.TimedeltaIndex(["1 days", pd.NaT, "2 days"]) tdi.to_list() - dti = pd.date_range('20130101', periods=3) + dti = pd.date_range("20130101", periods=3) dti.to_list() (dti + tdi).to_list() (dti - tdi).to_list() @@ -446,22 +450,22 @@ Similarly to frequency conversion on a ``Series`` above, you can convert these i .. ipython:: python - tdi / np.timedelta64(1, 's') - tdi.astype('timedelta64[s]') + tdi / np.timedelta64(1, "s") + tdi.astype("timedelta64[s]") Scalar type ops work as well. These can potentially return a *different* type of index. .. 
ipython:: python # adding or timedelta and date -> datelike - tdi + pd.Timestamp('20130101') + tdi + pd.Timestamp("20130101") # subtraction of a date and a timedelta -> datelike # note that trying to subtract a date from a Timedelta will raise an exception - (pd.Timestamp('20130101') - tdi).to_list() + (pd.Timestamp("20130101") - tdi).to_list() # timedelta + timedelta -> timedelta - tdi + pd.Timedelta('10 days') + tdi + pd.Timedelta("10 days") # division can result in a Timedelta if the divisor is an integer tdi / 2 @@ -478,4 +482,4 @@ Similar to :ref:`timeseries resampling `, we can resample .. ipython:: python - s.resample('D').mean() + s.resample("D").mean() diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index a03ba6c775e68..354c510b843dd 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -19,42 +19,43 @@ Parsing time series information from various sources and formats import datetime - dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), - datetime.datetime(2018, 1, 1)]) + dti = pd.to_datetime( + ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] + ) dti Generate sequences of fixed-frequency dates and time spans .. ipython:: python - dti = pd.date_range('2018-01-01', periods=3, freq='H') + dti = pd.date_range("2018-01-01", periods=3, freq="H") dti Manipulating and converting date times with timezone information .. ipython:: python - dti = dti.tz_localize('UTC') + dti = dti.tz_localize("UTC") dti - dti.tz_convert('US/Pacific') + dti.tz_convert("US/Pacific") Resampling or converting a time series to a particular frequency .. ipython:: python - idx = pd.date_range('2018-01-01', periods=5, freq='H') + idx = pd.date_range("2018-01-01", periods=5, freq="H") ts = pd.Series(range(len(idx)), index=idx) ts - ts.resample('2H').mean() + ts.resample("2H").mean() Performing date and time arithmetic with absolute or relative time increments .. ipython:: python - friday = pd.Timestamp('2018-01-05') + friday = pd.Timestamp("2018-01-05") friday.day_name() # Add 1 day - saturday = friday + pd.Timedelta('1 day') + saturday = friday + pd.Timedelta("1 day") saturday.day_name() # Add 1 business day (Friday --> Monday) monday = friday + pd.offsets.BDay() @@ -90,13 +91,13 @@ so manipulations can be performed with respect to the time element. .. ipython:: python - pd.Series(range(3), index=pd.date_range('2000', freq='D', periods=3)) + pd.Series(range(3), index=pd.date_range("2000", freq="D", periods=3)) However, :class:`Series` and :class:`DataFrame` can directly also support the time component as data itself. .. ipython:: python - pd.Series(pd.date_range('2000', freq='D', periods=3)) + pd.Series(pd.date_range("2000", freq="D", periods=3)) :class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime``, ``timedelta`` and ``Period`` data when passed into those constructors. ``DateOffset`` @@ -104,9 +105,9 @@ data however will be stored as ``object`` data. .. ipython:: python - pd.Series(pd.period_range('1/1/2011', freq='M', periods=3)) + pd.Series(pd.period_range("1/1/2011", freq="M", periods=3)) pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) - pd.Series(pd.date_range('1/1/2011', freq='M', periods=3)) + pd.Series(pd.date_range("1/1/2011", freq="M", periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which is useful for representing missing or null date like values and behaves similar @@ -132,7 +133,7 @@ time. .. 
ipython:: python pd.Timestamp(datetime.datetime(2012, 5, 1)) - pd.Timestamp('2012-05-01') + pd.Timestamp("2012-05-01") pd.Timestamp(2012, 5, 1) However, in many cases it is more natural to associate things like change @@ -143,9 +144,9 @@ For example: .. ipython:: python - pd.Period('2011-01') + pd.Period("2011-01") - pd.Period('2012-05', freq='D') + pd.Period("2012-05", freq="D") :class:`Timestamp` and :class:`Period` can serve as an index. Lists of ``Timestamp`` and ``Period`` are automatically coerced to :class:`DatetimeIndex` @@ -153,9 +154,11 @@ and :class:`PeriodIndex` respectively. .. ipython:: python - dates = [pd.Timestamp('2012-05-01'), - pd.Timestamp('2012-05-02'), - pd.Timestamp('2012-05-03')] + dates = [ + pd.Timestamp("2012-05-01"), + pd.Timestamp("2012-05-02"), + pd.Timestamp("2012-05-03"), + ] ts = pd.Series(np.random.randn(3), dates) type(ts.index) @@ -163,7 +166,7 @@ and :class:`PeriodIndex` respectively. ts - periods = [pd.Period('2012-01'), pd.Period('2012-02'), pd.Period('2012-03')] + periods = [pd.Period("2012-01"), pd.Period("2012-02"), pd.Period("2012-03")] ts = pd.Series(np.random.randn(3), periods) @@ -193,18 +196,18 @@ is converted to a ``DatetimeIndex``: .. ipython:: python - pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])) + pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None])) - pd.to_datetime(['2005/11/23', '2010.12.31']) + pd.to_datetime(["2005/11/23", "2010.12.31"]) If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python - pd.to_datetime(['04-01-2012 10:00'], dayfirst=True) + pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) - pd.to_datetime(['14-01-2012', '01-14-2012'], dayfirst=True) + pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True) .. warning:: @@ -218,22 +221,22 @@ options like ``dayfirst`` or ``format``, so use ``to_datetime`` if these are req .. ipython:: python - pd.to_datetime('2010/11/12') + pd.to_datetime("2010/11/12") - pd.Timestamp('2010/11/12') + pd.Timestamp("2010/11/12") You can also use the ``DatetimeIndex`` constructor directly: .. ipython:: python - pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05']) + pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: .. ipython:: python - pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer') + pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], freq="infer") .. _timeseries.converting.format: @@ -245,9 +248,9 @@ This could also potentially speed up the conversion considerably. .. ipython:: python - pd.to_datetime('2010/11/12', format='%Y/%m/%d') + pd.to_datetime("2010/11/12", format="%Y/%m/%d") - pd.to_datetime('12-11-2010 00:00', format='%d-%m-%Y %H:%M') + pd.to_datetime("12-11-2010 00:00", format="%d-%m-%Y %H:%M") For more information on the choices available when specifying the ``format`` option, see the Python `datetime documentation`_. @@ -261,10 +264,9 @@ You can also pass a ``DataFrame`` of integer or string columns to assemble into .. ipython:: python - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [2, 3]}) + df = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]} + ) pd.to_datetime(df) @@ -272,7 +274,7 @@ You can pass only the columns that you need to assemble. .. 
ipython:: python - pd.to_datetime(df[['year', 'month', 'day']]) + pd.to_datetime(df[["year", "month", "day"]]) ``pd.to_datetime`` looks for standard designations of the datetime component in the column names, including: @@ -282,24 +284,24 @@ You can pass only the columns that you need to assemble. Invalid data ~~~~~~~~~~~~ -The default behavior, ``errors='raise'``, is to raise when unparseable: +The default behavior, ``errors='raise'``, is to raise when unparsable: .. code-block:: ipython In [2]: pd.to_datetime(['2009/07/31', 'asd'], errors='raise') ValueError: Unknown string format -Pass ``errors='ignore'`` to return the original input when unparseable: +Pass ``errors='ignore'`` to return the original input when unparsable: .. ipython:: python - pd.to_datetime(['2009/07/31', 'asd'], errors='ignore') + pd.to_datetime(["2009/07/31", "asd"], errors="ignore") -Pass ``errors='coerce'`` to convert unparseable data to ``NaT`` (not a time): +Pass ``errors='coerce'`` to convert unparsable data to ``NaT`` (not a time): .. ipython:: python - pd.to_datetime(['2009/07/31', 'asd'], errors='coerce') + pd.to_datetime(["2009/07/31", "asd"], errors="coerce") .. _timeseries.converting.epoch: @@ -315,11 +317,14 @@ which can be specified. These are computed from the starting point specified by .. ipython:: python - pd.to_datetime([1349720105, 1349806505, 1349892905, - 1349979305, 1350065705], unit='s') + pd.to_datetime( + [1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s" + ) - pd.to_datetime([1349720105100, 1349720105200, 1349720105300, - 1349720105400, 1349720105500], unit='ms') + pd.to_datetime( + [1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500], + unit="ms", + ) .. note:: @@ -327,17 +332,17 @@ which can be specified. These are computed from the starting point specified by that was discussed :ref:`above`). The available units are listed on the documentation for :func:`pandas.to_datetime`. +.. versionchanged:: 1.0.0 + Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp -with the ``tz`` argument specified will currently localize the epoch timestamps to UTC -first then convert the result to the specified time zone. However, this behavior -is :ref:`deprecated `, and if you have -epochs in wall time in another timezone, it is recommended to read the epochs +with the ``tz`` argument specified will raise a ValueError. If you have +epochs in wall time in another timezone, you can read the epochs as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python - pd.Timestamp(1262347200000000000).tz_localize('US/Pacific') - pd.DatetimeIndex([1262347200000000000]).tz_localize('US/Pacific') + pd.Timestamp(1262347200000000000).tz_localize("US/Pacific") + pd.DatetimeIndex([1262347200000000000]).tz_localize("US/Pacific") .. note:: @@ -353,8 +358,8 @@ as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python - pd.to_datetime([1490195805.433, 1490195805.433502912], unit='s') - pd.to_datetime(1490195805433502912, unit='ns') + pd.to_datetime([1490195805.433, 1490195805.433502912], unit="s") + pd.to_datetime(1490195805433502912, unit="ns") .. seealso:: @@ -369,7 +374,7 @@ To invert the operation from above, namely, to convert from a ``Timestamp`` to a .. 
ipython:: python - stamps = pd.date_range('2012-10-08 18:15:05', periods=4, freq='D') + stamps = pd.date_range("2012-10-08 18:15:05", periods=4, freq="D") stamps We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by the @@ -377,7 +382,7 @@ We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by .. ipython:: python - (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') + (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s") .. _timeseries.origin: @@ -389,14 +394,14 @@ of a ``DatetimeIndex``. For example, to use 1960-01-01 as the starting date: .. ipython:: python - pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) + pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) The default is set at ``origin='unix'``, which defaults to ``1970-01-01 00:00:00``. Commonly called 'unix epoch' or POSIX time. .. ipython:: python - pd.to_datetime([1, 2, 3], unit='D') + pd.to_datetime([1, 2, 3], unit="D") .. _timeseries.daterange: @@ -408,9 +413,11 @@ To generate an index with timestamps, you can use either the ``DatetimeIndex`` o .. ipython:: python - dates = [datetime.datetime(2012, 5, 1), - datetime.datetime(2012, 5, 2), - datetime.datetime(2012, 5, 3)] + dates = [ + datetime.datetime(2012, 5, 1), + datetime.datetime(2012, 5, 2), + datetime.datetime(2012, 5, 3), + ] # Note the frequency information index = pd.DatetimeIndex(dates) @@ -442,9 +449,9 @@ variety of :ref:`frequency aliases `: .. ipython:: python - pd.date_range(start, periods=1000, freq='M') + pd.date_range(start, periods=1000, freq="M") - pd.bdate_range(start, periods=250, freq='BQS') + pd.bdate_range(start, periods=250, freq="BQS") ``date_range`` and ``bdate_range`` make it easy to generate a range of dates using various combinations of parameters like ``start``, ``end``, ``periods``, @@ -453,25 +460,23 @@ of those specified will not be generated: .. ipython:: python - pd.date_range(start, end, freq='BM') + pd.date_range(start, end, freq="BM") - pd.date_range(start, end, freq='W') + pd.date_range(start, end, freq="W") pd.bdate_range(end=end, periods=20) pd.bdate_range(start=start, periods=20) -.. versionadded:: 0.23.0 - Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced dates from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the resulting ``DatetimeIndex``: .. ipython:: python - pd.date_range('2018-01-01', '2018-01-05', periods=5) + pd.date_range("2018-01-01", "2018-01-05", periods=5) - pd.date_range('2018-01-01', '2018-01-05', periods=10) + pd.date_range("2018-01-01", "2018-01-05", periods=10) .. _timeseries.custom-freq-ranges: @@ -484,13 +489,13 @@ used if a custom frequency string is passed. .. ipython:: python - weekmask = 'Mon Wed Fri' + weekmask = "Mon Wed Fri" holidays = [datetime.datetime(2011, 1, 5), datetime.datetime(2011, 3, 14)] - pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) + pd.bdate_range(start, end, freq="C", weekmask=weekmask, holidays=holidays) - pd.bdate_range(start, end, freq='CBMS', weekmask=weekmask) + pd.bdate_range(start, end, freq="CBMS", weekmask=weekmask) .. seealso:: @@ -547,7 +552,7 @@ intelligent functionality like selection, slicing, etc. .. 
ipython:: python - rng = pd.date_range(start, end, freq='BM') + rng = pd.date_range(start, end, freq="BM") ts = pd.Series(np.random.randn(len(rng)), index=rng) ts.index ts[:5].index @@ -562,71 +567,83 @@ Dates and strings that parse to timestamps can be passed as indexing parameters: .. ipython:: python - ts['1/31/2011'] + ts["1/31/2011"] ts[datetime.datetime(2011, 12, 25):] - ts['10/31/2011':'12/31/2011'] + ts["10/31/2011":"12/31/2011"] To provide convenience for accessing longer time series, you can also pass in the year or year and month as strings: .. ipython:: python - ts['2011'] + ts["2011"] - ts['2011-6'] + ts["2011-6"] This type of slicing will work on a ``DataFrame`` with a ``DatetimeIndex`` as well. Since the partial string selection is a form of label slicing, the endpoints **will be** included. This would include matching times on an included date: +.. warning:: + + Indexing ``DataFrame`` rows with a *single* string with getitem (e.g. ``frame[dtstring]``) + is deprecated starting with pandas 1.2.0 (given the ambiguity whether it is indexing + the rows or selecting a column) and will be removed in a future version. The equivalent + with ``.loc`` (e.g. ``frame.loc[dtstring]``) is still supported. + .. ipython:: python - dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], - index=pd.date_range('20130101', periods=100000, freq='T')) + dft = pd.DataFrame( + np.random.randn(100000, 1), + columns=["A"], + index=pd.date_range("20130101", periods=100000, freq="T"), + ) dft - dft['2013'] + dft.loc["2013"] This starts on the very first time in the month, and includes the last date and time for the month: .. ipython:: python - dft['2013-1':'2013-2'] + dft["2013-1":"2013-2"] This specifies a stop time **that includes all of the times on the last day**: .. ipython:: python - dft['2013-1':'2013-2-28'] + dft["2013-1":"2013-2-28"] This specifies an **exact** stop time (and is not the same as the above): .. ipython:: python - dft['2013-1':'2013-2-28 00:00:00'] + dft["2013-1":"2013-2-28 00:00:00"] We are stopping on the included end-point as it is part of the index: .. ipython:: python - dft['2013-1-15':'2013-1-15 12:30:00'] + dft["2013-1-15":"2013-1-15 12:30:00"] ``DatetimeIndex`` partial string indexing also works on a ``DataFrame`` with a ``MultiIndex``: .. ipython:: python - dft2 = pd.DataFrame(np.random.randn(20, 1), - columns=['A'], - index=pd.MultiIndex.from_product( - [pd.date_range('20130101', periods=10, freq='12H'), - ['a', 'b']])) + dft2 = pd.DataFrame( + np.random.randn(20, 1), + columns=["A"], + index=pd.MultiIndex.from_product( + [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ), + ) dft2 - dft2.loc['2013-01-05'] + dft2.loc["2013-01-05"] idx = pd.IndexSlice dft2 = dft2.swaplevel(0, 1).sort_index() - dft2.loc[idx[:, '2013-01-05'], :] + dft2.loc[idx[:, "2013-01-05"], :] .. versionadded:: 0.25.0 @@ -634,61 +651,64 @@ Slicing with string indexing also honors UTC offset. .. ipython:: python - df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df = pd.DataFrame([0], index=pd.DatetimeIndex(["2019-01-01"], tz="US/Pacific")) df - df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + df["2019-01-01 12:00:00+04:00":"2019-01-01 13:00:00+04:00"] .. _timeseries.slice_vs_exact_match: Slice vs. exact match ~~~~~~~~~~~~~~~~~~~~~ -.. versionchanged:: 0.20.0 - The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of the index. 
If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match. Consider a ``Series`` object with a minute resolution index: .. ipython:: python - series_minute = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12-31 23:59:00', - '2012-01-01 00:00:00', - '2012-01-01 00:02:00'])) + series_minute = pd.Series( + [1, 2, 3], + pd.DatetimeIndex( + ["2011-12-31 23:59:00", "2012-01-01 00:00:00", "2012-01-01 00:02:00"] + ), + ) series_minute.index.resolution A timestamp string less accurate than a minute gives a ``Series`` object. .. ipython:: python - series_minute['2011-12-31 23'] + series_minute["2011-12-31 23"] A timestamp string with minute resolution (or more accurate) gives a scalar instead, i.e. it is not cast to a slice. .. ipython:: python - series_minute['2011-12-31 23:59'] - series_minute['2011-12-31 23:59:00'] + series_minute["2011-12-31 23:59"] + series_minute["2011-12-31 23:59:00"] If index resolution is second, then the minute-accurate timestamp gives a ``Series``. .. ipython:: python - series_second = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12-31 23:59:59', - '2012-01-01 00:00:00', - '2012-01-01 00:00:01'])) + series_second = pd.Series( + [1, 2, 3], + pd.DatetimeIndex( + ["2011-12-31 23:59:59", "2012-01-01 00:00:00", "2012-01-01 00:00:01"] + ), + ) series_second.index.resolution - series_second['2011-12-31 23:59'] + series_second["2011-12-31 23:59"] -If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. +If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``.loc[]`` as well. .. ipython:: python - dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=series_minute.index) - dft_minute['2011-12-31 23'] + dft_minute = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index + ) + dft_minute.loc["2011-12-31 23"] .. warning:: @@ -699,16 +719,17 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python - dft_minute.loc['2011-12-31 23:59'] + dft_minute.loc["2011-12-31 23:59"] Note also that ``DatetimeIndex`` resolution cannot be less precise than day. .. ipython:: python - series_monthly = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12', '2012-01', '2012-02'])) + series_monthly = pd.Series( + [1, 2, 3], pd.DatetimeIndex(["2011-12", "2012-01", "2012-02"]) + ) series_monthly.index.resolution - series_monthly['2011-12'] # returns Series + series_monthly["2011-12"] # returns Series Exact indexing ~~~~~~~~~~~~~~ @@ -720,15 +741,17 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and .. ipython:: python - dft[datetime.datetime(2013, 1, 1):datetime.datetime(2013, 2, 28)] + dft[datetime.datetime(2013, 1, 1): datetime.datetime(2013, 2, 28)] With no defaults. .. ipython:: python - dft[datetime.datetime(2013, 1, 1, 10, 12, 0): - datetime.datetime(2013, 2, 28, 10, 12, 0)] - + dft[ + datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime( + 2013, 2, 28, 10, 12, 0 + ) + ] Truncating & fancy indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -740,11 +763,11 @@ partially matching dates:
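In its simplest form, ``truncate`` keeps only the rows between two cut points. A small sketch with synthetic daily data (the weekly example below then contrasts it with partial string slicing):

.. code-block:: python

    import numpy as np
    import pandas as pd

    ts = pd.Series(
        np.arange(10), index=pd.date_range("2011-01-01", periods=10, freq="D")
    )
    ts.truncate(before="2011-01-03", after="2011-01-06")  # keep the 3rd to the 6th

..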
ipython:: python - rng2 = pd.date_range('2011-01-01', '2012-01-01', freq='W') + rng2 = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2) - ts2.truncate(before='2011-11', after='2011-12') - ts2['2011-11':'2011-12'] + ts2.truncate(before="2011-11", after="2011-12") + ts2["2011-11":"2011-12"] Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency regularity will result in a ``DatetimeIndex``, although frequency is lost: @@ -776,9 +799,11 @@ There are several time/date properties that one can access from ``Timestamp`` or time,"Returns datetime.time (does not contain timezone information)" timetz,"Returns datetime.time as local time with timezone information" dayofyear,"The ordinal day of year" + day_of_year,"The ordinal day of year" weekofyear,"The week ordinal of the year" week,"The week ordinal of the year" dayofweek,"The number of the day of the week with Monday=0, Sunday=6" + day_of_week,"The number of the day of the week with Monday=0, Sunday=6" weekday,"The number of the day of the week with Monday=0, Sunday=6" quarter,"Quarter of the date: Jan-Mar = 1, Apr-Jun = 2, etc." days_in_month,"The number of days in the month of the datetime" @@ -800,7 +825,7 @@ You may obtain the year, week and day components of the ISO year from the ISO 86 .. ipython:: python - idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + idx = pd.date_range(start="2019-12-29", freq="D", periods=4) idx.isocalendar() idx.to_series().dt.isocalendar() @@ -830,12 +855,12 @@ arithmetic operator (``+``) or the ``apply`` method can be used to perform the s .. ipython:: python # This particular day contains a day light savings time transition - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts = pd.Timestamp("2016-10-30 00:00:00", tz="Europe/Helsinki") # Respects absolute time ts + pd.Timedelta(days=1) # Respects calendar time ts + pd.DateOffset(days=1) - friday = pd.Timestamp('2018-01-05') + friday = pd.Timestamp("2018-01-05") friday.day_name() # Add 2 business days (Friday --> Tuesday) two_business_days = 2 * pd.offsets.BDay() @@ -850,7 +875,7 @@ into ``freq`` keyword arguments. The available date offsets and associated frequ :header: "Date Offset", "Frequency String", "Description" :widths: 15, 15, 65 - :class:`~pandas.tseries.offsets.DateOffset`, None, "Generic offset class, defaults to 1 calendar day" + :class:`~pandas.tseries.offsets.DateOffset`, None, "Generic offset class, defaults to absolute 24 hours" :class:`~pandas.tseries.offsets.BDay` or :class:`~pandas.tseries.offsets.BusinessDay`, ``'B'``,"business day (weekday)" :class:`~pandas.tseries.offsets.CDay` or :class:`~pandas.tseries.offsets.CustomBusinessDay`, ``'C'``, "custom business day" :class:`~pandas.tseries.offsets.Week`, ``'W'``, "one week, optionally anchored on a day of the week" @@ -893,10 +918,10 @@ business offsets operate on the weekdays. .. ipython:: python - ts = pd.Timestamp('2018-01-06 00:00:00') + ts = pd.Timestamp("2018-01-06 00:00:00") ts.day_name() # BusinessHour's valid offset dates are Monday through Friday - offset = pd.offsets.BusinessHour(start='09:00') + offset = pd.offsets.BusinessHour(start="09:00") # Bring the date to the closest offset date (Monday) offset.rollforward(ts) # Date is brought to the closest offset date first and then the hour is added @@ -909,12 +934,12 @@ in the operation). .. 
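On its own, ``normalize`` simply resets the time component of a stamp to midnight (a one-line sketch); the examples that follow show how the normalization step interacts with offset arithmetic:

.. code-block:: python

    import pandas as pd

    pd.Timestamp("2014-01-01 22:00").normalize()  # Timestamp('2014-01-01 00:00:00')

..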
ipython:: python - ts = pd.Timestamp('2014-01-01 09:00') + ts = pd.Timestamp("2014-01-01 09:00") day = pd.offsets.Day() day.apply(ts) day.apply(ts).normalize() - ts = pd.Timestamp('2014-01-01 22:00') + ts = pd.Timestamp("2014-01-01 22:00") hour = pd.offsets.Hour() hour.apply(ts) hour.apply(ts).normalize() @@ -967,7 +992,7 @@ apply the offset to each element. .. ipython:: python - rng = pd.date_range('2012-01-01', '2012-01-03') + rng = pd.date_range("2012-01-01", "2012-01-03") s = pd.Series(rng) rng rng + pd.DateOffset(months=2) @@ -982,7 +1007,7 @@ used exactly like a ``Timedelta`` - see the .. ipython:: python s - pd.offsets.Day(2) - td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31')) + td = s - pd.Series(pd.date_range("2011-12-29", "2011-12-31")) td td + pd.offsets.Minute(15) @@ -1009,16 +1034,20 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i .. ipython:: python - weekmask_egypt = 'Sun Mon Tue Wed Thu' + weekmask_egypt = "Sun Mon Tue Wed Thu" # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', - datetime.datetime(2013, 5, 1), - np.datetime64('2014-05-01')] - bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, - weekmask=weekmask_egypt) + holidays = [ + "2012-05-01", + datetime.datetime(2013, 5, 1), + np.datetime64("2014-05-01"), + ] + bday_egypt = pd.offsets.CustomBusinessDay( + holidays=holidays, + weekmask=weekmask_egypt, + ) dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt @@ -1028,8 +1057,7 @@ Let's map to the weekday names: dts = pd.date_range(dt, periods=5, freq=bday_egypt) - pd.Series(dts.weekday, dts).map( - pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + pd.Series(dts.weekday, dts).map(pd.Series("Mon Tue Wed Thu Fri Sat Sun".split())) Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. @@ -1051,15 +1079,14 @@ in the usual way. .. ipython:: python - bmth_us = pd.offsets.CustomBusinessMonthBegin( - calendar=USFederalHolidayCalendar()) + bmth_us = pd.offsets.CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) # Skip new years dt = datetime.datetime(2013, 12, 17) dt + bmth_us # Define date index with custom offset - pd.date_range(start='20100101', end='20120101', freq=bmth_us) + pd.date_range(start="20100101", end="20120101", freq=bmth_us) .. note:: @@ -1090,23 +1117,23 @@ hours are added to the next business day. bh # 2014-08-01 is Friday - pd.Timestamp('2014-08-01 10:00').weekday() - pd.Timestamp('2014-08-01 10:00') + bh + pd.Timestamp("2014-08-01 10:00").weekday() + pd.Timestamp("2014-08-01 10:00") + bh # The example below is the same as: pd.Timestamp('2014-08-01 09:00') + bh - pd.Timestamp('2014-08-01 08:00') + bh + pd.Timestamp("2014-08-01 08:00") + bh # If the result is on the end time, move to the next business day - pd.Timestamp('2014-08-01 16:00') + bh + pd.Timestamp("2014-08-01 16:00") + bh # Remainders are added to the next day - pd.Timestamp('2014-08-01 16:30') + bh + pd.Timestamp("2014-08-01 16:30") + bh # Adding 2 business hours - pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(2) + pd.Timestamp("2014-08-01 10:00") + pd.offsets.BusinessHour(2) # Subtracting 3 business hours - pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(-3) + pd.Timestamp("2014-08-01 10:00") + pd.offsets.BusinessHour(-3) You can also specify ``start`` and ``end`` time by keywords. 
The argument must be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` @@ -1115,12 +1142,12 @@ results in ``ValueError``. .. ipython:: python - bh = pd.offsets.BusinessHour(start='11:00', end=datetime.time(20, 0)) + bh = pd.offsets.BusinessHour(start="11:00", end=datetime.time(20, 0)) bh - pd.Timestamp('2014-08-01 13:00') + bh - pd.Timestamp('2014-08-01 09:00') + bh - pd.Timestamp('2014-08-01 18:00') + bh + pd.Timestamp("2014-08-01 13:00") + bh + pd.Timestamp("2014-08-01 09:00") + bh + pd.Timestamp("2014-08-01 18:00") + bh Passing ``start`` time later than ``end`` represents midnight business hour. In this case, business hour exceeds midnight and overlaps into the next day. @@ -1128,19 +1155,19 @@ Valid business hours are distinguished by whether it started from valid ``Busine .. ipython:: python - bh = pd.offsets.BusinessHour(start='17:00', end='09:00') + bh = pd.offsets.BusinessHour(start="17:00", end="09:00") bh - pd.Timestamp('2014-08-01 17:00') + bh - pd.Timestamp('2014-08-01 23:00') + bh + pd.Timestamp("2014-08-01 17:00") + bh + pd.Timestamp("2014-08-01 23:00") + bh # Although 2014-08-02 is Saturday, # it is valid because it starts from 08-01 (Friday). - pd.Timestamp('2014-08-02 04:00') + bh + pd.Timestamp("2014-08-02 04:00") + bh # Although 2014-08-04 is Monday, # it is out of business hours because it starts from 08-03 (Sunday). - pd.Timestamp('2014-08-04 04:00') + bh + pd.Timestamp("2014-08-04 04:00") + bh Applying ``BusinessHour.rollforward`` and ``rollback`` to out of business hours results in the next business hour start or the previous day's end. Unlike other offsets, ``BusinessHour.rollforward`` @@ -1153,19 +1180,19 @@ under the default business hours (9:00 - 17:00), there is no gap (0 minutes) bet .. ipython:: python # This adjusts a Timestamp to business hour edge - pd.offsets.BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) - pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollback(pd.Timestamp("2014-08-02 15:00")) + pd.offsets.BusinessHour().rollforward(pd.Timestamp("2014-08-02 15:00")) # It is the same as BusinessHour().apply(pd.Timestamp('2014-08-01 17:00')). # And it is the same as BusinessHour().apply(pd.Timestamp('2014-08-04 09:00')) - pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().apply(pd.Timestamp("2014-08-02 15:00")) # BusinessDay results (for reference) - pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp("2014-08-02")) # It is the same as BusinessDay().apply(pd.Timestamp('2014-08-01')) # The result is the same as rollforward because BusinessDay never overlaps. - pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().apply(pd.Timestamp("2014-08-02")) ``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary holidays, you can use ``CustomBusinessHour`` offset, as explained in the @@ -1183,6 +1210,7 @@ as ``BusinessHour`` except that it skips specified custom holidays. .. ipython:: python from pandas.tseries.holiday import USFederalHolidayCalendar + bhour_us = pd.offsets.CustomBusinessHour(calendar=USFederalHolidayCalendar()) # Friday before MLK Day dt = datetime.datetime(2014, 1, 17, 15) @@ -1196,8 +1224,7 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB
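These keywords can also be combined with a holiday calendar, e.g. custom hours that skip US federal holidays (a sketch, reusing the calendar imported above; the weekmask variant follows):

.. code-block:: python

    import pandas as pd
    from pandas.tseries.holiday import USFederalHolidayCalendar

    # custom business hours of 08:00-16:00 that skip US federal holidays
    bhour_cal = pd.offsets.CustomBusinessHour(
        calendar=USFederalHolidayCalendar(), start="08:00", end="16:00"
    )
    # Friday 15:00 plus one business hour rolls over MLK Day (a Monday)
    pd.Timestamp("2014-01-17 15:00") + bhour_cal

..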
ipython:: python - bhour_mon = pd.offsets.CustomBusinessHour(start='10:00', - weekmask='Tue Wed Thu Fri') + bhour_mon = pd.offsets.CustomBusinessHour(start="10:00", weekmask="Tue Wed Thu Fri") # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 @@ -1250,7 +1277,7 @@ most functions: .. ipython:: python - pd.date_range(start, periods=5, freq='B') + pd.date_range(start, periods=5, freq="B") pd.date_range(start, periods=5, freq=pd.offsets.BDay()) @@ -1258,9 +1285,9 @@ You can combine together day and intraday offsets: .. ipython:: python - pd.date_range(start, periods=10, freq='2h20min') + pd.date_range(start, periods=10, freq="2h20min") - pd.date_range(start, periods=10, freq='1D10U') + pd.date_range(start, periods=10, freq="1D10U") Anchored offsets ~~~~~~~~~~~~~~~~ @@ -1319,39 +1346,39 @@ anchor point, and moved ``|n|-1`` additional steps forwards or backwards. .. ipython:: python - pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-02") + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-02') - pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-02") - pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-02") - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=4) - pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-02") - pd.offsets.MonthBegin(n=4) If the given date *is* on an anchor point, it is moved ``|n|`` points forwards or backwards. .. ipython:: python - pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-31") + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') - pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-31') - pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-01") - pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-31") - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=4) - pd.Timestamp('2014-01-31') - pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-31") - pd.offsets.MonthBegin(n=4) For the case when ``n=0``, the date is not moved if on an anchor point, otherwise it is rolled forward to the next anchor point. .. ipython:: python - pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=0) - pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=0) + pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=0) + pd.Timestamp("2014-01-02") + pd.offsets.MonthEnd(n=0) - pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=0) - pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=0) + pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=0) + pd.Timestamp("2014-01-31") + pd.offsets.MonthEnd(n=0) .. _timeseries.holiday: @@ -1387,14 +1414,27 @@ An example of how holidays and holiday calendars are defined: .. 
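A single rule can also be exercised on its own before composing a full calendar (a minimal sketch; the complete calendar definition follows):

.. code-block:: python

    from pandas.tseries.holiday import Holiday, nearest_workday

    # one standalone rule: July 4th, observed on the nearest weekday
    july4 = Holiday("July 4th", month=7, day=4, observance=nearest_workday)
    july4.dates("2015-01-01", "2016-12-31")  # observed dates in the window

..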
ipython:: python - from pandas.tseries.holiday import Holiday, USMemorialDay,\ - AbstractHolidayCalendar, nearest_workday, MO + from pandas.tseries.holiday import ( + Holiday, + USMemorialDay, + AbstractHolidayCalendar, + nearest_workday, + MO, + ) + + class ExampleCalendar(AbstractHolidayCalendar): rules = [ USMemorialDay, - Holiday('July 4th', month=7, day=4, observance=nearest_workday), - Holiday('Columbus Day', month=10, day=1, - offset=pd.DateOffset(weekday=MO(2)))] + Holiday("July 4th", month=7, day=4, observance=nearest_workday), + Holiday( + "Columbus Day", + month=10, + day=1, + offset=pd.DateOffset(weekday=MO(2)), + ), + ] + cal = ExampleCalendar() cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31)) @@ -1410,8 +1450,9 @@ or ``Timestamp`` objects. .. ipython:: python - pd.date_range(start='7/1/2012', end='7/10/2012', - freq=pd.offsets.CDay(calendar=cal)).to_pydatetime() + pd.date_range( + start="7/1/2012", end="7/10/2012", freq=pd.offsets.CDay(calendar=cal) + ).to_pydatetime() offset = pd.offsets.CustomBusinessDay(calendar=cal) datetime.datetime(2012, 5, 25) + offset datetime.datetime(2012, 7, 3) + offset @@ -1443,11 +1484,11 @@ or calendars with additional rules. .. ipython:: python - from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory,\ - USLaborDay - cal = get_calendar('ExampleCalendar') + from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory, USLaborDay + + cal = get_calendar("ExampleCalendar") cal.rules - new_cal = HolidayCalendarFactory('NewExampleCalendar', cal, USLaborDay) + new_cal = HolidayCalendarFactory("NewExampleCalendar", cal, USLaborDay) new_cal.rules .. _timeseries.advanced_datetime: @@ -1477,9 +1518,9 @@ rather than changing the alignment of the data and the index: .. ipython:: python - ts.shift(5, freq='D') + ts.shift(5, freq="D") ts.shift(5, freq=pd.offsets.BDay()) - ts.shift(5, freq='BM') + ts.shift(5, freq="BM") Note that when ``freq`` is specified, the leading entry is no longer NaN because the data is not being realigned. @@ -1494,7 +1535,7 @@ calls ``reindex``. .. ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * pd.offsets.BDay()) + dr = pd.date_range("1/1/2010", periods=3, freq=3 * pd.offsets.BDay()) ts = pd.Series(np.random.randn(3), index=dr) ts ts.asfreq(pd.offsets.BDay()) @@ -1504,7 +1545,7 @@ method for any gaps that may appear after the frequency conversion. .. ipython:: python - ts.asfreq(pd.offsets.BDay(), method='pad') + ts.asfreq(pd.offsets.BDay(), method="pad") Filling forward / backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1523,7 +1564,7 @@ Converting to Python datetimes Resampling ---------- -Pandas has a simple, powerful, and efficient functionality for performing +pandas has a simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. @@ -1535,21 +1576,16 @@ some advanced strategies. The ``resample()`` method can be used directly from ``DataFrameGroupBy`` objects, see the :ref:`groupby docs `. -.. note:: - - ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with - a time-based offset, see a discussion :ref:`here `. - Basics ~~~~~~ .. 
ipython:: python - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - ts.resample('5Min').sum() + ts.resample("5Min").sum() The ``resample`` function is very flexible and allows you to specify many different parameters to control the frequency conversion and resampling @@ -1561,11 +1597,11 @@ a method of the returned object, including ``sum``, ``mean``, ``std``, ``sem``, .. ipython:: python - ts.resample('5Min').mean() + ts.resample("5Min").mean() - ts.resample('5Min').ohlc() + ts.resample("5Min").ohlc() - ts.resample('5Min').max() + ts.resample("5Min").max() For downsampling, ``closed`` can be set to 'left' or 'right' to specify which @@ -1573,9 +1609,9 @@ end of the interval is closed: .. ipython:: python - ts.resample('5Min', closed='right').mean() + ts.resample("5Min", closed="right").mean() - ts.resample('5Min', closed='left').mean() + ts.resample("5Min", closed="left").mean() Parameters like ``label`` are used to manipulate the resulting labels. ``label`` specifies whether the result is labeled with the beginning or @@ -1583,9 +1619,9 @@ the end of the interval. .. ipython:: python - ts.resample('5Min').mean() # by default label='left' + ts.resample("5Min").mean() # by default label='left' - ts.resample('5Min', label='left').mean() + ts.resample("5Min", label="left").mean() .. warning:: @@ -1599,12 +1635,12 @@ the end of the interval. .. ipython:: python - s = pd.date_range('2000-01-01', '2000-01-05').to_series() + s = pd.date_range("2000-01-01", "2000-01-05").to_series() s.iloc[2] = pd.NaT s.dt.day_name() # default: label='left', closed='left' - s.resample('B').last().dt.day_name() + s.resample("B").last().dt.day_name() Notice how the value for Sunday got pulled back to the previous Friday. To get the behavior where the value for Sunday is pushed to Monday, use @@ -1612,7 +1648,7 @@ the end of the interval. .. ipython:: python - s.resample('B', label='right', closed='right').last().dt.day_name() + s.resample("B", label="right", closed="right").last().dt.day_name() The ``axis`` parameter can be set to 0 or 1 and allows you to resample the specified axis for a ``DataFrame``. @@ -1635,11 +1671,11 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to # from secondly to every 250 milliseconds - ts[:2].resample('250L').asfreq() + ts[:2].resample("250L").asfreq() - ts[:2].resample('250L').ffill() + ts[:2].resample("250L").ffill() - ts[:2].resample('250L').ffill(limit=2) + ts[:2].resample("250L").ffill(limit=2) Sparse resampling ~~~~~~~~~~~~~~~~~ @@ -1655,14 +1691,14 @@ resample only the groups that are not all ``NaN``. .. ipython:: python - rng = pd.date_range('2014-1-1', periods=100, freq='D') + pd.Timedelta('1s') + rng = pd.date_range("2014-1-1", periods=100, freq="D") + pd.Timedelta("1s") ts = pd.Series(range(100), index=rng) If we want to resample to the full range of the series: .. 
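Doing so produces a very large number of mostly empty bins. A rough sense of scale, as a sketch (the full-range resample itself is shown next):

.. code-block:: python

    import pandas as pd

    rng = pd.date_range("2014-01-01", periods=100, freq="D") + pd.Timedelta("1s")
    ts = pd.Series(range(100), index=rng)
    # ~99 days' worth of 3-minute bins for only 100 data points
    len(ts.resample("3T").sum())

..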
ipython:: python - ts.resample('3T').sum() + ts.resample("3T").sum() We can instead only resample those groups where we have points as follows: @@ -1671,44 +1707,48 @@ We can instead only resample those groups where we have points as follows: from functools import partial from pandas.tseries.frequencies import to_offset + def round(t, freq): # round a Timestamp to a specified freq freq = to_offset(freq) return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value) - ts.groupby(partial(round, freq='3T')).sum() + + ts.groupby(partial(round, freq="3T")).sum() .. _timeseries.aggregate: Aggregation ~~~~~~~~~~~ -Similar to the :ref:`aggregating API `, :ref:`groupby API `, and the :ref:`window functions API `, +Similar to the :ref:`aggregating API `, :ref:`groupby API `, and the :ref:`window API `, a ``Resampler`` can be selectively resampled. Resampling a ``DataFrame``, the default will be to act on all columns with the same function. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) - r = df.resample('3T') + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + r = df.resample("3T") r.mean() We can select a specific column or columns using standard getitem. .. ipython:: python - r['A'].mean() + r["A"].mean() - r[['A', 'B']].mean() + r[["A", "B"]].mean() You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: .. ipython:: python - r['A'].agg([np.sum, np.mean, np.std]) + r["A"].agg([np.sum, np.mean, np.std]) On a resampled ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -1723,21 +1763,20 @@ columns of a ``DataFrame``: .. ipython:: python :okexcept: - r.agg({'A': np.sum, - 'B': lambda x: np.std(x, ddof=1)}) + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: .. ipython:: python - r.agg({'A': 'sum', 'B': 'std'}) + r.agg({"A": "sum", "B": "std"}) Furthermore, you can also specify multiple aggregation functions for each column separately. .. ipython:: python - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) If a ``DataFrame`` does not have a datetimelike index, but instead you want @@ -1746,14 +1785,15 @@ to resample based on datetimelike column in the frame, it can passed to the .. ipython:: python - df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), - 'a': np.arange(5)}, - index=pd.MultiIndex.from_arrays([ - [1, 2, 3, 4, 5], - pd.date_range('2015-01-01', freq='W', periods=5)], - names=['v', 'd'])) + df = pd.DataFrame( + {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}, + index=pd.MultiIndex.from_arrays( + [[1, 2, 3, 4, 5], pd.date_range("2015-01-01", freq="W", periods=5)], + names=["v", "d"], + ), + ) df - df.resample('M', on='date').sum() + df.resample("M", on="date").sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1761,7 +1801,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample('M', level='d').sum() + df.resample("M", level="d").sum() .. 
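The same column-based monthly aggregation can also be spelled with ``pd.Grouper`` inside an ordinary ``groupby`` (a sketch on a simplified frame without the ``MultiIndex``):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}
    )
    # equivalent in spirit to df.resample("M", on="date").sum()
    df.groupby(pd.Grouper(key="date", freq="M")).sum()

..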
_timeseries.iterating-label: @@ -1775,14 +1815,18 @@ natural and functions similarly to :py:func:`itertools.groupby`: small = pd.Series( range(6), - index=pd.to_datetime(['2017-01-01T00:00:00', - '2017-01-01T00:30:00', - '2017-01-01T00:31:00', - '2017-01-01T01:00:00', - '2017-01-01T03:00:00', - '2017-01-01T03:05:00']) + index=pd.to_datetime( + [ + "2017-01-01T00:00:00", + "2017-01-01T00:30:00", + "2017-01-01T00:31:00", + "2017-01-01T01:00:00", + "2017-01-01T03:00:00", + "2017-01-01T03:05:00", + ] + ), ) - resampled = small.resample('H') + resampled = small.resample("H") for name, group in resampled: print("Group: ", name) @@ -1793,20 +1837,20 @@ See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. .. _timeseries.adjust-the-start-of-the-bins: -Use `origin` or `offset` to adjust the start of the bins -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``origin`` or ``offset`` to adjust the start of the bins +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 1.1.0 -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like ``30D``) or that divide a day evenly (like ``90s`` or ``1min``). This can create inconsistencies with some frequencies that do not meet these criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. For example: .. ipython:: python - start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - middle = '2000-10-02 00:00:00' - rng = pd.date_range(start, end, freq='7min') + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + middle = "2000-10-02 00:00:00" + rng = pd.date_range(start, end, freq="7min") ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts @@ -1814,32 +1858,32 @@ Here we can see that, when using ``origin`` with its default value (``'start_day .. ipython:: python - ts.resample('17min', origin='start_day').sum() - ts[middle:end].resample('17min', origin='start_day').sum() + ts.resample("17min", origin="start_day").sum() + ts[middle:end].resample("17min", origin="start_day").sum() Here we can see that, when setting ``origin`` to ``'epoch'``, the results after ``'2000-10-02 00:00:00'`` are identical regardless of the start of the time series: .. ipython:: python - ts.resample('17min', origin='epoch').sum() - ts[middle:end].resample('17min', origin='epoch').sum() + ts.resample("17min", origin="epoch").sum() + ts[middle:end].resample("17min", origin="epoch").sum() If needed you can use a custom timestamp for ``origin``: .. ipython:: python - ts.resample('17min', origin='2001-01-01').sum() - ts[middle:end].resample('17min', origin=pd.Timestamp('2001-01-01')).sum() + ts.resample("17min", origin="2001-01-01").sum() + ts[middle:end].resample("17min", origin=pd.Timestamp("2001-01-01")).sum() If needed you can just adjust the bins with an ``offset`` Timedelta that would be added to the default ``origin``. Those two examples are equivalent for this time series:
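The ``'23h30min'`` value in the second spelling is not arbitrary: it is the wall-clock time of the first observation, which is exactly what ``origin='start'`` uses implicitly (a small sketch):

.. code-block:: python

    import pandas as pd

    first = pd.Timestamp("2000-10-01 23:30:00")  # first point of the series above
    first - first.normalize()  # Timedelta('0 days 23:30:00'), the offset used below

..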
ipython:: python - ts.resample('17min', origin='start').sum() - ts.resample('17min', offset='23h30min').sum() + ts.resample("17min", origin="start").sum() + ts.resample("17min", offset="23h30min").sum() Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. @@ -1862,37 +1906,37 @@ Because ``freq`` represents a span of ``Period``, it cannot be negative like "-3 .. ipython:: python - pd.Period('2012', freq='A-DEC') + pd.Period("2012", freq="A-DEC") - pd.Period('2012-1-1', freq='D') + pd.Period("2012-1-1", freq="D") - pd.Period('2012-1-1 19:00', freq='H') + pd.Period("2012-1-1 19:00", freq="H") - pd.Period('2012-1-1 19:00', freq='5H') + pd.Period("2012-1-1 19:00", freq="5H") Adding and subtracting integers from periods shifts the period by its own frequency. Arithmetic is not allowed between ``Period`` with different ``freq`` (span). .. ipython:: python - p = pd.Period('2012', freq='A-DEC') + p = pd.Period("2012", freq="A-DEC") p + 1 p - 3 - p = pd.Period('2012-01', freq='2M') + p = pd.Period("2012-01", freq="2M") p + 2 p - 1 @okexcept - p == pd.Period('2012-01', freq='3M') + p == pd.Period("2012-01", freq="3M") If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised. .. ipython:: python - p = pd.Period('2014-07-01 09:00', freq='H') + p = pd.Period("2014-07-01 09:00", freq="H") p + pd.offsets.Hour(2) p + datetime.timedelta(minutes=120) - p + np.timedelta64(7200, 's') + p + np.timedelta64(7200, "s") .. code-block:: ipython @@ -1905,7 +1949,7 @@ If ``Period`` has other frequencies, only the same ``offsets`` can be added. Oth .. ipython:: python - p = pd.Period('2014-07', freq='M') + p = pd.Period("2014-07", freq="M") p + pd.offsets.MonthEnd(3) .. code-block:: ipython @@ -1920,7 +1964,7 @@ return the number of frequency units between them: .. ipython:: python - pd.Period('2012', freq='A-DEC') - pd.Period('2002', freq='A-DEC') + pd.Period("2012", freq="A-DEC") - pd.Period("2002", freq="A-DEC") PeriodIndex and period_range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1929,21 +1973,21 @@ which can be constructed using the ``period_range`` convenience function: .. ipython:: python - prng = pd.period_range('1/1/2011', '1/1/2012', freq='M') + prng = pd.period_range("1/1/2011", "1/1/2012", freq="M") prng The ``PeriodIndex`` constructor can also be used directly: .. ipython:: python - pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + pd.PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") Passing multiplied frequency outputs a sequence of ``Period`` which has multiplied span. .. ipython:: python - pd.period_range(start='2014-01', freq='3M', periods=4) + pd.period_range(start="2014-01", freq="3M", periods=4) If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor endpoints for a ``PeriodIndex`` with frequency matching that of the @@ -1951,8 +1995,9 @@ endpoints for a ``PeriodIndex`` with frequency matching that of the .. ipython:: python - pd.period_range(start=pd.Period('2017Q1', freq='Q'), - end=pd.Period('2017Q2', freq='Q'), freq='M') + pd.period_range( + start=pd.Period("2017Q1", freq="Q"), end=pd.Period("2017Q2", freq="Q"), freq="M" + ) Just like ``DatetimeIndex``, a ``PeriodIndex`` can also be used to index pandas objects: @@ -1966,11 +2011,11 @@ objects: .. 
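For instance (a minimal sketch with synthetic monthly data; offset arithmetic on a ``PeriodIndex`` follows):

.. code-block:: python

    import numpy as np
    import pandas as pd

    prng = pd.period_range("2011-01", "2011-06", freq="M")
    ps = pd.Series(np.random.randn(len(prng)), index=prng)
    ps["2011-03"]  # label-based selection, just as with a DatetimeIndex

..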
ipython:: python - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') + idx = pd.period_range("2014-07-01 09:00", periods=5, freq="H") idx idx + pd.offsets.Hour(2) - idx = pd.period_range('2014-07', periods=5, freq='M') + idx = pd.period_range("2014-07", periods=5, freq="M") idx idx + pd.offsets.MonthEnd(3) @@ -1989,7 +2034,7 @@ The ``period`` dtype holds the ``freq`` attribute and is represented with .. ipython:: python - pi = pd.period_range('2016-01-01', periods=3, freq='M') + pi = pd.period_range("2016-01-01", periods=3, freq="M") pi pi.dtype @@ -2000,15 +2045,15 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th .. ipython:: python # change monthly freq to daily freq - pi.astype('period[D]') + pi.astype("period[D]") # convert to DatetimeIndex - pi.astype('datetime64[ns]') + pi.astype("datetime64[ns]") # convert to PeriodIndex - dti = pd.date_range('2011-01-01', freq='M', periods=3) + dti = pd.date_range("2011-01-01", freq="M", periods=3) dti - dti.astype('period[M]') + dti.astype("period[M]") PeriodIndex partial string indexing @@ -2022,31 +2067,31 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI .. ipython:: python - ps['2011-01'] + ps["2011-01"] ps[datetime.datetime(2011, 12, 25):] - ps['10/31/2011':'12/31/2011'] + ps["10/31/2011":"12/31/2011"] Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. ipython:: python - ps['2011'] + ps["2011"] - dfp = pd.DataFrame(np.random.randn(600, 1), - columns=['A'], - index=pd.period_range('2013-01-01 9:00', - periods=600, - freq='T')) + dfp = pd.DataFrame( + np.random.randn(600, 1), + columns=["A"], + index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), + ) dfp - dfp['2013-01-01 10H'] + dfp.loc["2013-01-01 10H"] As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. .. ipython:: python - dfp['2013-01-01 10H':'2013-01-01 11H'] + dfp["2013-01-01 10H":"2013-01-01 11H"] Frequency conversion and resampling with PeriodIndex @@ -2056,7 +2101,7 @@ method. Let's start with the fiscal year 2011, ending in December: .. ipython:: python - p = pd.Period('2011', freq='A-DEC') + p = pd.Period("2011", freq="A-DEC") p We can convert it to a monthly frequency. Using the ``how`` parameter, we can @@ -2064,16 +2109,16 @@ specify whether to return the starting or ending month: .. ipython:: python - p.asfreq('M', how='start') + p.asfreq("M", how="start") - p.asfreq('M', how='end') + p.asfreq("M", how="end") The shorthands 's' and 'e' are provided for convenience: .. ipython:: python - p.asfreq('M', 's') - p.asfreq('M', 'e') + p.asfreq("M", "s") + p.asfreq("M", "e") Converting to a "super-period" (e.g., annual frequency is a super-period of quarterly frequency) automatically returns the super-period that includes the @@ -2081,9 +2126,9 @@ input period: .. ipython:: python - p = pd.Period('2011-12', freq='M') + p = pd.Period("2011-12", freq="M") - p.asfreq('A-NOV') + p.asfreq("A-NOV") Note that since we converted to an annual frequency that ends the year in November, the monthly period of December 2011 is actually in the 2012 A-NOV @@ -2102,21 +2147,21 @@ frequencies ``Q-JAN`` through ``Q-DEC``. .. ipython:: python - p = pd.Period('2012Q1', freq='Q-DEC') + p = pd.Period("2012Q1", freq="Q-DEC") - p.asfreq('D', 's') + p.asfreq("D", "s") - p.asfreq('D', 'e') + p.asfreq("D", "e") ``Q-MAR`` defines fiscal year end in March: .. 
ipython:: python - p = pd.Period('2011Q4', freq='Q-MAR') + p = pd.Period("2011Q4", freq="Q-MAR") - p.asfreq('D', 's') + p.asfreq("D", "s") - p.asfreq('D', 'e') + p.asfreq("D", "e") .. _timeseries.interchange: @@ -2128,7 +2173,7 @@ and vice-versa using ``to_timestamp``: .. ipython:: python - rng = pd.date_range('1/1/2012', periods=5, freq='M') + rng = pd.date_range("1/1/2012", periods=5, freq="M") ts = pd.Series(np.random.randn(len(rng)), index=rng) @@ -2145,7 +2190,7 @@ end of the period: .. ipython:: python - ps.to_timestamp('D', how='s') + ps.to_timestamp("D", how="s") Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly @@ -2154,11 +2199,11 @@ the quarter end: .. ipython:: python - prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') + prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV") ts = pd.Series(np.random.randn(len(prng)), prng) - ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9 ts.head() @@ -2172,7 +2217,7 @@ then you can use a ``PeriodIndex`` and/or ``Series`` of ``Periods`` to do comput .. ipython:: python - span = pd.period_range('1215-01-01', '1381-01-01', freq='D') + span = pd.period_range("1215-01-01", "1381-01-01", freq="D") span To convert from an ``int64`` based YYYYMMDD representation. @@ -2182,9 +2227,10 @@ To convert from an ``int64`` based YYYYMMDD representation. s = pd.Series([20121231, 20141130, 99991231]) s + def conv(x): - return pd.Period(year=x // 10000, month=x // 100 % 100, - day=x % 100, freq='D') + return pd.Period(year=x // 10000, month=x // 100 % 100, day=x % 100, freq="D") + s.apply(conv) s.apply(conv)[2] @@ -2202,7 +2248,7 @@ Time zone handling ------------------ pandas provides rich support for working with timestamps in different time -zones using the ``pytz`` and ``dateutil`` libraries or class:`datetime.timezone` +zones using the ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone` objects from the standard library. @@ -2213,7 +2259,7 @@ By default, pandas objects are time zone unaware: .. ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=15, freq='D') + rng = pd.date_range("3/6/2012 00:00", periods=15, freq="D") rng.tz is None To localize these dates to a time zone (assign a particular time zone to a naive date), @@ -2233,18 +2279,21 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string import dateutil # pytz - rng_pytz = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz='Europe/London') + rng_pytz = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz="Europe/London") rng_pytz.tz # dateutil - rng_dateutil = pd.date_range('3/6/2012 00:00', periods=3, freq='D') - rng_dateutil = rng_dateutil.tz_localize('dateutil/Europe/London') + rng_dateutil = pd.date_range("3/6/2012 00:00", periods=3, freq="D") + rng_dateutil = rng_dateutil.tz_localize("dateutil/Europe/London") rng_dateutil.tz # dateutil - utc special case - rng_utc = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=dateutil.tz.tzutc()) + rng_utc = pd.date_range( + "3/6/2012 00:00", + periods=3, + freq="D", + tz=dateutil.tz.tzutc(), + ) rng_utc.tz .. versionadded:: 0.25.0 @@ -2252,8 +2301,12 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string .. 
ipython:: python # datetime.timezone - rng_utc = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=datetime.timezone.utc) + rng_utc = pd.date_range( + "3/6/2012 00:00", + periods=3, + freq="D", + tz=datetime.timezone.utc, + ) rng_utc.tz Note that the ``UTC`` time zone is a special case in ``dateutil`` and should be constructed explicitly @@ -2265,15 +2318,14 @@ zones objects explicitly first. import pytz # pytz - tz_pytz = pytz.timezone('Europe/London') - rng_pytz = pd.date_range('3/6/2012 00:00', periods=3, freq='D') + tz_pytz = pytz.timezone("Europe/London") + rng_pytz = pd.date_range("3/6/2012 00:00", periods=3, freq="D") rng_pytz = rng_pytz.tz_localize(tz_pytz) rng_pytz.tz == tz_pytz # dateutil - tz_dateutil = dateutil.tz.gettz('Europe/London') - rng_dateutil = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=tz_dateutil) + tz_dateutil = dateutil.tz.gettz("Europe/London") + rng_dateutil = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=tz_dateutil) rng_dateutil.tz == tz_dateutil To convert a time zone aware pandas object from one time zone to another, @@ -2281,7 +2333,7 @@ you can use the ``tz_convert`` method. .. ipython:: python - rng_pytz.tz_convert('US/Eastern') + rng_pytz.tz_convert("US/Eastern") .. note:: @@ -2293,9 +2345,9 @@ you can use the ``tz_convert`` method. .. ipython:: python - dti = pd.date_range('2019-01-01', periods=3, freq='D', tz='US/Pacific') + dti = pd.date_range("2019-01-01", periods=3, freq="D", tz="US/Pacific") dti.tz - ts = pd.Timestamp('2019-01-01', tz='US/Pacific') + ts = pd.Timestamp("2019-01-01", tz="US/Pacific") ts.tz .. warning:: @@ -2319,23 +2371,28 @@ you can use the ``tz_convert`` method. Instead, the datetime needs to be localized using the ``localize`` method on the ``pytz`` time zone object. +.. warning:: + + Be aware that for times in the future, correct conversion between time zones + (and UTC) cannot be guaranteed by any time zone library because a timezone's + offset from UTC may be changed by the respective government. + .. warning:: If you are using dates beyond 2038-01-18, due to current deficiencies in the underlying libraries caused by the year 2038 problem, daylight saving time (DST) adjustments to timezone aware dates will not be applied. If and when the underlying libraries are fixed, - the DST transitions will be applied. It should be noted though, that time zone data for far future time zones - are likely to be inaccurate, as they are simple extrapolations of the current set of (regularly revised) rules. + the DST transitions will be applied. For example, for two dates that are in British Summer Time (and so would normally be GMT+1), both the following asserts evaluate as true: .. ipython:: python - d_2037 = '2037-03-31T010101' - d_2038 = '2038-03-31T010101' - DST = 'Europe/London' - assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz='GMT') - assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz='GMT') + d_2037 = "2037-03-31T010101" + d_2038 = "2038-03-31T010101" + DST = "Europe/London" + assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT") + assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT") Under the hood, all timestamps are stored in UTC. Values from a time zone aware :class:`DatetimeIndex` or :class:`Timestamp` will have their fields (day, hour, minute, etc.) @@ -2344,8 +2401,8 @@ still considered to be equal even if they are in different time zones: .. 
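A scalar sketch of the same point, before the index-level examples:

.. code-block:: python

    import pandas as pd

    t_utc = pd.Timestamp("2013-01-01 12:00", tz="UTC")
    t_nyc = t_utc.tz_convert("US/Eastern")
    t_utc == t_nyc  # True: the same instant, shown with different wall clocks

..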
ipython:: python - rng_eastern = rng_utc.tz_convert('US/Eastern') - rng_berlin = rng_utc.tz_convert('Europe/Berlin') + rng_eastern = rng_utc.tz_convert("US/Eastern") + rng_berlin = rng_utc.tz_convert("Europe/Berlin") rng_eastern[2] rng_berlin[2] @@ -2356,9 +2413,9 @@ Operations between :class:`Series` in different time zones will yield UTC .. ipython:: python - ts_utc = pd.Series(range(3), pd.date_range('20130101', periods=3, tz='UTC')) - eastern = ts_utc.tz_convert('US/Eastern') - berlin = ts_utc.tz_convert('Europe/Berlin') + ts_utc = pd.Series(range(3), pd.date_range("20130101", periods=3, tz="UTC")) + eastern = ts_utc.tz_convert("US/Eastern") + berlin = ts_utc.tz_convert("Europe/Berlin") result = eastern + berlin result result.index @@ -2369,14 +2426,13 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None) .. ipython:: python - didx = pd.date_range(start='2014-08-01 09:00', freq='H', - periods=3, tz='US/Eastern') + didx = pd.date_range(start="2014-08-01 09:00", freq="H", periods=3, tz="US/Eastern") didx didx.tz_localize(None) didx.tz_convert(None) # tz_convert(None) is identical to tz_convert('UTC').tz_localize(None) - didx.tz_convert('UTC').tz_localize(None) + didx.tz_convert("UTC").tz_localize(None) .. _timeseries.fold: @@ -2402,10 +2458,20 @@ control over how they are handled. .. ipython:: python - pd.Timestamp(datetime.datetime(2019, 10, 27, 1, 30, 0, 0), - tz='dateutil/Europe/London', fold=0) - pd.Timestamp(year=2019, month=10, day=27, hour=1, minute=30, - tz='dateutil/Europe/London', fold=1) + pd.Timestamp( + datetime.datetime(2019, 10, 27, 1, 30, 0, 0), + tz="dateutil/Europe/London", + fold=0, + ) + pd.Timestamp( + year=2019, + month=10, + day=27, + hour=1, + minute=30, + tz="dateutil/Europe/London", + fold=1, + ) .. _timeseries.timezone_ambiguous: @@ -2423,8 +2489,9 @@ twice within one day ("clocks fall back"). The following options are available: .. ipython:: python - rng_hourly = pd.DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', - '11/06/2011 01:00', '11/06/2011 02:00']) + rng_hourly = pd.DatetimeIndex( + ["11/06/2011 00:00", "11/06/2011 01:00", "11/06/2011 01:00", "11/06/2011 02:00"] + ) This will fail as there are ambiguous times (``'11/06/2011 01:00'``) @@ -2437,9 +2504,9 @@ Handle these ambiguous times by specifying the following. .. ipython:: python - rng_hourly.tz_localize('US/Eastern', ambiguous='infer') - rng_hourly.tz_localize('US/Eastern', ambiguous='NaT') - rng_hourly.tz_localize('US/Eastern', ambiguous=[True, True, False, False]) + rng_hourly.tz_localize("US/Eastern", ambiguous="infer") + rng_hourly.tz_localize("US/Eastern", ambiguous="NaT") + rng_hourly.tz_localize("US/Eastern", ambiguous=[True, True, False, False]) .. _timeseries.timezone_nonexistent: @@ -2458,7 +2525,7 @@ can be controlled by the ``nonexistent`` argument. The following options are ava .. ipython:: python - dti = pd.date_range(start='2015-03-29 02:30:00', periods=3, freq='H') + dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="H") # 2:30 is a nonexistent time Localization of nonexistent times will raise an error by default. @@ -2473,10 +2540,10 @@ Transform nonexistent times to ``NaT`` or shift the times. .. 
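The scalar case behaves the same way (a sketch; the ``DatetimeIndex`` variants follow):

.. code-block:: python

    import pandas as pd

    # 02:30 on this date does not exist in Europe/Warsaw (spring DST jump)
    ts = pd.Timestamp("2015-03-29 02:30:00")
    ts.tz_localize("Europe/Warsaw", nonexistent="shift_forward")  # lands at 03:00

..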
ipython:: python dti - dti.tz_localize('Europe/Warsaw', nonexistent='shift_forward') - dti.tz_localize('Europe/Warsaw', nonexistent='shift_backward') - dti.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta(1, unit='H')) - dti.tz_localize('Europe/Warsaw', nonexistent='NaT') + dti.tz_localize("Europe/Warsaw", nonexistent="shift_forward") + dti.tz_localize("Europe/Warsaw", nonexistent="shift_backward") + dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="H")) + dti.tz_localize("Europe/Warsaw", nonexistent="NaT") .. _timeseries.timezone_series: @@ -2489,7 +2556,7 @@ represented with a dtype of ``datetime64[ns]``. .. ipython:: python - s_naive = pd.Series(pd.date_range('20130101', periods=3)) + s_naive = pd.Series(pd.date_range("20130101", periods=3)) s_naive A :class:`Series` with a time zone **aware** values is @@ -2497,7 +2564,7 @@ represented with a dtype of ``datetime64[ns, tz]`` where ``tz`` is the time zone .. ipython:: python - s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) + s_aware = pd.Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) s_aware Both of these :class:`Series` time zone information @@ -2507,7 +2574,7 @@ For example, to localize and convert a naive stamp to time zone aware. .. ipython:: python - s_naive.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + s_naive.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") Time zone information can also be manipulated using the ``astype`` method. This method can localize and convert time zone naive timestamps or @@ -2516,13 +2583,13 @@ convert time zone aware timestamps. .. ipython:: python # localize and convert a naive time zone - s_naive.astype('datetime64[ns, US/Eastern]') + s_naive.astype("datetime64[ns, US/Eastern]") # make an aware tz naive - s_aware.astype('datetime64[ns]') + s_aware.astype("datetime64[ns]") # convert to a new time zone - s_aware.astype('datetime64[ns, CET]') + s_aware.astype("datetime64[ns, CET]") .. note:: @@ -2548,4 +2615,4 @@ convert time zone aware timestamps. .. ipython:: python - s_aware.to_numpy(dtype='datetime64[ns]') + s_aware.to_numpy(dtype="datetime64[ns]") diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 27826e7cde9e1..c4ee8677a6b0d 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -11,7 +11,8 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") We provide the basics in pandas to easily create decent looking plots. See the :ref:`ecosystem ` section for visualization @@ -39,12 +40,11 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png - ts.plot() + ts.plot(); If the index consists of dates, it calls :meth:`gcf().autofmt_xdate() ` to try to format the x-axis nicely as per above. @@ -54,36 +54,35 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. 
ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list("ABCD")) df = df.cumsum() plt.figure(); @savefig frame_plot_basic.png df.plot(); -You can plot one column versus another using the `x` and `y` keywords in +You can plot one column versus another using the ``x`` and ``y`` keywords in :meth:`~DataFrame.plot`: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() - df3['A'] = pd.Series(list(range(len(df)))) + df3 = pd.DataFrame(np.random.randn(1000, 2), columns=["B", "C"]).cumsum() + df3["A"] = pd.Series(list(range(len(df)))) @savefig df_plot_xy.png - df3.plot(x='A', y='B') + df3.plot(x="A", y="B"); .. note:: @@ -93,7 +92,7 @@ You can plot one column versus another using the `x` and `y` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.other: @@ -120,7 +119,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot(kind='bar'); + df.iloc[5].plot(kind="bar"); You can also create these other plots using the methods ``DataFrame.plot.`` instead of providing the ``kind`` keyword argument. This makes it easier to discover plot methods and the specific arguments they use: @@ -163,8 +162,8 @@ For labeled, non-time series data, you may wish to produce a bar plot: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot.bar() - plt.axhline(0, color='k'); + df.iloc[5].plot.bar(); + plt.axhline(0, color="k"); Calling a DataFrame's :meth:`plot.bar() ` method produces a multiple bar plot: @@ -172,13 +171,13 @@ bar plot: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig bar_plot_multi_ex.png df2.plot.bar(); @@ -188,7 +187,7 @@ To produce a stacked bar plot, pass ``stacked=True``: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @@ -201,7 +200,7 @@ To get horizontal bar plots, use the ``barh`` method: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @@ -218,19 +217,25 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. ipython:: python - df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), - 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame( + { + "a": np.random.randn(1000) + 1, + "b": np.random.randn(1000), + "c": np.random.randn(1000) - 1, + }, + columns=["a", "b", "c"], + ) plt.figure(); @savefig hist_new.png - df4.plot.hist(alpha=0.5) + df4.plot.hist(alpha=0.5); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") A histogram can be stacked using ``stacked=True``. Bin size can be changed using the ``bins`` keyword. @@ -240,12 +245,12 @@ using the ``bins`` keyword. plt.figure(); @savefig hist_new_stacked.png - df4.plot.hist(stacked=True, bins=20) + df4.plot.hist(stacked=True, bins=20); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib ``hist``. 
For example, horizontal and cumulative histograms can be drawn by @@ -256,12 +261,12 @@ horizontal and cumulative histograms can be drawn by plt.figure(); @savefig hist_new_kwargs.png - df4['a'].plot.hist(orientation='horizontal', cumulative=True) + df4["a"].plot.hist(orientation="horizontal", cumulative=True); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hist ` method and the `matplotlib hist documentation `__ for more. @@ -274,22 +279,22 @@ The existing interface ``DataFrame.hist`` to plot histogram still can be used. plt.figure(); @savefig hist_plot_ex.png - df['A'].diff().hist() + df["A"].diff().hist(); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") :meth:`DataFrame.hist` plots the histograms of the columns on multiple subplots: .. ipython:: python - plt.figure() + plt.figure(); @savefig frame_hist_ex.png - df.diff().hist(color='k', alpha=0.5, bins=50) + df.diff().hist(color="k", alpha=0.5, bins=50); The ``by`` keyword can be specified to plot grouped histograms: @@ -297,7 +302,7 @@ The ``by`` keyword can be specified to plot grouped histograms: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) @@ -306,7 +311,7 @@ The ``by`` keyword can be specified to plot grouped histograms: data = pd.Series(np.random.randn(1000)) @savefig grouped_hist.png - data.hist(by=np.random.randint(0, 4, 1000), figsize=(6, 4)) + data.hist(by=np.random.randint(0, 4, 1000), figsize=(6, 4)); .. _visualization.box: @@ -323,15 +328,15 @@ a uniform random variable on [0,1). .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + df = pd.DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) @savefig box_plot_new.png - df.plot.box() + df.plot.box(); Boxplot can be colorized by passing ``color`` keyword. You can pass a ``dict`` whose keys are ``boxes``, ``whiskers``, ``medians`` and ``caps``. @@ -348,16 +353,20 @@ more complicated colorization, you can get each drawn artists by passing .. ipython:: python - color = {'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', - 'medians': 'DarkBlue', 'caps': 'Gray'} + color = { + "boxes": "DarkGreen", + "whiskers": "DarkOrange", + "medians": "DarkBlue", + "caps": "Gray", + } @savefig box_new_colorize.png - df.plot.box(color=color, sym='r+') + df.plot.box(color=color, sym="r+"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Also, you can pass other keywords supported by matplotlib ``boxplot``. For example, horizontal and custom-positioned boxplot can be drawn by @@ -366,7 +375,7 @@ For example, horizontal and custom-positioned boxplot can be drawn by .. ipython:: python @savefig box_new_kwargs.png - df.plot.box(vert=False, positions=[1, 4, 5, 6, 8]) + df.plot.box(vert=False, positions=[1, 4, 5, 6, 8]); See the :meth:`boxplot ` method and the @@ -378,7 +387,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python @@ -396,19 +405,19 @@ groupings. For instance, .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. 
ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df = pd.DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) plt.figure(); @savefig box_plot_ex2.png - bp = df.boxplot(by='X') + bp = df.boxplot(by="X") You can also pass a subset of columns to plot, as well as group by multiple columns: @@ -416,25 +425,25 @@ columns: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']) + df = pd.DataFrame(np.random.rand(10, 3), columns=["Col1", "Col2", "Col3"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"]) plt.figure(); @savefig box_plot_ex3.png - bp = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + bp = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"]) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.box.return: @@ -462,16 +471,16 @@ keyword, will affect the output type as well: np.random.seed(1234) df_box = pd.DataFrame(np.random.randn(50, 2)) - df_box['g'] = np.random.choice(['A', 'B'], size=50) - df_box.loc[df_box['g'] == 'B', 1] += 3 + df_box["g"] = np.random.choice(["A", "B"], size=50) + df_box.loc[df_box["g"] == "B", 1] += 3 @savefig boxplot_groupby.png - bp = df_box.boxplot(by='g') + bp = df_box.boxplot(by="g") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The subplots above are split by the numeric columns first, then the value of the ``g`` column. Below the subplots are first split by the value of ``g``, @@ -481,12 +490,12 @@ then by the numeric columns. :okwarning: @savefig groupby_boxplot_vis.png - bp = df_box.groupby('g').boxplot() + bp = df_box.groupby("g").boxplot() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.area_plot: @@ -496,7 +505,7 @@ Area plot You can create area plots with :meth:`Series.plot.area` and :meth:`DataFrame.plot.area`. Area plots are stacked by default. To produce stacked area plot, each column must be either all positive or all negative values. -When input data contains `NaN`, it will be automatically filled by 0. If you want to drop or fill by different values, use :func:`dataframe.dropna` or :func:`dataframe.fillna` before calling `plot`. +When input data contains ``NaN``, it will be automatically filled by 0. If you want to drop or fill by different values, use :func:`dataframe.dropna` or :func:`dataframe.fillna` before calling ``plot``. .. ipython:: python :suppress: @@ -506,7 +515,7 @@ When input data contains `NaN`, it will be automatically filled by 0. If you wan .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig area_plot_stacked.png df.plot.area(); @@ -516,7 +525,7 @@ To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @@ -537,29 +546,29 @@ These can be specified by the ``x`` and ``y`` keywords. 
:suppress: np.random.seed(123456) - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python - df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"]) @savefig scatter_plot.png - df.plot.scatter(x='a', y='b'); + df.plot.scatter(x="a", y="b"); To plot multiple column groups in a single axes, repeat ``plot`` method specifying target ``ax``. It is recommended to specify ``color`` and ``label`` keywords to distinguish each groups. .. ipython:: python - ax = df.plot.scatter(x='a', y='b', color='DarkBlue', label='Group 1'); + ax = df.plot.scatter(x="a", y="b", color="DarkBlue", label="Group 1") @savefig scatter_plot_repeated.png - df.plot.scatter(x='c', y='d', color='DarkGreen', label='Group 2', ax=ax); + df.plot.scatter(x="c", y="d", color="DarkGreen", label="Group 2", ax=ax); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The keyword ``c`` may be given as the name of a column to provide colors for each point: @@ -567,13 +576,13 @@ each point: .. ipython:: python @savefig scatter_plot_colored.png - df.plot.scatter(x='a', y='b', c='c', s=50); + df.plot.scatter(x="a", y="b", c="c", s=50); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib :meth:`scatter `. The example below shows a @@ -582,12 +591,12 @@ bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @savefig scatter_plot_bubble.png - df.plot.scatter(x='a', y='b', s=df['c'] * 200); + df.plot.scatter(x="a", y="b", s=df["c"] * 200); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`scatter ` method and the `matplotlib scatter documentation `__ for more. @@ -609,11 +618,11 @@ too dense to plot each point individually. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] + np.arange(1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] + np.arange(1000) @savefig hexbin_plot.png - df.plot.hexbin(x='a', y='b', gridsize=25) + df.plot.hexbin(x="a", y="b", gridsize=25); A useful keyword argument is ``gridsize``; it controls the number of hexagons @@ -631,23 +640,23 @@ given by column ``z``. The bins are aggregated with NumPy's ``max`` function. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] = df['b'] + np.arange(1000) - df['z'] = np.random.uniform(0, 3, 1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] = df["b"] + np.arange(1000) + df["z"] = np.random.uniform(0, 3, 1000) @savefig hexbin_plot_agg.png - df.plot.hexbin(x='a', y='b', C='z', reduce_C_function=np.max, gridsize=25) + df.plot.hexbin(x="a", y="b", C="z", reduce_C_function=np.max, gridsize=25); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hexbin ` method and the `matplotlib hexbin documentation `__ for more. @@ -668,17 +677,17 @@ A ``ValueError`` will be raised if there are any negative values in your data. plt.figure() .. 
ipython:: python + :okwarning: - series = pd.Series(3 * np.random.rand(4), - index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), index=["a", "b", "c", "d"], name="series") @savefig series_pie_plot.png - series.plot.pie(figsize=(6, 6)) + series.plot.pie(figsize=(6, 6)); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") For pie plots it's best to use square figures, i.e. a figure aspect ratio 1. You can create the figure with equal width and height, or force the aspect ratio @@ -699,16 +708,17 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python - df = pd.DataFrame(3 * np.random.rand(4, 2), - index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame( + 3 * np.random.rand(4, 2), index=["a", "b", "c", "d"], columns=["x", "y"] + ) @savefig df_pie_plot.png - df.plot.pie(subplots=True, figsize=(8, 4)) + df.plot.pie(subplots=True, figsize=(8, 4)); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can use the ``labels`` and ``colors`` keywords to specify the labels and colors of each wedge. @@ -730,37 +740,43 @@ Also, other keywords supported by :func:`matplotlib.pyplot.pie` can be used. .. ipython:: python @savefig series_pie_plot_options.png - series.plot.pie(labels=['AA', 'BB', 'CC', 'DD'], colors=['r', 'g', 'b', 'c'], - autopct='%.2f', fontsize=20, figsize=(6, 6)) + series.plot.pie( + labels=["AA", "BB", "CC", "DD"], + colors=["r", "g", "b", "c"], + autopct="%.2f", + fontsize=20, + figsize=(6, 6), + ); If you pass values whose sum total is less than 1.0, matplotlib draws a semicircle. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python + :okwarning: - series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + series = pd.Series([0.1] * 4, index=["a", "b", "c", "d"], name="series2") @savefig series_pie_plot_semi.png - series.plot.pie(figsize=(6, 6)) + series.plot.pie(figsize=(6, 6)); See the `matplotlib pie documentation `__ for more. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.missing_data: Plotting with missing data -------------------------- -Pandas tries to be pragmatic about plotting ``DataFrames`` or ``Series`` +pandas tries to be pragmatic about plotting ``DataFrames`` or ``Series`` that contain missing data. Missing values are dropped, left out, or filled depending on the plot type. @@ -817,15 +833,16 @@ You can create a scatter plot matrix using the .. ipython:: python from pandas.plotting import scatter_matrix - df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) + + df = pd.DataFrame(np.random.randn(1000, 4), columns=["a", "b", "c", "d"]) @savefig scatter_matrix_kde.png - scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde'); + scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal="kde"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.kde: @@ -845,12 +862,12 @@ You can create density plots using the :meth:`Series.plot.kde` and :meth:`DataFr ser = pd.Series(np.random.randn(1000)) @savefig kde_plot.png - ser.plot.kde() + ser.plot.kde(); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.andrews_curves: @@ -870,12 +887,12 @@ of the same class will usually be closer together and form larger structures. 
from pandas.plotting import andrews_curves - data = pd.read_csv('data/iris.data') + data = pd.read_csv("data/iris.data") - plt.figure() + plt.figure(); @savefig andrews_curves.png - andrews_curves(data, 'Name') + andrews_curves(data, "Name"); .. _visualization.parallel_coordinates: @@ -894,17 +911,17 @@ represents one data point. Points that tend to cluster will appear closer togeth from pandas.plotting import parallel_coordinates - data = pd.read_csv('data/iris.data') + data = pd.read_csv("data/iris.data") - plt.figure() + plt.figure(); @savefig parallel_coordinates.png - parallel_coordinates(data, 'Name') + parallel_coordinates(data, "Name"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.lag: @@ -926,18 +943,18 @@ be passed, and when ``lag=1`` the plot is essentially ``data[:-1]`` vs. from pandas.plotting import lag_plot - plt.figure() + plt.figure(); spacing = np.linspace(-99 * np.pi, 99 * np.pi, num=1000) data = pd.Series(0.1 * np.random.rand(1000) + 0.9 * np.sin(spacing)) @savefig lag_plot.png - lag_plot(data) + lag_plot(data); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.autocorrelation: @@ -963,18 +980,18 @@ autocorrelation plots. from pandas.plotting import autocorrelation_plot - plt.figure() + plt.figure(); spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000) data = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing)) @savefig autocorrelation_plot.png - autocorrelation_plot(data) + autocorrelation_plot(data); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.bootstrap: @@ -999,12 +1016,12 @@ are what constitutes the bootstrap plot. data = pd.Series(np.random.rand(1000)) @savefig bootstrap_plot.png - bootstrap_plot(data, size=50, samples=500, color='grey') + bootstrap_plot(data, size=50, samples=500, color="grey"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.radviz: @@ -1030,17 +1047,17 @@ for more information. from pandas.plotting import radviz - data = pd.read_csv('data/iris.data') + data = pd.read_csv("data/iris.data") - plt.figure() + plt.figure(); @savefig radviz.png - radviz(data, 'Name') + radviz(data, "Name"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.formatting: @@ -1069,14 +1086,14 @@ layout and formatting of the returned plot: plt.figure(); @savefig series_plot_basic2.png - ts.plot(style='k--', label='Series'); + ts.plot(style="k--", label="Series"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") -For each kind of plot (e.g. `line`, `bar`, `scatter`) any additional arguments +For each kind of plot (e.g. ``line``, ``bar``, ``scatter``) any additional arguments keywords are passed along to the corresponding matplotlib function (:meth:`ax.plot() `, :meth:`ax.bar() `, @@ -1096,17 +1113,16 @@ shown by default. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list("ABCD")) df = df.cumsum() @savefig frame_plot_basic_noleg.png - df.plot(legend=False) + df.plot(legend=False); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Controlling the labels @@ -1121,19 +1137,19 @@ it empty for ylabel. .. ipython:: python :suppress: - plt.figure() + plt.figure(); .. 
ipython:: python - df.plot() + df.plot(); @savefig plot_xlabel_ylabel.png - df.plot(xlabel="new x", ylabel="new y") + df.plot(xlabel="new x", ylabel="new y"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Scales @@ -1149,17 +1165,16 @@ You may pass ``logy`` to get a log-scale Y axis. .. ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = np.exp(ts.cumsum()) @savefig series_plot_logy.png - ts.plot(logy=True) + ts.plot(logy=True); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See also the ``logx`` and ``loglog`` keyword arguments. @@ -1175,31 +1190,31 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: .. ipython:: python - df['A'].plot() + df["A"].plot(); @savefig series_plot_secondary_y.png - df['B'].plot(secondary_y=True, style='g') + df["B"].plot(secondary_y=True, style="g"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") To plot some columns in a ``DataFrame``, give the column names to the ``secondary_y`` keyword: .. ipython:: python - plt.figure() - ax = df.plot(secondary_y=['A', 'B']) - ax.set_ylabel('CD scale') + plt.figure(); + ax = df.plot(secondary_y=["A", "B"]) + ax.set_ylabel("CD scale"); @savefig frame_plot_secondary_y.png - ax.right_ax.set_ylabel('AB scale') + ax.right_ax.set_ylabel("AB scale"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Note that the columns plotted on the secondary y-axis is automatically marked with "(right)" in the legend. To turn off the automatic marking, use the @@ -1207,15 +1222,15 @@ with "(right)" in the legend. To turn off the automatic marking, use the .. ipython:: python - plt.figure() + plt.figure(); @savefig frame_plot_secondary_y_no_right.png - df.plot(secondary_y=['A', 'B'], mark_right=False) + df.plot(secondary_y=["A", "B"], mark_right=False); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _plotting.formatters: @@ -1224,7 +1239,7 @@ Custom formatters for timeseries plots .. versionchanged:: 1.0.0 -Pandas provides custom formatters for timeseries plots. These change the +pandas provides custom formatters for timeseries plots. These change the formatting of the axis labels for dates and times. By default, the custom formatters are applied only to plots created by pandas with :meth:`DataFrame.plot` or :meth:`Series.plot`. To have them apply to all @@ -1244,47 +1259,47 @@ Here is the default behavior, notice how the x-axis tick labeling is performed: .. ipython:: python - plt.figure() + plt.figure(); @savefig ser_plot_suppress.png - df['A'].plot() + df["A"].plot(); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Using the ``x_compat`` parameter, you can suppress this behavior: .. ipython:: python - plt.figure() + plt.figure(); @savefig ser_plot_suppress_parm.png - df['A'].plot(x_compat=True) + df["A"].plot(x_compat=True); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") If you have more than one plot that needs to be suppressed, the ``use`` method -in ``pandas.plotting.plot_params`` can be used in a `with statement`: +in ``pandas.plotting.plot_params`` can be used in a ``with`` statement: .. 
ipython:: python - plt.figure() + plt.figure(); @savefig ser_plot_suppress_context.png - with pd.plotting.plot_params.use('x_compat', True): - df['A'].plot(color='r') - df['B'].plot(color='g') - df['C'].plot(color='b') + with pd.plotting.plot_params.use("x_compat", True): + df["A"].plot(color="r") + df["B"].plot(color="g") + df["C"].plot(color="b") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Automatic date tick adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1310,7 +1325,7 @@ with the ``subplots`` keyword: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Using layout and targeting multiple axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1333,7 +1348,7 @@ or columns needed, given the other. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The above example is identical to using: @@ -1344,7 +1359,7 @@ The above example is identical to using: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The required number of columns (3) is inferred from the number of series to plot and the given number of rows (2). @@ -1366,13 +1381,12 @@ otherwise you will see a warning. df.plot(subplots=True, ax=target1, legend=False, sharex=False, sharey=False); @savefig frame_plot_subplots_multi_ax.png - (-df).plot(subplots=True, ax=target2, legend=False, - sharex=False, sharey=False); + (-df).plot(subplots=True, ax=target2, legend=False, sharex=False, sharey=False); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a particular axis: @@ -1380,37 +1394,35 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a :suppress: np.random.seed(123456) - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() - df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, - columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list("ABCD")) df = df.cumsum() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python fig, axes = plt.subplots(nrows=2, ncols=2) plt.subplots_adjust(wspace=0.2, hspace=0.5) - df['A'].plot(ax=axes[0, 0]); - axes[0, 0].set_title('A'); - df['B'].plot(ax=axes[0, 1]); - axes[0, 1].set_title('B'); - df['C'].plot(ax=axes[1, 0]); - axes[1, 0].set_title('C'); - df['D'].plot(ax=axes[1, 1]); + df["A"].plot(ax=axes[0, 0]); + axes[0, 0].set_title("A"); + df["B"].plot(ax=axes[0, 1]); + axes[0, 1].set_title("B"); + df["C"].plot(ax=axes[1, 0]); + axes[1, 0].set_title("C"); + df["D"].plot(ax=axes[1, 1]); @savefig series_plot_multi.png - axes[1, 1].set_title('D'); + axes[1, 1].set_title("D"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.errorbars: @@ -1425,24 +1437,32 @@ Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` * As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values. * As raw values (``list``, ``tuple``, or ``np.ndarray``). Must be the same length as the plotting :class:`DataFrame`/:class:`Series`. -Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``M`` length :class:`Series`, a ``Mx2`` array should be provided indicating lower and upper (or left and right) errors. 
For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. +Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``N`` length :class:`Series`, a ``2xN`` array should be provided indicating lower and upper (or left and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. Here is an example of one way to easily plot group means with standard deviations from the raw data. .. ipython:: python # Generate the data - ix3 = pd.MultiIndex.from_arrays([ - ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], - ['foo', 'foo', 'bar', 'bar', 'foo', 'foo', 'bar', 'bar']], - names=['letter', 'word']) - - df3 = pd.DataFrame({'data1': [3, 2, 4, 3, 2, 4, 3, 2], - 'data2': [6, 5, 7, 5, 4, 5, 6, 5]}, index=ix3) + ix3 = pd.MultiIndex.from_arrays( + [ + ["a", "a", "a", "a", "b", "b", "b", "b"], + ["foo", "foo", "bar", "bar", "foo", "foo", "bar", "bar"], + ], + names=["letter", "word"], + ) + + df3 = pd.DataFrame( + { + "data1": [3, 2, 4, 3, 2, 4, 3, 2], + "data2": [6, 5, 7, 5, 4, 5, 6, 5], + }, + index=ix3, + ) # Group by index labels and take the means and standard deviations # for each group - gp3 = df3.groupby(level=('letter', 'word')) + gp3 = df3.groupby(level=("letter", "word")) means = gp3.mean() errors = gp3.std() means @@ -1451,12 +1471,12 @@ Here is an example of one way to easily plot group means with standard deviation # Plot fig, ax = plt.subplots() @savefig errorbar_example.png - means.plot.bar(yerr=errors, ax=ax, capsize=4, rot=0) + means.plot.bar(yerr=errors, ax=ax, capsize=4, rot=0); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.table: @@ -1473,16 +1493,16 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and : .. ipython:: python fig, ax = plt.subplots(1, 1, figsize=(7, 6.5)) - df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c']) + df = pd.DataFrame(np.random.rand(5, 3), columns=["a", "b", "c"]) ax.xaxis.tick_top() # Display x-axis ticks on top. @savefig line_plot_table_true.png - df.plot(table=True, ax=ax) + df.plot(table=True, ax=ax); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Also, you can pass a different :class:`DataFrame` or :class:`Series` to the ``table`` keyword. The data will be drawn as displayed in print method @@ -1495,12 +1515,12 @@ as seen in the example below. ax.xaxis.tick_top() # Display x-axis ticks on top. @savefig line_plot_table_data.png - df.plot(table=np.round(df.T, 2), ax=ax) + df.plot(table=np.round(df.T, 2), ax=ax); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") There also exists a helper function ``pandas.plotting.table``, which creates a table from :class:`DataFrame` or :class:`Series`, and adds it to an @@ -1510,18 +1530,18 @@ matplotlib `table `__ for more. @@ -1555,15 +1575,15 @@ To use the cubehelix colormap, we can pass ``colormap='cubehelix'``. df = pd.DataFrame(np.random.randn(1000, 10), index=ts.index) df = df.cumsum() - plt.figure() + plt.figure(); @savefig cubehelix.png - df.plot(colormap='cubehelix') + df.plot(colormap="cubehelix"); .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Alternatively, we can pass the colormap itself: @@ -1571,15 +1591,15 @@ Alternatively, we can pass the colormap itself: from matplotlib import cm - plt.figure() + plt.figure(); @savefig cubehelix_cm.png - df.plot(colormap=cm.cubehelix) + df.plot(colormap=cm.cubehelix); .. 
ipython:: python
   :suppress:

-    plt.close('all')
+    plt.close("all")

Colormaps can also be used in other plot types, like bar charts:

.. ipython:: python
   :suppress:

   np.random.seed(123456)

.. ipython:: python
   :suppress:

   dd = pd.DataFrame(np.random.randn(10, 10)).applymap(abs)
   dd = dd.cumsum()

-    plt.figure()
+    plt.figure();

   @savefig greens.png
-    dd.plot.bar(colormap='Greens')
+    dd.plot.bar(colormap="Greens");

.. ipython:: python
   :suppress:

-    plt.close('all')
+    plt.close("all")

Parallel coordinates charts:

.. ipython:: python

-    plt.figure()
+    plt.figure();

   @savefig parallel_gist_rainbow.png
-    parallel_coordinates(data, 'Name', colormap='gist_rainbow')
+    parallel_coordinates(data, "Name", colormap="gist_rainbow");

.. ipython:: python
   :suppress:

-    plt.close('all')
+    plt.close("all")

Andrews curves charts:

.. ipython:: python

-    plt.figure()
+    plt.figure();

   @savefig andrews_curve_winter.png
-    andrews_curves(data, 'Name', colormap='winter')
+    andrews_curves(data, "Name", colormap="winter");

.. ipython:: python
   :suppress:

-    plt.close('all')
+    plt.close("all")

Plotting directly with matplotlib
---------------------------------

@@ -1653,23 +1673,24 @@ when plotting a large number of points.

.. ipython:: python

-    price = pd.Series(np.random.randn(150).cumsum(),
-                      index=pd.date_range('2000-1-1', periods=150, freq='B'))
+    price = pd.Series(
+        np.random.randn(150).cumsum(),
+        index=pd.date_range("2000-1-1", periods=150, freq="B"),
+    )
   ma = price.rolling(20).mean()
   mstd = price.rolling(20).std()

-    plt.figure()
+    plt.figure();

-    plt.plot(price.index, price, 'k')
-    plt.plot(ma.index, ma, 'b')
+    plt.plot(price.index, price, "k");
+    plt.plot(ma.index, ma, "b");

   @savefig bollinger.png
-    plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd,
-                     color='b', alpha=0.2)
+    plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, color="b", alpha=0.2);

.. ipython:: python
   :suppress:

-    plt.close('all')
+    plt.close("all")

Plotting backends
-----------------

@@ -1683,21 +1704,21 @@ function. For example:

.. code-block:: python

-    >>> Series([1, 2, 3]).plot(backend='backend.module')
+    >>> Series([1, 2, 3]).plot(backend="backend.module")

Alternatively, you can also set this option globally, so you don't need to specify
the keyword in each ``plot`` call. For example:

.. code-block:: python

-    >>> pd.set_option('plotting.backend', 'backend.module')
+    >>> pd.set_option("plotting.backend", "backend.module")
    >>> pd.Series([1, 2, 3]).plot()

Or:

.. code-block:: python

-    >>> pd.options.plotting.backend = 'backend.module'
+    >>> pd.options.plotting.backend = "backend.module"
    >>> pd.Series([1, 2, 3]).plot()

This would be more or less equivalent to:
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
new file mode 100644
index 0000000000000..05f8be091fa25
--- /dev/null
+++ b/doc/source/user_guide/window.rst
@@ -0,0 +1,593 @@
+.. _window:
+
+{{ header }}
+
+********************
+Windowing Operations
+********************
+
+pandas contains a compact set of APIs for performing windowing operations: operations that perform
+an aggregation over a sliding partition of values. The API functions similarly to the ``groupby`` API
+in that :class:`Series` and :class:`DataFrame` call the windowing method with
+necessary parameters and then subsequently call the aggregation function.
+
+.. ipython:: python
+
+    s = pd.Series(range(5))
+    s.rolling(window=2).sum()
+
+Each window is determined by looking back the length of the window from the current observation.
+The result above can be derived by taking the sum of the following windowed partitions of data:
+
+.. ipython:: python
+
+    for window in s.rolling(window=2):
+        print(window)
+
+
+.. _window.overview:
+
+Overview
+--------
+
+pandas supports 4 types of windowing operations:
+
+#. Rolling window: Generic fixed or variable sliding window over the values.
+#. Weighted window: Weighted, non-rectangular window supplied by the ``scipy.signal`` library.
+#. Expanding window: Accumulating window over the values.
+#. Exponentially Weighted window: Accumulating and exponentially weighted window over the values.
+
+============================= ================= =========================== =========================== ========================
+Concept                       Method            Returned Object             Supports time-based windows Supports chained groupby
+============================= ================= =========================== =========================== ========================
+Rolling window                ``rolling``       ``Rolling``                 Yes                         Yes
+Weighted window               ``rolling``       ``Window``                  No                          No
+Expanding window              ``expanding``     ``Expanding``               No                          Yes
+Exponentially Weighted window ``ewm``           ``ExponentialMovingWindow`` No                          Yes (as of version 1.2)
+============================= ================= =========================== =========================== ========================
+
+As noted above, some operations support specifying a window based on a time offset:
+
+.. ipython:: python
+
+    s = pd.Series(range(5), index=pd.date_range('2020-01-01', periods=5, freq='1D'))
+    s.rolling(window='2D').sum()
+
+Additionally, some methods support chaining a ``groupby`` operation with a windowing operation
+which will first group the data by the specified keys and then perform a windowing operation per group.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': ['a', 'b', 'a', 'b', 'a'], 'B': range(5)})
+    df.groupby('A').expanding().sum()
+
+.. note::
+
+   Windowing operations currently only support numeric data (integer and float)
+   and will always return ``float64`` values.
+
+.. warning::
+
+   Some windowing aggregation methods (``mean``, ``sum``, ``var`` and ``std``) may suffer from numerical
+   imprecision due to the underlying windowing algorithms accumulating sums. When values differ
+   in magnitude by more than :math:`1/np.finfo(np.double).eps`, this results in truncation. Note
+   that large values may have an impact even on windows that do not include these values. `Kahan summation
+   `__ is used
+   to compute the rolling sums to preserve accuracy as much as possible.
+
+
+All windowing operations support a ``min_periods`` argument that dictates the minimum number of
+non-``np.nan`` values a window must have; otherwise, the resulting value is ``np.nan``.
+``min_periods`` defaults to 1 for time-based windows and to the window size for fixed windows.
+
+.. ipython:: python
+
+    s = pd.Series([np.nan, 1, 2, np.nan, np.nan, 3])
+    s.rolling(window=3, min_periods=1).sum()
+    s.rolling(window=3, min_periods=2).sum()
+    # Equivalent to min_periods=3
+    s.rolling(window=3, min_periods=None).sum()
+
+
+Additionally, all windowing operations support the ``aggregate`` method for returning a result
+of multiple aggregations applied to a window.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": range(5), "B": range(10, 15)})
+    df.expanding().agg([np.sum, np.mean, np.std])
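+The ``aggregate`` method also accepts a dictionary that maps column names to functions or
+lists of functions, which is useful when each column calls for a different statistic. A
+minimal sketch of this pattern, reusing the ``df`` defined just above (the particular
+functions chosen here are arbitrary):
+
+.. code-block:: python
+
+    # per-column aggregations over a rolling window of 2 observations
+    df.rolling(window=2).agg({"A": "sum", "B": ["mean", "max"]})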
+.. _window.generic:
+
+Rolling window
+--------------
+
+Generic rolling windows support specifying windows as a fixed number of observations or a variable
+number of observations based on an offset. If a time based offset is provided, the corresponding
+time based index must be monotonic.
+
+.. ipython:: python
+
+    times = ['2020-01-01', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-29']
+    s = pd.Series(range(5), index=pd.DatetimeIndex(times))
+    s
+    # Window with 2 observations
+    s.rolling(window=2).sum()
+    # Window with 2 days worth of observations
+    s.rolling(window='2D').sum()
+
+For all supported aggregation functions, see :ref:`api.functions_rolling`.
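+A non-monotonic index makes a time based window ill-defined, so ``rolling`` refuses it.
+A quick sketch of the failure mode (a hypothetical reordering of the series above; the
+exact error message may vary between versions):
+
+.. code-block:: python
+
+    # reorder the index so it is no longer monotonic
+    shuffled = s.iloc[[0, 2, 1, 3, 4]]
+    # expected to raise a ValueError about the non-monotonic index
+    shuffled.rolling(window='2D').sum()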
+.. _window.center:
+
+Centering windows
+~~~~~~~~~~~~~~~~~
+
+By default the labels are set to the right edge of the window, but a
+``center`` keyword is available so the labels can be set at the center.
+
+.. ipython:: python
+
+    s = pd.Series(range(10))
+    s.rolling(window=5).mean()
+    s.rolling(window=5, center=True).mean()
+
+
+.. _window.endpoints:
+
+Rolling window endpoints
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed``
+parameter:
+
+============= ====================
+Value         Behavior
+============= ====================
+``'right'``   close right endpoint
+``'left'``    close left endpoint
+``'both'``    close both endpoints
+``'neither'`` open endpoints
+============= ====================
+
+For example, having the right endpoint open is useful in many problems that require that there is no contamination
+from present information back to past information. This allows the rolling window to compute statistics
+"up to that point in time", but not including that point in time.
+
+.. ipython:: python
+
+    df = pd.DataFrame(
+        {"x": 1},
+        index=[
+            pd.Timestamp("20130101 09:00:01"),
+            pd.Timestamp("20130101 09:00:02"),
+            pd.Timestamp("20130101 09:00:03"),
+            pd.Timestamp("20130101 09:00:04"),
+            pd.Timestamp("20130101 09:00:06"),
+        ],
+    )
+
+    df["right"] = df.rolling("2s", closed="right").x.sum()  # default
+    df["both"] = df.rolling("2s", closed="both").x.sum()
+    df["left"] = df.rolling("2s", closed="left").x.sum()
+    df["neither"] = df.rolling("2s", closed="neither").x.sum()
+
+    df
+
+
+.. _window.custom_rolling_window:
+
+Custom window rolling
+~~~~~~~~~~~~~~~~~~~~~
+
+.. versionadded:: 1.0
+
+In addition to accepting an integer or offset as a ``window`` argument, ``rolling`` also accepts
+a ``BaseIndexer`` subclass that allows a user to define a custom method for calculating window bounds.
+The ``BaseIndexer`` subclass will need to define a ``get_window_bounds`` method that returns
+a tuple of two arrays, the first being the starting indices of the windows and the second being the
+ending indices of the windows. Additionally, ``num_values``, ``min_periods``, ``center`` and ``closed``
+will automatically be passed to ``get_window_bounds``, and the defined method must
+always accept these arguments.
+
+For example, if we have the following :class:`DataFrame`:
+
+.. ipython:: python
+
+    use_expanding = [True, False, True, False, True]
+    use_expanding
+    df = pd.DataFrame({"values": range(5)})
+    df
+
+and we want to use an expanding window where ``use_expanding`` is ``True``, and otherwise a window of
+size 1, we can create the following ``BaseIndexer`` subclass:
+
+.. code-block:: ipython
+
+   In [2]: from pandas.api.indexers import BaseIndexer
+      ...:
+      ...: class CustomIndexer(BaseIndexer):
+      ...:
+      ...:     def get_window_bounds(self, num_values, min_periods, center, closed):
+      ...:         start = np.empty(num_values, dtype=np.int64)
+      ...:         end = np.empty(num_values, dtype=np.int64)
+      ...:         for i in range(num_values):
+      ...:             if self.use_expanding[i]:
+      ...:                 start[i] = 0
+      ...:                 end[i] = i + 1
+      ...:             else:
+      ...:                 start[i] = i
+      ...:                 end[i] = i + self.window_size
+      ...:         return start, end
+      ...:
+
+   In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+   In [4]: df.rolling(indexer).sum()
+   Out[4]:
+      values
+   0     0.0
+   1     1.0
+   2     3.0
+   3     3.0
+   4    10.0
+
+You can view other examples of ``BaseIndexer`` subclasses `here `__.
+
+.. versionadded:: 1.1
+
+One subclass of note within those examples is the ``VariableOffsetWindowIndexer`` that allows
+rolling operations over a non-fixed offset like a ``BusinessDay``.
+
+.. ipython:: python
+
+    from pandas.api.indexers import VariableOffsetWindowIndexer
+
+    df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10))
+    offset = pd.offsets.BDay(1)
+    indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
+    df
+    df.rolling(indexer).sum()
+
+For some problems knowledge of the future is available for analysis. For example, this occurs when
+each data point is a full time series read from an experiment, and the task is to extract underlying
+conditions. In these cases it can be useful to perform forward-looking rolling window computations.
+The :func:`FixedForwardWindowIndexer ` class is available for this purpose.
+This :func:`BaseIndexer ` subclass implements a closed fixed-width
+forward-looking rolling window, and we can use it as follows:
+
+.. ipython:: python
+
+    from pandas.api.indexers import FixedForwardWindowIndexer
+    indexer = FixedForwardWindowIndexer(window_size=2)
+    df.rolling(indexer, min_periods=1).sum()
+
+
+.. _window.rolling_apply:
+
+Rolling apply
+~~~~~~~~~~~~~
+
+The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs
+generic rolling computations. The ``func`` argument should be a single function
+that produces a single value from an ndarray input. ``raw`` specifies whether
+the windows are cast as :class:`Series` objects (``raw=False``) or ndarray objects (``raw=True``).
+
+.. ipython:: python
+
+    def mad(x):
+        return np.fabs(x - x.mean()).mean()
+
+    s = pd.Series(range(10))
+    s.rolling(window=4).apply(mad, raw=True)
+
+
+.. _window.numba_engine:
+
+Numba engine
+~~~~~~~~~~~~
+
+.. versionadded:: 1.0
+
+Additionally, :meth:`~Rolling.apply` can leverage `Numba `__
+if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying
+``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``).
+Numba will be applied in potentially two routines:
+
+#. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again.
+#. The engine will JIT the for loop where the apply function is applied to each window.
+
+The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the
+`numba.jit decorator `__.
+These keyword arguments will be applied to *both* the passed function (if a standard Python function)
+and the apply for loop over each window.
+Currently only ``nogil``, ``nopython``, and ``parallel`` are supported,
+and their default values are set to ``False``, ``True`` and ``False`` respectively.
+
+.. note::
+
+   In terms of performance, **the first time a function is run using the Numba engine will be slow**
+   as Numba will have some function compilation overhead. However, the compiled functions are cached,
+   and subsequent calls will be fast. In general, the Numba engine is performant with
+   a larger amount of data points (e.g. 1+ million).
+
+.. code-block:: ipython
+
+   In [1]: data = pd.Series(range(1_000_000))
+
+   In [2]: roll = data.rolling(10)
+
+   In [3]: def f(x):
+      ...:     return np.sum(x) + 5
+   # Run the first time, compilation time will affect performance
+   In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True)  # noqa: E225, E999
+   1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
+   # Function is cached and performance will improve
+   In [5]: %timeit roll.apply(f, engine='numba', raw=True)
+   188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
+
+   In [6]: %timeit roll.apply(f, engine='cython', raw=True)
+   3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
+
+.. _window.cov_corr:
+
+Binary window functions
+~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~Rolling.cov` and :meth:`~Rolling.corr` can compute moving window statistics between
+two :class:`Series` or any combination of :class:`DataFrame`/:class:`Series` or
+:class:`DataFrame`/:class:`DataFrame`. Here is the behavior in each case:
+
+* two :class:`Series`: compute the statistic for the pairing.
+* :class:`DataFrame`/:class:`Series`: compute the statistics for each column of the DataFrame
+  with the passed Series, thus returning a DataFrame.
+* :class:`DataFrame`/:class:`DataFrame`: by default compute the statistic for matching column
+  names, returning a DataFrame. If the keyword argument ``pairwise=True`` is
+  passed then the statistic is computed for each pair of columns, returning a
+  MultiIndexed :class:`DataFrame` whose ``index`` are the dates in question (see :ref:`the next section
+  `).
+
+For example:
+
+.. ipython:: python
+
+    df = pd.DataFrame(
+        np.random.randn(10, 4),
+        index=pd.date_range("2020-01-01", periods=10),
+        columns=["A", "B", "C", "D"],
+    )
+    df = df.cumsum()
+
+    df2 = df[:4]
+    df2.rolling(window=2).corr(df2["B"])
+
+.. _window.corr_pairwise:
+
+Computing rolling pairwise covariances and correlations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In financial data analysis and other fields it's common to compute covariance
+and correlation matrices for a collection of time series. Often one is also
+interested in moving-window covariance and correlation matrices. This can be
+done by passing the ``pairwise`` keyword argument, which in the case of
+:class:`DataFrame` inputs will yield a MultiIndexed :class:`DataFrame` whose ``index`` are the dates in
+question. In the case of a single DataFrame argument the ``pairwise`` argument
+can even be omitted:
+
+.. note::
+
+   Missing values are ignored and each entry is computed using the pairwise
+   complete observations. Please see the :ref:`covariance section
+   ` for :ref:`caveats
+   ` associated with this method of
+   calculating covariance and correlation matrices.
+
+.. ipython:: python
+
+    covs = (
+        df[["B", "C", "D"]]
+        .rolling(window=4)
+        .cov(df[["A", "B", "C"]], pairwise=True)
+    )
+    covs
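+Because the result is indexed by date on the first level, the matrix for any single
+date can be pulled out with ``.loc``. A quick sketch (the date is one of those generated
+above; rows for dates before the window is full hold ``NaN``):
+
+.. code-block:: python
+
+    # covariance matrix for the last date in the range
+    covs.loc["2020-01-10"]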
.. _window.weighted:
+
+Weighted window
+---------------
+
+The ``win_type`` argument in ``.rolling`` generates a weighted window that is commonly used in filtering
+and spectral estimation. ``win_type`` must be a string that corresponds to a `scipy.signal window function
+`__.
+Scipy must be installed in order to use these windows, and supplementary arguments
+that the Scipy window methods take must be specified in the aggregation function.
+
+
+.. ipython:: python
+
+    s = pd.Series(range(10))
+    s.rolling(window=5).mean()
+    s.rolling(window=5, win_type="triang").mean()
+    # Supplementary Scipy arguments passed in the aggregation function
+    s.rolling(window=5, win_type="gaussian").mean(std=0.1)
+
+For all supported aggregation functions, see :ref:`api.functions_window`.
+
+.. _window.expanding:
+
+Expanding window
+----------------
+
+An expanding window yields the value of an aggregation statistic with all the data available up to that
+point in time. Since these calculations are a special case of rolling statistics,
+they are implemented in pandas such that the following two calls are equivalent:
+
+.. ipython:: python
+
+    df = pd.DataFrame(range(5))
+    df.rolling(window=len(df), min_periods=1).mean()
+    df.expanding(min_periods=1).mean()
+
+For all supported aggregation functions, see :ref:`api.functions_expanding`.
+
+
+.. _window.exponentially_weighted:
+
+Exponentially Weighted window
+-----------------------------
+
+An exponentially weighted window is similar to an expanding window but with each prior point
+being exponentially weighted down relative to the current point.
+
+In general, a weighted moving average is calculated as
+
+.. math::
+
+    y_t = \frac{\sum_{i=0}^t w_i x_{t-i}}{\sum_{i=0}^t w_i},
+
+where :math:`x_t` is the input, :math:`y_t` is the result and the :math:`w_i`
+are the weights.
+
+For all supported aggregation functions, see :ref:`api.functions_ewm`.
+
+The EW functions support two variants of exponential weights.
+The default, ``adjust=True``, uses the weights :math:`w_i = (1 - \alpha)^i`
+which gives
+
+.. math::
+
+    y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...
+    + (1 - \alpha)^t x_{0}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
+    + (1 - \alpha)^t}
+
+When ``adjust=False`` is specified, moving averages are calculated as
+
+.. math::
+
+    y_0 &= x_0 \\
+    y_t &= (1 - \alpha) y_{t-1} + \alpha x_t,
+
+which is equivalent to using weights
+
+.. math::
+
+    w_i = \begin{cases}
+        \alpha (1 - \alpha)^i & \text{if } i < t \\
+        (1 - \alpha)^i        & \text{if } i = t.
+    \end{cases}
+
+.. note::
+
+   These equations are sometimes written in terms of :math:`\alpha' = 1 - \alpha`, e.g.
+
+   .. math::
+
+      y_t = \alpha' y_{t-1} + (1 - \alpha') x_t.
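+The two variants are easy to compare numerically. A small sketch checking
+``adjust=False`` against its recursive definition above (``alpha=0.5`` is an
+arbitrary choice):
+
+.. code-block:: python
+
+    s = pd.Series([1.0, 2.0, 3.0, 4.0])
+    alpha = 0.5
+
+    # recursion: y_0 = x_0, then y_t = (1 - alpha) * y_{t-1} + alpha * x_t
+    y = [s.iloc[0]]
+    for x in s.iloc[1:]:
+        y.append((1 - alpha) * y[-1] + alpha * x)
+
+    # should reproduce the values accumulated in y
+    s.ewm(alpha=alpha, adjust=False).mean()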
+The difference between the above two variants arises because we are
+dealing with series which have finite history. Consider a series of infinite
+history, with ``adjust=True``:
+
+.. math::
+
+    y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...}
+    {1 + (1 - \alpha) + (1 - \alpha)^2 + ...}
+
+Noting that the denominator is a geometric series with initial term equal to 1
+and a ratio of :math:`1 - \alpha` we have
+
+.. math::
+
+    y_t &= \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...}
+    {\frac{1}{1 - (1 - \alpha)}}\\
+    &= [x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...] \alpha \\
+    &= \alpha x_t + [(1-\alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...]\alpha \\
+    &= \alpha x_t + (1 - \alpha)[x_{t-1} + (1 - \alpha) x_{t-2} + ...]\alpha\\
+    &= \alpha x_t + (1 - \alpha) y_{t-1}
+
+which is the same expression as ``adjust=False`` above and therefore
+shows the equivalence of the two variants for infinite series.
+When ``adjust=False``, we have :math:`y_0 = x_0` and
+:math:`y_t = \alpha x_t + (1 - \alpha) y_{t-1}`.
+Therefore, there is an assumption that :math:`x_0` is not an ordinary value
+but rather an exponentially weighted moment of the infinite series up to that
+point.
+
+One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass
+:math:`\alpha` directly, it's often easier to think about either the
+**span**, **center of mass (com)** or **half-life** of an EW moment:
+
+.. math::
+
+    \alpha =
+     \begin{cases}
+         \frac{2}{s + 1},             & \text{for span}\ s \geq 1\\
+         \frac{1}{1 + c},             & \text{for center of mass}\ c \geq 0\\
+         1 - e^{\frac{\log 0.5}{h}},  & \text{for half-life}\ h > 0
+     \end{cases}
+
+One must specify precisely one of **span**, **center of mass**, **half-life**
+and **alpha** to the EW functions:
+
+* **Span** corresponds to what is commonly called an "N-day EW moving average".
+* **Center of mass** has a more physical interpretation and can be thought of
+  in terms of span: :math:`c = (s - 1) / 2`.
+* **Half-life** is the period of time for the exponential weight to reduce to
+  one half.
+* **Alpha** specifies the smoothing factor directly.
+
+.. versionadded:: 1.1.0
+
+You can also specify ``halflife`` in terms of a timedelta convertible unit to indicate the amount of
+time it takes for an observation to decay to half its value when also specifying a sequence
+of ``times``.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
+    df
+    times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"]
+    df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean()
+
+The following formula is used to compute the exponentially weighted mean with an input vector of times:
+
+.. math::
+
+    y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{t-i}}{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda}},
+
+where :math:`\lambda` is the ``halflife``.
+
+ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how
+intermediate null values affect the calculation of the weights.
+When ``ignore_na=False`` (the default), weights are calculated based on absolute
+positions, so that intermediate null values affect the result.
+When ``ignore_na=True``,
+weights are calculated by ignoring intermediate null values.
+For example, assuming ``adjust=True``, if ``ignore_na=False``, the weighted
+average of ``3, NaN, 5`` would be calculated as
+
+.. math::
+
+    \frac{(1-\alpha)^2 \cdot 3 + 1 \cdot 5}{(1-\alpha)^2 + 1}.
+
+Whereas if ``ignore_na=True``, the weighted average would be calculated as
+
+.. math::
+
+    \frac{(1-\alpha) \cdot 3 + 1 \cdot 5}{(1-\alpha) + 1}.
+
+The :meth:`~Ewm.var`, :meth:`~Ewm.std`, and :meth:`~Ewm.cov` functions have a ``bias`` argument,
+specifying whether the result should contain biased or unbiased statistics.
+For example, if ``bias=True``, ``ewmvar(x)`` is calculated as
+``ewmvar(x) = ewma(x**2) - ewma(x)**2``;
+whereas if ``bias=False`` (the default), the biased variance statistics
+are scaled by debiasing factors
+
+.. math::
+
+    \frac{\left(\sum_{i=0}^t w_i\right)^2}{\left(\sum_{i=0}^t w_i\right)^2 - \sum_{i=0}^t w_i^2}.
+
+(For :math:`w_i = 1`, this reduces to the usual :math:`N / (N - 1)` factor,
+with :math:`N = t + 1`.)
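+The ``bias=True`` identity above can be verified directly. A small sketch (the
+series and ``alpha`` are arbitrary; the two results should agree up to floating
+point, and ``equal_nan`` guards any leading missing entries):
+
+.. code-block:: python
+
+    x = pd.Series([1.0, 2.0, 4.0, 8.0])
+
+    biased = x.ewm(alpha=0.5).var(bias=True)
+    # ewma(x**2) - ewma(x)**2 with the same parameters
+    by_hand = (x ** 2).ewm(alpha=0.5).mean() - x.ewm(alpha=0.5).mean() ** 2
+
+    np.allclose(biased, by_hand, equal_nan=True)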
+See `Weighted Sample Variance `__ +on Wikipedia for further details. diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ad5bb5a5b2d72..aa8a7a389adee 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,12 +10,33 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 1.3 +----------- + +.. toctree:: + :maxdepth: 2 + + v1.3.0 + +Version 1.2 +----------- + +.. toctree:: + :maxdepth: 2 + + v1.2.0 + Version 1.1 ----------- .. toctree:: :maxdepth: 2 + v1.1.5 + v1.1.4 + v1.1.3 + v1.1.2 + v1.1.1 v1.1.0 Version 1.0 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 443250592a4a7..aa2749c85a232 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -49,8 +49,8 @@ talking about: :okwarning: import pandas as pd - df = pd.DataFrame(np.random.randn(6, 4), - index=pd.date_range('1/1/2000', periods=6)) + + df = pd.DataFrame(np.random.randn(6, 4), index=pd.date_range("1/1/2000", periods=6)) df # deprecated now df - df[0] @@ -184,12 +184,14 @@ labeled the aggregated group with the end of the interval: the next day). import io - data = ('a,b,c\n' - '1,Yes,2\n' - '3,No,4') + data = """ + a,b,c + 1,Yes,2 + 3,No,4 + """ print(data) pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix='X') + pd.read_csv(io.StringIO(data), header=None, prefix="X") - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` @@ -199,7 +201,7 @@ labeled the aggregated group with the end of the interval: the next day). print(data) pd.read_csv(io.StringIO(data)) - pd.read_csv(io.StringIO(data), true_values=['Yes'], false_values=['No']) + pd.read_csv(io.StringIO(data), true_values=["Yes"], false_values=["No"]) - The file parsers will not recognize non-string values arising from a converter function as NA if passed in the ``na_values`` argument. It's better @@ -210,10 +212,10 @@ labeled the aggregated group with the end of the interval: the next day). .. ipython:: python - s = pd.Series([np.nan, 1., 2., np.nan, 4]) + s = pd.Series([np.nan, 1.0, 2.0, np.nan, 4]) s s.fillna(0) - s.fillna(method='pad') + s.fillna(method="pad") Convenience methods ``ffill`` and ``bfill`` have been added: @@ -229,7 +231,8 @@ Convenience methods ``ffill`` and ``bfill`` have been added: .. ipython:: python def f(x): - return pd.Series([x, x**2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -272,20 +275,20 @@ The old behavior of printing out summary information can be achieved via the .. ipython:: python - pd.set_option('expand_frame_repr', False) + pd.set_option("expand_frame_repr", False) wide_frame .. ipython:: python :suppress: - pd.reset_option('expand_frame_repr') + pd.reset_option("expand_frame_repr") The width of each line can be changed via 'line_width' (80 by default): .. code-block:: python - pd.set_option('line_width', 40) + pd.set_option("line_width", 40) wide_frame diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index 1e9eafd2700e9..611ac2021fcec 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -45,29 +45,31 @@ You may need to upgrade your existing data files. 
Please visit the import os - os.remove('store.h5') + os.remove("store.h5") You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``data_columns`` .. ipython:: python - store = pd.HDFStore('store.h5') - df = pd.DataFrame(np.random.randn(8, 3), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C']) - df['string'] = 'foo' - df.loc[df.index[4:6], 'string'] = np.nan - df.loc[df.index[7:9], 'string'] = 'bar' - df['string2'] = 'cool' + store = pd.HDFStore("store.h5") + df = pd.DataFrame( + np.random.randn(8, 3), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C"], + ) + df["string"] = "foo" + df.loc[df.index[4:6], "string"] = np.nan + df.loc[df.index[7:9], "string"] = "bar" + df["string2"] = "cool" df # on-disk operations - store.append('df', df, data_columns=['B', 'C', 'string', 'string2']) - store.select('df', "B>0 and string=='foo'") + store.append("df", df, data_columns=["B", "C", "string", "string2"]) + store.select("df", "B>0 and string=='foo'") # this is in-memory version of this type of selection - df[(df.B > 0) & (df.string == 'foo')] + df[(df.B > 0) & (df.string == "foo")] Retrieving unique values in an indexable or data column. @@ -75,19 +77,19 @@ Retrieving unique values in an indexable or data column. # note that this is deprecated as of 0.14.0 # can be replicated by: store.select_column('df','index').unique() - store.unique('df', 'index') - store.unique('df', 'string') + store.unique("df", "index") + store.unique("df", "string") You can now store ``datetime64`` in data columns .. ipython:: python df_mixed = df.copy() - df_mixed['datetime64'] = pd.Timestamp('20010102') - df_mixed.loc[df_mixed.index[3:4], ['A', 'B']] = np.nan + df_mixed["datetime64"] = pd.Timestamp("20010102") + df_mixed.loc[df_mixed.index[3:4], ["A", "B"]] = np.nan - store.append('df_mixed', df_mixed) - df_mixed1 = store.select('df_mixed') + store.append("df_mixed", df_mixed) + df_mixed1 = store.select("df_mixed") df_mixed1 df_mixed1.dtypes.value_counts() @@ -97,7 +99,7 @@ columns, this is equivalent to passing a .. ipython:: python - store.select('df', columns=['A', 'B']) + store.select("df", columns=["A", "B"]) ``HDFStore`` now serializes MultiIndex dataframes when appending tables. @@ -160,37 +162,41 @@ combined result, by using ``where`` on a selector table. .. ipython:: python - df_mt = pd.DataFrame(np.random.randn(8, 6), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) - df_mt['foo'] = 'bar' + df_mt = pd.DataFrame( + np.random.randn(8, 6), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C", "D", "E", "F"], + ) + df_mt["foo"] = "bar" # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, - df_mt, selector='df1_mt') + store.append_to_multiple( + {"df1_mt": ["A", "B"], "df2_mt": None}, df_mt, selector="df1_mt" + ) store # individual tables were created - store.select('df1_mt') - store.select('df2_mt') + store.select("df1_mt") + store.select("df2_mt") # as a multiple - store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector='df1_mt') + store.select_as_multiple( + ["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt" + ) .. 
ipython:: python :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") **Enhancements** - ``HDFStore`` now can read native PyTables table format tables - You can pass ``nan_rep = 'my_nan_rep'`` to append, to change the default nan - representation on disk (which converts to/from `np.nan`), this defaults to - `nan`. + representation on disk (which converts to/from ``np.nan``), this defaults to + ``nan``. - You can pass ``index`` to ``append``. This defaults to ``True``. This will automagically create indices on the *indexables* and *data columns* of the @@ -224,7 +230,7 @@ combined result, by using ``where`` on a selector table. - Function to reset Google Analytics token store so users can recover from improperly setup client secrets (:issue:`2687`). - Fixed groupby bug resulting in segfault when passing in MultiIndex (:issue:`2706`) -- Fixed bug where passing a Series with datetime64 values into `to_datetime` +- Fixed bug where passing a Series with datetime64 values into ``to_datetime`` results in bogus output values (:issue:`2699`) - Fixed bug in ``pattern in HDFStore`` expressions when pattern is not a valid regex (:issue:`2694`) @@ -240,7 +246,7 @@ combined result, by using ``where`` on a selector table. - Fixed C file parser behavior when the file has more columns than data (:issue:`2668`) - Fixed file reader bug that misaligned columns with data in the presence of an - implicit column and a specified `usecols` value + implicit column and a specified ``usecols`` value - DataFrames with numerical or datetime indices are now sorted prior to plotting (:issue:`2609`) - Fixed DataFrame.from_records error when passed columns, index, but empty diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 6c13a125a4e54..a69d1ad1dec3b 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -24,7 +24,7 @@ Selection choices ~~~~~~~~~~~~~~~~~ Starting in 0.11.0, object selection has had a number of user-requested additions in -order to support more explicit location based indexing. Pandas now supports +order to support more explicit location based indexing. pandas now supports three types of multi-axis indexing. - ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are: @@ -367,6 +367,7 @@ Enhancements - You can now select with a string from a DataFrame with a datelike index, in a similar way to a Series (:issue:`3070`) .. ipython:: python + :okwarning: idx = pd.date_range("2001-10-1", periods=5, freq='M') ts = pd.Series(np.random.rand(len(idx)), index=idx) @@ -424,13 +425,13 @@ Enhancements - Cursor coordinate information is now displayed in time-series plots. - - added option `display.max_seq_items` to control the number of + - added option ``display.max_seq_items`` to control the number of elements printed per sequence pprinting it. (:issue:`2979`) - - added option `display.chop_threshold` to control display of small numerical + - added option ``display.chop_threshold`` to control display of small numerical values. (:issue:`2739`) - - added option `display.max_info_rows` to prevent verbose_info from being + - added option ``display.max_info_rows`` to prevent verbose_info from being calculated for frames above 1M rows (configurable). (:issue:`2807`, :issue:`2918`) - value_counts() now accepts a "normalize" argument, for normalized @@ -439,7 +440,7 @@ Enhancements - DataFrame.from_records now accepts not only dicts but any instance of the collections.Mapping ABC. 
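The ``from_records`` bullet above is easiest to see with a concrete mapping. A minimal sketch, assuming a column-keyed ``OrderedDict`` (any dict-like mapping of column name to values, per the change):

.. code-block:: python

    from collections import OrderedDict

    import pandas as pd

    # Any Mapping, not just a plain dict, can feed from_records;
    # keys become column names, values become the column data.
    data = OrderedDict([("a", [1, 2, 3]), ("b", ["x", "y", "z"])])
    df = pd.DataFrame.from_records(data)
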
- - added option `display.mpl_style` providing a sleeker visual style + - added option ``display.mpl_style`` providing a sleeker visual style for plots. Based on https://gist.github.com/huyng/816622 (:issue:`3075`). - Treat boolean values as integers (values 1 and 0) for numeric diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 9971ae22822f6..c12adb2f1334f 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -47,7 +47,7 @@ API changes .. ipython:: python - p = pd.DataFrame({'first': [4, 5, 8], 'second': [0, 0, 3]}) + p = pd.DataFrame({"first": [4, 5, 8], "second": [0, 0, 3]}) p % 0 p % p p / p @@ -95,8 +95,8 @@ API changes .. ipython:: python - df = pd.DataFrame(range(5), index=list('ABCDE'), columns=['a']) - mask = (df.a % 2 == 0) + df = pd.DataFrame(range(5), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 mask # this is what you should use @@ -141,21 +141,24 @@ API changes .. code-block:: python from pandas.io.parsers import ExcelFile - xls = ExcelFile('path_to_file.xls') - xls.parse('Sheet1', index_col=None, na_values=['NA']) + + xls = ExcelFile("path_to_file.xls") + xls.parse("Sheet1", index_col=None, na_values=["NA"]) With .. code-block:: python import pandas as pd - pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + + pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) - added top-level function ``read_sql`` that is equivalent to the following .. code-block:: python from pandas.io.sql import read_frame + read_frame(...) - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for @@ -200,7 +203,7 @@ IO enhancements .. ipython:: python :okwarning: - df = pd.DataFrame({'a': range(3), 'b': list('abc')}) + df = pd.DataFrame({"a": range(3), "b": list("abc")}) print(df) html = df.to_html() alist = pd.read_html(html, index_col=0) @@ -248,16 +251,18 @@ IO enhancements .. ipython:: python from pandas._testing import makeCustomDataframe as mkdf + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - df.to_csv('mi.csv') - print(open('mi.csv').read()) - pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv("mi.csv") + print(open("mi.csv").read()) + pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) .. ipython:: python :suppress: import os - os.remove('mi.csv') + + os.remove("mi.csv") - Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3 @@ -304,8 +309,8 @@ Other enhancements .. ipython:: python - df = pd.DataFrame({'a': list('ab..'), 'b': [1, 2, 3, 4]}) - df.replace(regex=r'\s*\.\s*', value=np.nan) + df = pd.DataFrame({"a": list("ab.."), "b": [1, 2, 3, 4]}) + df.replace(regex=r"\s*\.\s*", value=np.nan) to replace all occurrences of the string ``'.'`` with zero or more instances of surrounding white space with ``NaN``. @@ -314,7 +319,7 @@ Other enhancements .. ipython:: python - df.replace('.', np.nan) + df.replace(".", np.nan) to replace all occurrences of the string ``'.'`` with ``NaN``. @@ -359,8 +364,8 @@ Other enhancements .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are @@ -368,7 +373,7 @@ Other enhancements .. 
ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) - Series and DataFrame hist methods now take a ``figsize`` argument (:issue:`3834`) @@ -397,23 +402,24 @@ Experimental features from pandas.tseries.offsets import CustomBusinessDay from datetime import datetime + # As an interesting example, let's look at Egypt where # a Friday-Saturday weekend is observed. - weekmask_egypt = 'Sun Mon Tue Wed Thu' + weekmask_egypt = "Sun Mon Tue Wed Thu" # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime(2013, 4, 30) print(dt + 2 * bday_egypt) dts = pd.date_range(dt, periods=5, freq=bday_egypt) - print(pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + print(pd.Series(dts.weekday, dts).map(pd.Series("Mon Tue Wed Thu Fri Sat Sun".split()))) Bug fixes ~~~~~~~~~ - Plotting functions now raise a ``TypeError`` before trying to plot anything - if the associated objects have have a dtype of ``object`` (:issue:`1818`, + if the associated objects have a dtype of ``object`` (:issue:`1818`, :issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to numeric arrays if possible so that you can still plot, for example, an object array with floats. This happens before any drawing takes place which @@ -424,20 +430,20 @@ Bug fixes - ``Series.str`` now supports iteration (:issue:`3638`). You can iterate over the individual elements of each string in the ``Series``. Each iteration yields - yields a ``Series`` with either a single character at each index of the - original ``Series`` or ``NaN``. For example, + a ``Series`` with either a single character at each index of the original + ``Series`` or ``NaN``. For example, .. ipython:: python :okwarning: - strs = 'go', 'bow', 'joe', 'slow' + strs = "go", "bow", "joe", "slow" ds = pd.Series(strs) for s in ds.str: print(s) s - s.dropna().values.item() == 'w' + s.dropna().values.item() == "w" The last element yielded by the iterator will be a ``Series`` containing the last element of the longest string in the ``Series`` with all other diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 5a904d6c85c61..3c6b70fb21383 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -214,7 +214,7 @@ These were announced changes in 0.12 or prior that are taking effect as of 0.13. - Remove deprecated ``read_clipboard/to_clipboard/ExcelFile/ExcelWriter`` from ``pandas.io.parsers`` (:issue:`3717`) These are available as functions in the main pandas namespace (e.g. ``pd.read_clipboard``) - default for ``tupleize_cols`` is now ``False`` for both ``to_csv`` and ``read_csv``. Fair warning in 0.12 (:issue:`3604`) -- default for `display.max_seq_len` is now 100 rather than `None`. This activates +- default for ``display.max_seq_len`` is now 100 rather than ``None``. This activates truncated display ("...") of long sequences in various places. (:issue:`3391`) Deprecations @@ -498,7 +498,7 @@ Enhancements - ``to_dict`` now takes ``records`` as a possible out type. Returns an array of column-keyed dictionaries. 
(:issue:`4936`) -- ``NaN`` handing in get_dummies (:issue:`4446`) with `dummy_na` +- ``NaN`` handing in get_dummies (:issue:`4446`) with ``dummy_na`` .. ipython:: python @@ -668,7 +668,7 @@ Enhancements - ``Series`` now supports a ``to_frame`` method to convert it to a single-column DataFrame (:issue:`5164`) -- All R datasets listed here http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html can now be loaded into Pandas objects +- All R datasets listed here http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html can now be loaded into pandas objects .. code-block:: python @@ -1071,7 +1071,7 @@ Bug fixes as the docstring says (:issue:`4362`). - ``as_index`` is no longer ignored when doing groupby apply (:issue:`4648`, :issue:`3417`) -- JSON NaT handling fixed, NaTs are now serialized to `null` (:issue:`4498`) +- JSON NaT handling fixed, NaTs are now serialized to ``null`` (:issue:`4498`) - Fixed JSON handling of escapable characters in JSON object keys (:issue:`4593`) - Fixed passing ``keep_default_na=False`` when ``na_values=None`` @@ -1188,7 +1188,7 @@ Bug fixes single column and passing a list for ``ascending``, the argument for ``ascending`` was being interpreted as ``True`` (:issue:`4839`, :issue:`4846`) -- Fixed ``Panel.tshift`` not working. Added `freq` support to ``Panel.shift`` +- Fixed ``Panel.tshift`` not working. Added ``freq`` support to ``Panel.shift`` (:issue:`4853`) - Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser) with thousands != "," (:issue:`4596`) diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 6fe010be8fb2d..249b9555b7fd4 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -31,16 +31,16 @@ Highlights include: .. ipython:: python - df = pd.DataFrame({'A': np.array(['foo', 'bar', 'bah', 'foo', 'bar'])}) - df['A'].iloc[0] = np.nan + df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + df["A"].iloc[0] = np.nan df The recommended way to do this type of assignment is: .. ipython:: python - df = pd.DataFrame({'A': np.array(['foo', 'bar', 'bah', 'foo', 'bar'])}) - df.loc[0, 'A'] = np.nan + df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + df.loc[0, "A"] = np.nan df Output formatting enhancements @@ -52,24 +52,27 @@ Output formatting enhancements .. ipython:: python - max_info_rows = pd.get_option('max_info_rows') + max_info_rows = pd.get_option("max_info_rows") - df = pd.DataFrame({'A': np.random.randn(10), - 'B': np.random.randn(10), - 'C': pd.date_range('20130101', periods=10) - }) + df = pd.DataFrame( + { + "A": np.random.randn(10), + "B": np.random.randn(10), + "C": pd.date_range("20130101", periods=10), + } + ) df.iloc[3:6, [0, 2]] = np.nan .. ipython:: python # set to not display the null counts - pd.set_option('max_info_rows', 0) + pd.set_option("max_info_rows", 0) df.info() .. ipython:: python # this is the default (same as in 0.13.0) - pd.set_option('max_info_rows', max_info_rows) + pd.set_option("max_info_rows", max_info_rows) df.info() - Add ``show_dimensions`` display option for the new DataFrame repr to control whether the dimensions print. @@ -77,10 +80,10 @@ Output formatting enhancements .. 
ipython:: python df = pd.DataFrame([[1, 2], [3, 4]]) - pd.set_option('show_dimensions', False) + pd.set_option("show_dimensions", False) df - pd.set_option('show_dimensions', True) + pd.set_option("show_dimensions", True) df - The ``ArrayFormatter`` for ``datetime`` and ``timedelta64`` now intelligently @@ -98,10 +101,11 @@ Output formatting enhancements .. ipython:: python - df = pd.DataFrame([pd.Timestamp('20010101'), - pd.Timestamp('20040601')], columns=['age']) - df['today'] = pd.Timestamp('20130419') - df['diff'] = df['today'] - df['age'] + df = pd.DataFrame( + [pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"] + ) + df["today"] = pd.Timestamp("20130419") + df["diff"] = df["today"] - df["age"] df API changes @@ -115,8 +119,8 @@ API changes .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c']) - s.str.get_dummies(sep='|') + s = pd.Series(["a", "a|b", np.nan, "a|c"]) + s.str.get_dummies(sep="|") - Added the ``NDFrame.equals()`` method to compare if two NDFrames are equal have equal axes, dtypes, and values. Added the @@ -126,8 +130,8 @@ API changes .. code-block:: python - df = pd.DataFrame({'col': ['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0]) + df = pd.DataFrame({"col": ["foo", 0, np.nan]}) + df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0]) df.equals(df2) df.equals(df2.sort_index()) @@ -204,8 +208,9 @@ Enhancements .. code-block:: python # Try to infer the format for the index column - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, - infer_datetime_format=True) + df = pd.read_csv( + "foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True + ) - ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` files (:issue:`4133`) @@ -215,10 +220,10 @@ Enhancements .. ipython:: python - shades = ['light', 'dark'] - colors = ['red', 'green', 'blue'] + shades = ["light", "dark"] + colors = ["red", "green", "blue"] - pd.MultiIndex.from_product([shades, colors], names=['shade', 'color']) + pd.MultiIndex.from_product([shades, colors], names=["shade", "color"]) - Panel :meth:`~pandas.Panel.apply` will work on non-ufuncs. See :ref:`the docs`. @@ -379,7 +384,7 @@ Performance improvements for 0.13.1 - Series datetime/timedelta binary operations (:issue:`5801`) - DataFrame ``count/dropna`` for ``axis=1`` -- Series.str.contains now has a `regex=False` keyword which can be faster for plain (non-regex) string patterns. (:issue:`5879`) +- Series.str.contains now has a ``regex=False`` keyword which can be faster for plain (non-regex) string patterns. (:issue:`5879`) - Series.str.extract (:issue:`5944`) - ``dtypes/ftypes`` methods (:issue:`5968`) - indexing with object dtypes (:issue:`5968`) @@ -399,7 +404,7 @@ Bug fixes - Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`) - Bug in Series replace with timestamp dict (:issue:`5797`) -- read_csv/read_table now respects the `prefix` kwarg (:issue:`5732`). +- read_csv/read_table now respects the ``prefix`` kwarg (:issue:`5732`). 
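The ``prefix`` fix above is simplest to see end to end. A small sketch of the keyword as it worked in the pandas versions these notes describe (``prefix`` was later deprecated in 1.4 and removed in 2.0):

.. code-block:: python

    import io

    import pandas as pd

    data = "1,Yes,2\n3,No,4"

    # With header=None the columns would be named 0, 1, 2;
    # prefix="X" renames them X0, X1, X2 instead.
    df = pd.read_csv(io.StringIO(data), header=None, prefix="X")
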
- Bug in selection with missing values via ``.ix`` from a duplicate indexed DataFrame failing (:issue:`5835`) - Fix issue of boolean comparison on empty DataFrames (:issue:`5808`) - Bug in isnull handling ``NaT`` in an object array (:issue:`5443`) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 847a42b3a7643..b59938a9b9c9b 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -82,7 +82,7 @@ API changes - The :meth:`DataFrame.interpolate` keyword ``downcast`` default has been changed from ``infer`` to ``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`). -- When converting a dataframe to HTML it used to return `Empty DataFrame`. This special case has +- When converting a dataframe to HTML it used to return ``Empty DataFrame``. This special case has been removed, instead a header with the column names is returned (:issue:`6062`). - ``Series`` and ``Index`` now internally share more common operations, e.g. ``factorize(),nunique(),value_counts()`` are now supported on ``Index`` types as well. The ``Series.weekday`` property from is removed @@ -171,7 +171,7 @@ API changes ``expanding_cov``, ``expanding_corr`` to allow the calculation of moving window covariance and correlation matrices (:issue:`4950`). See :ref:`Computing rolling pairwise covariances and correlations - ` in the docs. + ` in the docs. .. code-block:: ipython @@ -291,12 +291,12 @@ Display changes - Regression in the display of a MultiIndexed Series with ``display.max_rows`` is less than the length of the series (:issue:`7101`) - Fixed a bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the - `large_repr` set to 'info' (:issue:`7105`) -- The `verbose` keyword in ``DataFrame.info()``, which controls whether to shorten the ``info`` + ``large_repr`` set to 'info' (:issue:`7105`) +- The ``verbose`` keyword in ``DataFrame.info()``, which controls whether to shorten the ``info`` representation, is now ``None`` by default. This will follow the global setting in ``display.max_info_columns``. The global setting can be overridden with ``verbose=True`` or ``verbose=False``. -- Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`) +- Fixed a bug with the ``info`` repr not honoring the ``display.max_info_columns`` setting (:issue:`6939`) - Offset/freq info now in Timestamp __repr__ (:issue:`4553`) .. _whatsnew_0140.parsing: @@ -603,11 +603,11 @@ Plotting - Following keywords are now acceptable for :meth:`DataFrame.plot` with ``kind='bar'`` and ``kind='barh'``: - - `width`: Specify the bar width. In previous versions, static value 0.5 was passed to matplotlib and it cannot be overwritten. (:issue:`6604`) - - `align`: Specify the bar alignment. Default is `center` (different from matplotlib). In previous versions, pandas passes `align='edge'` to matplotlib and adjust the location to `center` by itself, and it results `align` keyword is not applied as expected. (:issue:`4525`) - - `position`: Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1(right/top-end). Default is 0.5 (center). (:issue:`6604`) + - ``width``: Specify the bar width. In previous versions, static value 0.5 was passed to matplotlib and it cannot be overwritten. (:issue:`6604`) + - ``align``: Specify the bar alignment. Default is ``center`` (different from matplotlib). 
In previous versions, pandas passes ``align='edge'`` to matplotlib and adjust the location to ``center`` by itself, and it results ``align`` keyword is not applied as expected. (:issue:`4525`) + - ``position``: Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1(right/top-end). Default is 0.5 (center). (:issue:`6604`) - Because of the default `align` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coordinates as line plot. However, bar plot may differs unexpectedly when you manually adjust the bar location or drawing area, such as using `set_xlim`, `set_ylim`, etc. In this cases, please modify your script to meet with new coordinates. + Because of the default ``align`` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coordinates as line plot. However, bar plot may differs unexpectedly when you manually adjust the bar location or drawing area, such as using ``set_xlim``, ``set_ylim``, etc. In this cases, please modify your script to meet with new coordinates. - The :func:`parallel_coordinates` function now takes argument ``color`` instead of ``colors``. A ``FutureWarning`` is raised to alert that @@ -618,7 +618,7 @@ Plotting raised if the old ``data`` argument is used by name. (:issue:`6956`) - :meth:`DataFrame.boxplot` now supports ``layout`` keyword (:issue:`6769`) -- :meth:`DataFrame.boxplot` has a new keyword argument, `return_type`. It accepts ``'dict'``, +- :meth:`DataFrame.boxplot` has a new keyword argument, ``return_type``. It accepts ``'dict'``, ``'axes'``, or ``'both'``, in which case a namedtuple with the matplotlib axes and a dict of matplotlib Lines is returned. @@ -721,8 +721,8 @@ Deprecations - The following ``io.sql`` functions have been deprecated: ``tquery``, ``uquery``, ``read_frame``, ``frame_query``, ``write_frame``. -- The `percentile_width` keyword argument in :meth:`~DataFrame.describe` has been deprecated. - Use the `percentiles` keyword instead, which takes a list of percentiles to display. The +- The ``percentile_width`` keyword argument in :meth:`~DataFrame.describe` has been deprecated. + Use the ``percentiles`` keyword instead, which takes a list of percentiles to display. The default output is unchanged. - The default return type of :func:`boxplot` will change from a dict to a matplotlib Axes @@ -851,7 +851,7 @@ Enhancements - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) - Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) -- `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`) +- ``PeriodIndex`` fully supports partial string indexing like ``DatetimeIndex`` (:issue:`7043`) .. ipython:: python @@ -868,7 +868,7 @@ Enhancements - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) - ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) - Support passing ``encoding`` with xlwt (:issue:`3710`) -- Refactor Block classes removing `Block.items` attributes to avoid duplication +- Refactor Block classes removing ``Block.items`` attributes to avoid duplication in item handling (:issue:`6745`, :issue:`6988`). 
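The ``rank()`` additions a few bullets up (percentage ranks and ``method='dense'``) are easy to sketch:

.. code-block:: python

    import pandas as pd

    s = pd.Series([10, 20, 20, 30])

    # Dense ranking leaves no gaps after ties: 1.0, 2.0, 2.0, 3.0
    s.rank(method="dense")

    # pct=True rescales the (average) ranks into (0, 1]:
    # 0.25, 0.625, 0.625, 1.0
    s.rank(pct=True)
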
- Testing statements updated to use specialized asserts (:issue:`6175`) @@ -923,7 +923,7 @@ Bug fixes - ``HDFStore.select_as_multiple`` handles start and stop the same way as ``select`` (:issue:`6177`) - ``HDFStore.select_as_coordinates`` and ``select_column`` works with a ``where`` clause that results in filters (:issue:`6177`) - Regression in join of non_unique_indexes (:issue:`6329`) -- Issue with groupby ``agg`` with a single function and a a mixed-type frame (:issue:`6337`) +- Issue with groupby ``agg`` with a single function and a mixed-type frame (:issue:`6337`) - Bug in ``DataFrame.replace()`` when passing a non- ``bool`` ``to_replace`` argument (:issue:`6332`) - Raise when trying to align on different levels of a MultiIndex assignment (:issue:`3738`) @@ -1063,10 +1063,10 @@ Bug fixes - Bug in ``MultiIndex.get_level_values`` doesn't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`) - Bug in ``Groupby`` doesn't preserve ``tz`` (:issue:`3950`) - Bug in ``PeriodIndex`` partial string slicing (:issue:`6716`) -- Bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the `large_repr` set to 'info' +- Bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the ``large_repr`` set to 'info' (:issue:`7105`) - Bug in ``DatetimeIndex`` specifying ``freq`` raises ``ValueError`` when passed value is too short (:issue:`7098`) -- Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`) +- Fixed a bug with the ``info`` repr not honoring the ``display.max_info_columns`` setting (:issue:`6939`) - Bug ``PeriodIndex`` string slicing with out of bounds values (:issue:`5407`) - Fixed a memory error in the hashtable implementation/factorizer on resizing of large tables (:issue:`7157`) - Bug in ``isnull`` when applied to 0-dimensional object arrays (:issue:`7176`) @@ -1084,4 +1084,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v0.13.1..v0.14.0 \ No newline at end of file +.. contributors:: v0.13.1..v0.14.0 diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 5de193007474c..a8f8955c3c1b9 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -68,7 +68,8 @@ API changes :suppress: import pandas.tseries.offsets as offsets - d = pd.Timestamp('2014-01-01 09:00') + + d = pd.Timestamp("2014-01-01 09:00") .. ipython:: python @@ -100,15 +101,15 @@ Enhancements import pandas.tseries.offsets as offsets day = offsets.Day() - day.apply(pd.Timestamp('2014-01-01 09:00')) + day.apply(pd.Timestamp("2014-01-01 09:00")) day = offsets.Day(normalize=True) - day.apply(pd.Timestamp('2014-01-01 09:00')) + day.apply(pd.Timestamp("2014-01-01 09:00")) - ``PeriodIndex`` is represented as the same format as ``DatetimeIndex`` (:issue:`7601`) - ``StringMethods`` now work on empty Series (:issue:`7242`) - The file parsers ``read_csv`` and ``read_table`` now ignore line comments provided by - the parameter `comment`, which accepts only a single character for the C reader. + the parameter ``comment``, which accepts only a single character for the C reader. In particular, they allow for comments before file data begins (:issue:`2685`) - Add ``NotImplementedError`` for simultaneous use of ``chunksize`` and ``nrows`` for read_csv() (:issue:`6774`). @@ -123,8 +124,9 @@ Enhancements .. 
ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=10, freq='D', - tz='dateutil/Europe/London') + rng = pd.date_range( + "3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London" + ) rng.tz See :ref:`the docs `. @@ -150,7 +152,7 @@ Performance - Improvements in Series.transform for significant performance gains (:issue:`6496`) - Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`) - Regression in groupby aggregation of datetime64 dtypes (:issue:`7555`) -- Improvements in `MultiIndex.from_product` for large iterables (:issue:`7627`) +- Improvements in ``MultiIndex.from_product`` for large iterables (:issue:`7627`) .. _whatsnew_0141.experimental: @@ -217,7 +219,7 @@ Bug fixes - Bug in ``.loc`` with a list of indexers on a single-multi index level (that is not nested) (:issue:`7349`) - Bug in ``Series.map`` when mapping a dict with tuple keys of different lengths (:issue:`7333`) - Bug all ``StringMethods`` now work on empty Series (:issue:`7242`) -- Fix delegation of `read_sql` to `read_sql_query` when query does not contain 'select' (:issue:`7324`). +- Fix delegation of ``read_sql`` to ``read_sql_query`` when query does not contain 'select' (:issue:`7324`). - Bug where a string column name assignment to a ``DataFrame`` with a ``Float64Index`` raised a ``TypeError`` during a call to ``np.isnan`` (:issue:`7366`). @@ -269,7 +271,7 @@ Bug fixes - Bug in ``pandas.core.strings.str_contains`` does not properly match in a case insensitive fashion when ``regex=False`` and ``case=False`` (:issue:`7505`) - Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, and ``rolling_corr`` for two arguments with mismatched index (:issue:`7512`) - Bug in ``to_sql`` taking the boolean column as text column (:issue:`7678`) -- Bug in grouped `hist` doesn't handle `rot` kw and `sharex` kw properly (:issue:`7234`) +- Bug in grouped ``hist`` doesn't handle ``rot`` kw and ``sharex`` kw properly (:issue:`7234`) - Bug in ``.loc`` performing fallback integer indexing with ``object`` dtype indices (:issue:`7496`) - Bug (regression) in ``PeriodIndex`` constructor when passed ``Series`` objects (:issue:`7701`). diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index b80ed7446f805..fc2b070df4392 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -61,7 +61,7 @@ New features Categoricals in Series/DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new +:class:`~pandas.Categorical` can now be included in ``Series`` and ``DataFrames`` and gained new methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`). @@ -405,7 +405,7 @@ Rolling/expanding moments improvements - :func:`rolling_window` now normalizes the weights properly in rolling mean mode (`mean=True`) so that the calculated weighted means (e.g. 'triang', 'gaussian') are distributed about the same means as those - calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) + calculated without weighting (i.e. 'boxcar'). See :ref:`the note on normalization ` for further details. (:issue:`7618`) .. 
ipython:: python @@ -490,7 +490,7 @@ Rolling/expanding moments improvements now have an optional ``adjust`` argument, just like :func:`ewma` does, affecting how the weights are calculated. The default value of ``adjust`` is ``True``, which is backwards-compatible. - See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`) + See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7911`) - :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcov`, and :func:`ewmcorr` now have an optional ``ignore_na`` argument. @@ -595,7 +595,7 @@ Rolling/expanding moments improvements 3 1.425439 dtype: float64 - See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`) + See :ref:`Exponentially weighted moment functions ` for details. (:issue:`7912`) .. _whatsnew_0150.sql: @@ -808,7 +808,7 @@ Other notable API changes: .. _whatsnew_0150.blanklines: -- Made both the C-based and Python engines for `read_csv` and `read_table` ignore empty lines in input as well as +- Made both the C-based and Python engines for ``read_csv`` and ``read_table`` ignore empty lines in input as well as white space-filled lines, as long as ``sep`` is not white space. This is an API change that can be controlled by the keyword parameter ``skip_blank_lines``. See :ref:`the docs ` (:issue:`4466`) @@ -830,7 +830,7 @@ Other notable API changes: Previously this would have yielded a column of ``datetime64`` dtype, but without timezone info. - The behaviour of assigning a column to an existing dataframe as `df['a'] = i` + The behaviour of assigning a column to an existing dataframe as ``df['a'] = i`` remains unchanged (this already returned an ``object`` column with a timezone). - When passing multiple levels to :meth:`~pandas.DataFrame.stack()`, it will now raise a ``ValueError`` when the @@ -894,7 +894,7 @@ a transparent change with only very limited API implications (:issue:`5080`, :is - you may need to unpickle pandas version < 0.15.0 pickles using ``pd.read_pickle`` rather than ``pickle.load``. See :ref:`pickle docs ` - when plotting with a ``PeriodIndex``, the matplotlib internal axes will now be arrays of ``Period`` rather than a ``PeriodIndex`` (this is similar to how a ``DatetimeIndex`` passes arrays of ``datetimes`` now) - MultiIndexes will now raise similarly to other pandas objects w.r.t. truth testing, see :ref:`here ` (:issue:`7897`). -- When plotting a DatetimeIndex directly with matplotlib's `plot` function, +- When plotting a DatetimeIndex directly with matplotlib's ``plot`` function, the axis labels will no longer be formatted as dates but as integers (the internal representation of a ``datetime64``). **UPDATE** This is fixed in 0.15.1, see :ref:`here `. diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index f9c17058dc3ee..a1d4f9d14a905 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -23,7 +23,7 @@ API changes .. ipython:: python - s = pd.Series(pd.date_range('20130101', periods=5, freq='D')) + s = pd.Series(pd.date_range("20130101", periods=5, freq="D")) s.iloc[2] = np.nan s @@ -52,8 +52,7 @@ API changes .. ipython:: python np.random.seed(2718281) - df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), - columns=['jim', 'joe']) + df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), columns=["jim", "joe"]) df.head() ts = pd.Series(5 * np.random.randint(0, 3, 10)) @@ -80,9 +79,9 @@ API changes .. 
ipython:: python - df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)}) + df = pd.DataFrame({"jim": range(5), "joe": range(5, 10)}) df - gr = df.groupby(df['jim'] < 2) + gr = df.groupby(df["jim"] < 2) previous behavior (excludes 1st column from output): @@ -106,7 +105,7 @@ API changes .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], [4, 3, 2, 1]) + s = pd.Series(["a", "b", "c", "d"], [4, 3, 2, 1]) s previous behavior: @@ -208,6 +207,7 @@ Enhancements .. ipython:: python from collections import deque + df1 = pd.DataFrame([1, 2, 3]) df2 = pd.DataFrame([4, 5, 6]) @@ -228,8 +228,9 @@ Enhancements .. ipython:: python - dfi = pd.DataFrame(1, index=pd.MultiIndex.from_product([['a'], - range(1000)]), columns=['A']) + dfi = pd.DataFrame( + 1, index=pd.MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) previous behavior: @@ -249,7 +250,7 @@ Enhancements dfi.memory_usage(index=True) -- Added Index properties `is_monotonic_increasing` and `is_monotonic_decreasing` (:issue:`8680`). +- Added Index properties ``is_monotonic_increasing`` and ``is_monotonic_decreasing`` (:issue:`8680`). - Added option to select columns when importing Stata files (:issue:`7935`) @@ -305,7 +306,7 @@ Bug fixes - Fixed a bug where plotting a column ``y`` and specifying a label would mutate the index name of the original DataFrame (:issue:`8494`) - Fix regression in plotting of a DatetimeIndex directly with matplotlib (:issue:`8614`). - Bug in ``date_range`` where partially-specified dates would incorporate current date (:issue:`6961`) -- Bug in Setting by indexer to a scalar value with a mixed-dtype `Panel4d` was failing (:issue:`8702`) +- Bug in Setting by indexer to a scalar value with a mixed-dtype ``Panel4d`` was failing (:issue:`8702`) - Bug where ``DataReader``'s would fail if one of the symbols passed was invalid. Now returns data for valid symbols and np.nan for invalid (:issue:`8494`) - Bug in ``get_quote_yahoo`` that wouldn't allow non-float return values (:issue:`5229`). diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index a4eabb97471de..b5b25796fea73 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -136,8 +136,8 @@ Enhancements - Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. -- Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. -- Added support for ``searchsorted()`` on `Categorical` class (:issue:`8420`). +- Added ability to export Categorical data to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. +- Added support for ``searchsorted()`` on ``Categorical`` class (:issue:`8420`). 
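A short sketch of the ``Categorical.searchsorted()`` support noted above; the categorical has to be ordered, and the probe value must be an existing category:

.. code-block:: python

    import pandas as pd

    cat = pd.Categorical(
        ["a", "b", "c", "c"], categories=["a", "b", "c"], ordered=True
    )

    # First insertion point that keeps the categorical sorted -> 2
    cat.searchsorted("c")
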
Other enhancements: @@ -171,7 +171,7 @@ Other enhancements: 3 False True False True 4 True True True True -- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). +- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on ``Timestamp`` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). @@ -241,7 +241,7 @@ Bug fixes - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is not lexically sorted or unique (:issue:`7724`) - BUG CSV: fix problem with trailing white space in skipped rows, (:issue:`8679`), (:issue:`8661`), (:issue:`8983`) - Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`) -- Bug in `StataWriter` the produces writes strings with 244 characters irrespective of actual size (:issue:`8969`) +- Bug in ``StataWriter`` the produces writes strings with 244 characters irrespective of actual size (:issue:`8969`) - Fixed ValueError raised by cummin/cummax when datetime64 Series contains NaT. (:issue:`8965`) - Bug in DataReader returns object dtype if there are missing values (:issue:`8980`) - Bug in plotting if sharex was enabled and index was a timeseries, would show labels on multiple axes (:issue:`3964`). diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 4ad533e68e275..8d0d6854cbf85 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -89,7 +89,7 @@ See the :ref:`documentation ` for more. (:issue:`922 Interaction with scipy.sparse ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here `). For example, given a SparseSeries with MultiIndex we can convert to a `scipy.sparse.coo_matrix` by specifying the row and column labels as index levels: +Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here `). For example, given a SparseSeries with MultiIndex we can convert to a ``scipy.sparse.coo_matrix`` by specifying the row and column labels as index levels: .. code-block:: python @@ -630,7 +630,7 @@ Bug fixes - Bug in ``Series.values_counts`` with excluding ``NaN`` for categorical type ``Series`` with ``dropna=True`` (:issue:`9443`) - Fixed missing numeric_only option for ``DataFrame.std/var/sem`` (:issue:`9201`) - Support constructing ``Panel`` or ``Panel4D`` with scalar data (:issue:`8285`) -- ``Series`` text representation disconnected from `max_rows`/`max_columns` (:issue:`7508`). +- ``Series`` text representation disconnected from ``max_rows``/``max_columns`` (:issue:`7508`). 
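The ``SparseSeries.to_coo`` change above converts a MultiIndexed sparse series into a ``scipy.sparse.coo_matrix`` by mapping index levels onto rows and columns. A sketch against the 0.16-era API (``SparseSeries`` was removed in pandas 1.0, with ``Series.sparse.to_coo()`` as the modern spelling), assuming scipy is installed:

.. code-block:: python

    import numpy as np
    import pandas as pd

    s = pd.Series([3.0, np.nan, 1.0, 2.0])
    s.index = pd.MultiIndex.from_tuples(
        [(1, "a"), (1, "b"), (2, "a"), (2, "b")], names=["row", "col"]
    )

    # to_coo returns the sparse matrix plus the labels backing its axes
    ss = s.to_sparse()  # SparseSeries, pandas < 1.0 only
    A, rows, columns = ss.to_coo(row_levels=["row"], column_levels=["col"])
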
\ diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 8dcac4c1044be..269854111373f 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -6,7 +6,7 @@ Version 0.16.1 (May 11, 2015) {{ header }} -This is a minor bug-fix release from 0.16.0 and includes a a large number of +This is a minor bug-fix release from 0.16.0 and includes a large number of bug fixes along several new features, enhancements, and performance improvements. We recommend that all users upgrade to this version. @@ -72,7 +72,7 @@ setting the index of a ``DataFrame/Series`` with a ``category`` dtype would conv Out[4]: Index(['c', 'a', 'b'], dtype='object') -setting the index, will create create a ``CategoricalIndex`` +setting the index, will create a ``CategoricalIndex`` .. code-block:: ipython @@ -209,9 +209,8 @@ when sampling from rows. .. ipython:: python - df = pd.DataFrame({'col1': [9, 8, 7, 6], - 'weight_column': [0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights='weight_column') + df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) + df.sample(n=3, weights="weight_column") .. _whatsnew_0161.enhancements.string: @@ -229,20 +228,20 @@ enhancements make string operations easier and more consistent with standard pyt .. ipython:: python - idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) + idx = pd.Index([" jack", "jill ", " jesse ", "frank"]) idx.str.strip() - One special case for the `.str` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor + One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression to work naturally: .. ipython:: python - idx = pd.Index(['a1', 'a2', 'b1', 'b2']) + idx = pd.Index(["a1", "a2", "b1", "b2"]) s = pd.Series(range(4), index=idx) s - idx.str.startswith('a') - s[s.index.str.startswith('a')] + idx.str.startswith("a") + s[s.index.str.startswith("a")] - The following new methods are accessible via ``.str`` accessor to apply the function to each values. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) @@ -257,21 +256,21 @@ enhancements make string operations easier and more consistent with standard pyt .. ipython:: python - s = pd.Series(['a,b', 'a,c', 'b,c']) + s = pd.Series(["a,b", "a,c", "b,c"]) # return Series - s.str.split(',') + s.str.split(",") # return DataFrame - s.str.split(',', expand=True) + s.str.split(",", expand=True) - idx = pd.Index(['a,b', 'a,c', 'b,c']) + idx = pd.Index(["a,b", "a,c", "b,c"]) # return Index - idx.str.split(',') + idx.str.split(",") # return MultiIndex - idx.str.split(',', expand=True) + idx.str.split(",", expand=True) - Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) @@ -286,9 +285,9 @@ Other enhancements .. ipython:: python - pd.Timestamp('2014-08-01 09:00') + pd.tseries.offsets.BusinessHour() - pd.Timestamp('2014-08-01 07:00') + pd.tseries.offsets.BusinessHour() - pd.Timestamp('2014-08-01 16:30') + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 09:00") + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 07:00") + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 16:30") + pd.tseries.offsets.BusinessHour() - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) @@ -300,8 +299,8 @@ Other enhancements .. 
ipython:: python - df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) - df.drop(['A', 'X'], axis=1, errors='ignore') + df = pd.DataFrame(np.random.randn(3, 3), columns=["A", "B", "C"]) + df.drop(["A", "X"], axis=1, errors="ignore") - Add support for separating years and quarters using dashes, for example 2014-Q1. (:issue:`9688`) @@ -310,7 +309,7 @@ Other enhancements - ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`) - ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) -- Allow timedelta string conversion when leading zero is missing from time definition, ie `0:00:00` vs `00:00:00`. (:issue:`9570`) +- Allow timedelta string conversion when leading zero is missing from time definition, ie ``0:00:00`` vs ``00:00:00``. (:issue:`9570`) - Allow ``Panel.shift`` with ``axis='items'`` (:issue:`9890`) - Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) @@ -329,11 +328,11 @@ Other enhancements API changes ~~~~~~~~~~~ -- When passing in an ax to ``df.plot( ..., ax=ax)``, the `sharex` kwarg will now default to `False`. +- When passing in an ax to ``df.plot( ..., ax=ax)``, the ``sharex`` kwarg will now default to ``False``. The result is that the visibility of xlabels and xticklabels will not anymore be changed. You have to do that by yourself for the right axes in your figure or set ``sharex=True`` explicitly (but this changes the visible for all axes in the figure, not only the one which is passed in!). - If pandas creates the subplots itself (e.g. no passed in `ax` kwarg), then the + If pandas creates the subplots itself (e.g. no passed in ``ax`` kwarg), then the default is still ``sharex=True`` and the visibility changes are applied. - :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously @@ -382,19 +381,16 @@ New behavior .. ipython:: python - pd.set_option('display.width', 80) - pd.Index(range(4), name='foo') - pd.Index(range(30), name='foo') - pd.Index(range(104), name='foo') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'], - ordered=True, name='foobar') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'] * 10, - ordered=True, name='foobar') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'] * 100, - ordered=True, name='foobar') - pd.date_range('20130101', periods=4, name='foo', tz='US/Eastern') - pd.date_range('20130101', periods=25, freq='D') - pd.date_range('20130101', periods=104, name='foo', tz='US/Eastern') + pd.set_option("display.width", 80) + pd.Index(range(4), name="foo") + pd.Index(range(30), name="foo") + pd.Index(range(104), name="foo") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"], ordered=True, name="foobar") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"] * 10, ordered=True, name="foobar") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"] * 100, ordered=True, name="foobar") + pd.date_range("20130101", periods=4, name="foo", tz="US/Eastern") + pd.date_range("20130101", periods=25, freq="D") + pd.date_range("20130101", periods=104, name="foo", tz="US/Eastern") .. _whatsnew_0161.performance: @@ -442,7 +438,7 @@ Bug fixes - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. 
(:issue:`9832`) - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) - Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`) -- Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`) +- Bug in which ``SparseDataFrame`` could not take ``nan`` as a column name (:issue:`8822`) - Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`) - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`) - Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index a3c34db09f555..37e8c64ea9ced 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -6,7 +6,7 @@ Version 0.16.2 (June 12, 2015) {{ header }} -This is a minor bug-fix release from 0.16.1 and includes a a large number of +This is a minor bug-fix release from 0.16.1 and includes a large number of bug fixes along some new features (:meth:`~DataFrame.pipe` method), enhancements, and performance improvements. We recommend that all users upgrade to this version. @@ -48,9 +48,10 @@ This can be rewritten as .. code-block:: python - (df.pipe(h) # noqa F821 - .pipe(g, arg1=1) # noqa F821 - .pipe(f, arg2=2, arg3=3) # noqa F821 + ( + df.pipe(h) # noqa F821 + .pipe(g, arg1=1) # noqa F821 + .pipe(f, arg2=2, arg3=3) # noqa F821 ) Now both the code and the logic flow from top to bottom. Keyword arguments are next to @@ -64,15 +65,16 @@ of ``(function, keyword)`` indicating where the DataFrame should flow. For examp import statsmodels.formula.api as sm - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") # sm.ols takes (formula, data) - (bb.query('h > 0') - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') - .fit() - .summary() - ) + ( + bb.query("h > 0") + .assign(ln_h=lambda df: np.log(df.h)) + .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .fit() + .summary() + ) The pipe method is inspired by unix pipes, which stream text through processes. More recently dplyr_ and magrittr_ have introduced the @@ -89,7 +91,7 @@ See the :ref:`documentation ` for more. (:issue:`10129`) Other enhancements ^^^^^^^^^^^^^^^^^^ -- Added `rsplit` to Index/Series StringMethods (:issue:`10303`) +- Added ``rsplit`` to Index/Series StringMethods (:issue:`10303`) - Removed the hard-coded size limits on the ``DataFrame`` HTML representation in the IPython notebook, and leave this to IPython itself (only for IPython @@ -145,7 +147,7 @@ Bug fixes - Bug in ``setitem`` where type promotion is applied to the entire block (:issue:`10280`) - Bug in ``Series`` arithmetic methods may incorrectly hold names (:issue:`10068`) - Bug in ``GroupBy.get_group`` when grouping on multiple keys, one of which is categorical. 
(:issue:`10132`) -- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetics ( :issue:`9926`) +- Bug in ``DatetimeIndex`` and ``TimedeltaIndex`` names are lost after timedelta arithmetic ( :issue:`9926`) - Bug in ``DataFrame`` construction from nested ``dict`` with ``datetime64`` (:issue:`10160`) - Bug in ``Series`` construction from ``dict`` with ``datetime64`` keys (:issue:`9456`) - Bug in ``Series.plot(label="LABEL")`` not correctly setting the label (:issue:`10119`) diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 11c252192be6b..d8f39a7d6e3c0 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -40,7 +40,7 @@ Highlights include: - Plotting methods are now available as attributes of the ``.plot`` accessor, see :ref:`here ` - The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here ` - Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here ` -- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats, +- The default for ``to_datetime`` will now be to ``raise`` when presented with unparsable formats, previously this would return the original input. Also, date parse functions now return consistent results. See :ref:`here ` - The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even @@ -80,9 +80,13 @@ The new implementation allows for having a single-timezone across all rows, with .. ipython:: python - df = pd.DataFrame({'A': pd.date_range('20130101', periods=3), - 'B': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'C': pd.date_range('20130101', periods=3, tz='CET')}) + df = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=3), + "B": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "C": pd.date_range("20130101", periods=3, tz="CET"), + } + ) df df.dtypes @@ -95,8 +99,8 @@ This uses a new-dtype representation as well, that is very similar in look-and-f .. ipython:: python - df['B'].dtype - type(df['B'].dtype) + df["B"].dtype + type(df["B"].dtype) .. note:: @@ -119,8 +123,8 @@ This uses a new-dtype representation as well, that is very similar in look-and-f .. ipython:: python - pd.date_range('20130101', periods=3, tz='US/Eastern') - pd.date_range('20130101', periods=3, tz='US/Eastern').dtype + pd.date_range("20130101", periods=3, tz="US/Eastern") + pd.date_range("20130101", periods=3, tz="US/Eastern").dtype .. _whatsnew_0170.gil: @@ -138,9 +142,10 @@ as well as the ``.sum()`` operation. N = 1000000 ngroups = 10 - df = DataFrame({'key': np.random.randint(0, ngroups, size=N), - 'data': np.random.randn(N)}) - df.groupby('key')['data'].sum() + df = DataFrame( + {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} + ) + df.groupby("key")["data"].sum() Releasing of the GIL could benefit an application that uses threads for user interactions (e.g. QT_), or performing multi-threaded computations. A nice example of a library that can handle these types of computation-in-parallel is the dask_ library. @@ -189,16 +194,16 @@ We are now supporting a ``Series.dt.strftime`` method for datetime-likes to gene .. ipython:: python # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) + s = pd.Series(pd.date_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") .. 
ipython:: python # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) + s = pd.Series(pd.period_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") The string format is as the python standard library and details can be found `here `_ @@ -210,7 +215,7 @@ Series.dt.total_seconds .. ipython:: python # TimedeltaIndex - s = pd.Series(pd.timedelta_range('1 minutes', periods=4)) + s = pd.Series(pd.timedelta_range("1 minutes", periods=4)) s s.dt.total_seconds() @@ -225,18 +230,18 @@ A multiplied freq represents a span of corresponding length. The example below c .. ipython:: python - p = pd.Period('2015-08-01', freq='3D') + p = pd.Period("2015-08-01", freq="3D") p p + 1 p - 2 p.to_timestamp() - p.to_timestamp(how='E') + p.to_timestamp(how="E") You can use the multiplied freq in ``PeriodIndex`` and ``period_range``. .. ipython:: python - idx = pd.period_range('2015-08-01', periods=4, freq='2D') + idx = pd.period_range("2015-08-01", periods=4, freq="2D") idx idx + 1 @@ -249,14 +254,14 @@ Support for SAS XPORT files .. code-block:: python - df = pd.read_sas('sas_xport.xpt') + df = pd.read_sas("sas_xport.xpt") It is also possible to obtain an iterator and read an XPORT file incrementally. .. code-block:: python - for df in pd.read_sas('sas_xport.xpt', chunksize=10000): + for df in pd.read_sas("sas_xport.xpt", chunksize=10000): do_something(df) See the :ref:`docs ` for more details. @@ -270,12 +275,12 @@ Support for math functions in .eval() .. code-block:: python - df = pd.DataFrame({'a': np.random.randn(10)}) + df = pd.DataFrame({"a": np.random.randn(10)}) df.eval("b = sin(a)") -The support math functions are `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, -`sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`, `arccos`, `arctan`, `arccosh`, -`arcsinh`, `arctanh`, `abs` and `arctan2`. +The support math functions are ``sin``, ``cos``, ``exp``, ``log``, ``expm1``, ``log1p``, +``sqrt``, ``sinh``, ``cosh``, ``tanh``, ``arcsin``, ``arccos``, ``arctan``, ``arccosh``, +``arcsinh``, ``arctanh``, ``abs`` and ``arctan2``. These functions map to the intrinsics for the ``NumExpr`` engine. For the Python engine, they are mapped to ``NumPy`` calls. @@ -292,23 +297,26 @@ See the :ref:`documentation ` for more details. .. ipython:: python - df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=pd.MultiIndex.from_product( - [['foo', 'bar'], ['a', 'b']], names=['col1', 'col2']), - index=pd.MultiIndex.from_product([['j'], ['l', 'k']], - names=['i1', 'i2'])) + df = pd.DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=pd.MultiIndex.from_product( + [["foo", "bar"], ["a", "b"]], names=["col1", "col2"] + ), + index=pd.MultiIndex.from_product([["j"], ["l", "k"]], names=["i1", "i2"]), + ) df - df.to_excel('test.xlsx') + df.to_excel("test.xlsx") - df = pd.read_excel('test.xlsx', header=[0, 1], index_col=[0, 1]) + df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1]) df .. ipython:: python :suppress: import os - os.remove('test.xlsx') + + os.remove("test.xlsx") Previously, it was necessary to specify the ``has_index_names`` argument in ``read_excel``, if the serialized data had index names. For version 0.17.0 the output format of ``to_excel`` @@ -354,14 +362,14 @@ Some East Asian countries use Unicode characters its width is corresponding to 2 .. ipython:: python - df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']}) + df = pd.DataFrame({u"国籍": ["UK", u"日本"], u"名前": ["Alice", u"しのぶ"]}) df; .. image:: ../_static/option_unicode01.png .. 
ipython:: python - pd.set_option('display.unicode.east_asian_width', True) + pd.set_option("display.unicode.east_asian_width", True) df; .. image:: ../_static/option_unicode02.png @@ -371,7 +379,7 @@ For further details, see :ref:`here ` .. ipython:: python :suppress: - pd.set_option('display.unicode.east_asian_width', False) + pd.set_option("display.unicode.east_asian_width", False) .. _whatsnew_0170.enhancements.other: @@ -391,9 +399,9 @@ Other enhancements .. ipython:: python - df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']}) - df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) - pd.merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]}) + df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) + pd.merge(df1, df2, on="col1", how="outer", indicator=True) For more, see the :ref:`updated docs ` @@ -407,7 +415,7 @@ Other enhancements .. ipython:: python - foo = pd.Series([1, 2], name='foo') + foo = pd.Series([1, 2], name="foo") bar = pd.Series([1, 2]) baz = pd.Series([4, 5]) @@ -434,46 +442,43 @@ Other enhancements .. ipython:: python ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) - ser.interpolate(limit=1, limit_direction='both') + ser.interpolate(limit=1, limit_direction="both") - Added a ``DataFrame.round`` method to round the values to a variable number of decimal places (:issue:`10568`). .. ipython:: python - df = pd.DataFrame(np.random.random([3, 3]), - columns=['A', 'B', 'C'], - index=['first', 'second', 'third']) + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) df df.round(2) - df.round({'A': 0, 'C': 2}) + df.round({"A": 0, "C": 2}) - ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to target first, last, and all duplicates. The ``take_last`` keyword is deprecated, see :ref:`here ` (:issue:`6511`, :issue:`8505`) .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D']) + s = pd.Series(["A", "B", "C", "A", "B", "D"]) s.drop_duplicates() - s.drop_duplicates(keep='last') + s.drop_duplicates(keep="last") s.drop_duplicates(keep=False) - Reindex now has a ``tolerance`` argument that allows for finer control of :ref:`basics.limits_on_reindex_fill` (:issue:`10411`): .. ipython:: python - df = pd.DataFrame({'x': range(5), - 't': pd.date_range('2000-01-01', periods=5)}) - df.reindex([0.1, 1.9, 3.5], - method='nearest', - tolerance=0.2) + df = pd.DataFrame({"x": range(5), "t": pd.date_range("2000-01-01", periods=5)}) + df.reindex([0.1, 1.9, 3.5], method="nearest", tolerance=0.2) When used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. This allows you to specify tolerance with a string: .. ipython:: python - df = df.set_index('t') - df.reindex(pd.to_datetime(['1999-12-31']), - method='nearest', - tolerance='1 day') + df = df.set_index("t") + df.reindex(pd.to_datetime(["1999-12-31"]), method="nearest", tolerance="1 day") ``tolerance`` is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods. @@ -519,7 +524,7 @@ Other enhancements - ``DataFrame.apply`` will return a Series of dicts if the passed function returns a dict and ``reduce=True`` (:issue:`8735`). -- Allow passing `kwargs` to the interpolation methods (:issue:`10378`). +- Allow passing ``kwargs`` to the interpolation methods (:issue:`10378`). 
- Improved error message when concatenating an empty iterable of ``Dataframe`` objects (:issue:`9157`) @@ -627,13 +632,13 @@ Of course you can coerce this as well. .. ipython:: python - pd.to_datetime(['2009-07-31', 'asd'], errors='coerce') + pd.to_datetime(["2009-07-31", "asd"], errors="coerce") To keep the previous behavior, you can use ``errors='ignore'``: .. ipython:: python - pd.to_datetime(['2009-07-31', 'asd'], errors='ignore') + pd.to_datetime(["2009-07-31", "asd"], errors="ignore") Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword has been deprecated in favor of ``errors='coerce'``. @@ -667,9 +672,9 @@ New behavior: .. ipython:: python - pd.Timestamp('2012Q2') - pd.Timestamp('2014') - pd.DatetimeIndex(['2012Q2', '2014']) + pd.Timestamp("2012Q2") + pd.Timestamp("2014") + pd.DatetimeIndex(["2012Q2", "2014"]) .. note:: @@ -678,6 +683,7 @@ New behavior: .. ipython:: python import pandas.tseries.offsets as offsets + pd.Timestamp.now() pd.Timestamp.now() + offsets.DateOffset(years=1) @@ -762,7 +768,7 @@ Usually you simply want to know which values are null. .. warning:: You generally will want to use ``isnull/notnull`` for these types of comparisons, as ``isnull/notnull`` tells you which elements are null. One has to be - mindful that ``nan's`` don't compare equal, but ``None's`` do. Note that Pandas/numpy uses the fact that ``np.nan != np.nan``, and treats ``None`` like ``np.nan``. + mindful that ``nan's`` don't compare equal, but ``None's`` do. Note that pandas/numpy uses the fact that ``np.nan != np.nan``, and treats ``None`` like ``np.nan``. .. ipython:: python @@ -780,8 +786,9 @@ Previous behavior: .. ipython:: python - df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], - 'col2': [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame( + {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} + ) df_with_missing @@ -806,18 +813,16 @@ New behavior: .. ipython:: python - df_with_missing.to_hdf('file.h5', - 'df_with_missing', - format='table', - mode='w') + df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") - pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf("file.h5", "df_with_missing") .. ipython:: python :suppress: import os - os.remove('file.h5') + + os.remove("file.h5") See the :ref:`docs ` for more details. @@ -848,8 +853,8 @@ regular formatting as well as scientific notation, similar to how numpy's ``prec .. ipython:: python - pd.set_option('display.precision', 2) - pd.DataFrame({'x': [123.456789]}) + pd.set_option("display.precision", 2) + pd.DataFrame({"x": [123.456789]}) To preserve output behavior with prior versions the default value of ``display.precision`` has been reduced to ``6`` from ``7``. @@ -857,7 +862,7 @@ from ``7``. .. ipython:: python :suppress: - pd.set_option('display.precision', 6) + pd.set_option("display.precision", 6) .. _whatsnew_0170.api_breaking.categorical_unique: @@ -871,14 +876,11 @@ Changes to ``Categorical.unique`` .. ipython:: python - cat = pd.Categorical(['C', 'A', 'B', 'C'], - categories=['A', 'B', 'C'], - ordered=True) + cat = pd.Categorical(["C", "A", "B", "C"], categories=["A", "B", "C"], ordered=True) cat cat.unique() - cat = pd.Categorical(['C', 'A', 'B', 'C'], - categories=['A', 'B', 'C']) + cat = pd.Categorical(["C", "A", "B", "C"], categories=["A", "B", "C"]) cat cat.unique() @@ -909,7 +911,7 @@ Other API changes - The metadata properties of subclasses of pandas objects will now be serialized (:issue:`10553`). 
- ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above (:issue:`10508`) - When constructing ``DataFrame`` with an array of ``complex64`` dtype previously meant the corresponding column - was automatically promoted to the ``complex128`` dtype. Pandas will now preserve the itemsize of the input for complex data (:issue:`10952`) + was automatically promoted to the ``complex128`` dtype. pandas will now preserve the itemsize of the input for complex data (:issue:`10952`) - some numeric reduction operators would return ``ValueError``, rather than ``TypeError`` on object types that includes strings and numbers (:issue:`11131`) - Passing currently unsupported ``chunksize`` argument to ``read_excel`` or ``ExcelFile.parse`` will now raise ``NotImplementedError`` (:issue:`8011`) - Allow an ``ExcelFile`` object to be passed into ``read_excel`` (:issue:`11198`) @@ -980,9 +982,11 @@ Removal of prior version deprecations/changes .. ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.randn(5, 2), - columns=list('AB'), - index=pd.date_range('2013-01-01', periods=5)) + df = pd.DataFrame( + np.random.randn(5, 2), + columns=list("AB"), + index=pd.date_range("2013-01-01", periods=5), + ) df Previously @@ -1005,7 +1009,7 @@ Removal of prior version deprecations/changes .. ipython:: python - df.add(df.A, axis='index') + df.add(df.A, axis="index") - Remove ``table`` keyword in ``HDFStore.put/append``, in favor of using ``format=`` (:issue:`4645`) diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst index 5d15a01aee5a0..6b0a28ec47568 100644 --- a/doc/source/whatsnew/v0.17.1.rst +++ b/doc/source/whatsnew/v0.17.1.rst @@ -52,8 +52,8 @@ Here's a quick example: .. ipython:: python np.random.seed(123) - df = pd.DataFrame(np.random.randn(10, 5), columns=list('abcde')) - html = df.style.background_gradient(cmap='viridis', low=.5) + df = pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")) + html = df.style.background_gradient(cmap="viridis", low=0.5) We can render the HTML to get the following table. @@ -80,14 +80,14 @@ Enhancements .. ipython:: python - df = pd.DataFrame({'A': ['foo'] * 1000}) # noqa: F821 - df['B'] = df['A'].astype('category') + df = pd.DataFrame({"A": ["foo"] * 1000}) # noqa: F821 + df["B"] = df["A"].astype("category") # shows the '+' as we have object dtypes df.info() # we have an accurate memory assessment (but can be expensive to compute this) - df.info(memory_usage='deep') + df.info(memory_usage="deep") - ``Index`` now has a ``fillna`` method (:issue:`10089`) @@ -99,11 +99,11 @@ Enhancements .. ipython:: python - s = pd.Series(list('aabb')).astype('category') + s = pd.Series(list("aabb")).astype("category") s s.str.contains("a") - date = pd.Series(pd.date_range('1/1/2015', periods=5)).astype('category') + date = pd.Series(pd.date_range("1/1/2015", periods=5)).astype("category") date date.dt.day diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index fbe24675ddfe2..829c04dac9f2d 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -53,7 +53,7 @@ New features Window functions are now methods ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Window functions have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions, to have a similar API to that of ``.groupby``. 
See the full documentation :ref:`here ` (:issue:`11603`, :issue:`12373`) +Window functions have been refactored to be methods on ``Series/DataFrame`` objects, rather than top-level functions, which are now deprecated. This allows these window-type functions, to have a similar API to that of ``.groupby``. See the full documentation :ref:`here ` (:issue:`11603`, :issue:`12373`) .. ipython:: python @@ -290,7 +290,7 @@ A new, friendlier ``ValueError`` is added to protect against the mistake of supp .. code-block:: ipython In [2]: pd.Series(['a', 'b', np.nan, 'c']).str.cat(' ') - ValueError: Did you mean to supply a `sep` keyword? + ValueError: Did you mean to supply a ``sep`` keyword? .. _whatsnew_0180.enhancements.rounding: @@ -610,7 +610,7 @@ Subtraction by ``Timedelta`` in a ``Series`` by a ``Timestamp`` works (:issue:`1 pd.Timestamp('2012-01-01') - ser -``NaT.isoformat()`` now returns ``'NaT'``. This change allows allows +``NaT.isoformat()`` now returns ``'NaT'``. This change allows ``pd.Timestamp`` to rehydrate any timestamp like object from its isoformat (:issue:`12300`). diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 13ed6bc38163b..3db00f686d62c 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -42,6 +42,7 @@ see :ref:`Custom Business Hour ` (:issue:`11514`) from pandas.tseries.offsets import CustomBusinessHour from pandas.tseries.holiday import USFederalHolidayCalendar + bhour_us = CustomBusinessHour(calendar=USFederalHolidayCalendar()) Friday before MLK Day @@ -49,6 +50,7 @@ Friday before MLK Day .. ipython:: python import datetime + dt = datetime.datetime(2014, 1, 17, 15) dt + bhour_us @@ -72,41 +74,42 @@ Previously you would have to do this to get a rolling window mean per-group: .. ipython:: python - df = pd.DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}) + df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df .. ipython:: python - df.groupby('A').apply(lambda x: x.rolling(4).B.mean()) + df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) Now you can do: .. ipython:: python - df.groupby('A').rolling(4).B.mean() + df.groupby("A").rolling(4).B.mean() For ``.resample(..)`` type of operations, previously you would have to: .. ipython:: python - df = pd.DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df .. ipython:: python - df.groupby('group').apply(lambda x: x.resample('1D').ffill()) + df.groupby("group").apply(lambda x: x.resample("1D").ffill()) Now you can do: .. ipython:: python - df.groupby('group').resample('1D').ffill() + df.groupby("group").resample("1D").ffill() .. _whatsnew_0181.enhancements.method_chain: @@ -129,9 +132,7 @@ arguments. .. 
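ipython:: python

    # Sketch, not from the patch: callables passed to ``.mask`` receive the
    # calling object, so a selection needs no temporary variable.
    pd.Series([1, 5, 2]).mask(lambda x: x > 3, 0)

The ``DataFrame`` variant below behaves the same way.

..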
ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': [4, 5, 6], - 'C': [7, 8, 9]}) + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) df.where(lambda x: x > 4, lambda x: x + 10) Methods ``.loc[]``, ``.iloc[]``, ``.ix[]`` @@ -146,7 +147,7 @@ can return a valid boolean indexer or anything which is valid for these indexer' df.loc[lambda x: x.A >= 2, lambda x: x.sum() > 10] # callable returns list of labels - df.loc[lambda x: [1, 2], lambda x: ['A', 'B']] + df.loc[lambda x: [1, 2], lambda x: ["A", "B"]] Indexing with``[]`` """"""""""""""""""" @@ -157,17 +158,15 @@ class and index type. .. ipython:: python - df[lambda x: 'A'] + df[lambda x: "A"] Using these methods / indexers, you can chain data selection operations without using temporary variable. .. ipython:: python - bb = pd.read_csv('data/baseball.csv', index_col='id') - (bb.groupby(['year', 'team']) - .sum() - .loc[lambda df: df.r > 100]) + bb = pd.read_csv("data/baseball.csv", index_col="id") + (bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100]) .. _whatsnew_0181.partial_string_indexing: @@ -180,13 +179,13 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI dft2 = pd.DataFrame( np.random.randn(20, 1), - columns=['A'], - index=pd.MultiIndex.from_product([pd.date_range('20130101', - periods=10, - freq='12H'), - ['a', 'b']])) + columns=["A"], + index=pd.MultiIndex.from_product( + [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ), + ) dft2 - dft2.loc['2013-01-05'] + dft2.loc["2013-01-05"] On other levels @@ -195,7 +194,7 @@ On other levels idx = pd.IndexSlice dft2 = dft2.swaplevel(0, 1).sort_index() dft2 - dft2.loc[idx[:, '2013-01-05'], :] + dft2.loc[idx[:, "2013-01-05"], :] .. _whatsnew_0181.enhancements.assembling: @@ -206,10 +205,9 @@ Assembling datetimes .. ipython:: python - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [2, 3]}) + df = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]} + ) df Assembling using the passed frame. @@ -222,7 +220,7 @@ You can pass only the columns that you need to assemble. .. ipython:: python - pd.to_datetime(df[['year', 'month', 'day']]) + pd.to_datetime(df[["year", "month", "day"]]) .. _whatsnew_0181.other: @@ -243,7 +241,7 @@ Other enhancements .. ipython:: python - idx = pd.Index([1., 2., 3., 4.], dtype='float') + idx = pd.Index([1.0, 2.0, 3.0, 4.0], dtype="float") # default, allow_fill=True, fill_value=None idx.take([2, -1]) @@ -253,8 +251,8 @@ Other enhancements .. ipython:: python - idx = pd.Index(['a|b', 'a|c', 'b|c']) - idx.str.get_dummies('|') + idx = pd.Index(["a|b", "a|c", "b|c"]) + idx.str.get_dummies("|") - ``pd.crosstab()`` has gained a ``normalize`` argument for normalizing frequency tables (:issue:`12569`). Examples in the updated docs :ref:`here `. @@ -313,8 +311,7 @@ The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_ .. ipython:: python - df = pd.DataFrame({'A': ['a', 'b', 'a'], - 'B': [1, 2, 3]}) + df = pd.DataFrame({"A": ["a", "b", "a"], "B": [1, 2, 3]}) df Previous behavior: @@ -337,16 +334,16 @@ New behavior: .. ipython:: python - df.groupby('A', as_index=True)['B'].nth(0) - df.groupby('A', as_index=False)['B'].nth(0) + df.groupby("A", as_index=True)["B"].nth(0) + df.groupby("A", as_index=False)["B"].nth(0) Furthermore, previously, a ``.groupby`` would always sort, regardless if ``sort=False`` was passed with ``.nth()``. .. 
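ipython:: python

    # Hypothetical illustration (not in the diff): ``sort=False`` keeps
    # groups in order of first appearance instead of sorted key order.
    pd.DataFrame({"k": [2, 1, 2], "v": [10, 20, 30]}).groupby("k", sort=False).first()

The reproduction below shows the same effect together with ``.nth()``.

..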
ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b']) - df['c'] = np.random.randint(0, 4, 100) + df = pd.DataFrame(np.random.randn(100, 2), columns=["a", "b"]) + df["c"] = np.random.randint(0, 4, 100) Previous behavior: @@ -374,8 +371,8 @@ New behavior: .. ipython:: python - df.groupby('c', sort=True).nth(1) - df.groupby('c', sort=False).nth(1) + df.groupby("c", sort=True).nth(1) + df.groupby("c", sort=False).nth(1) .. _whatsnew_0181.numpy_compatibility: @@ -421,8 +418,9 @@ Using ``apply`` on resampling groupby operations (using a ``pd.TimeGrouper``) no .. ipython:: python - df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), - 'value': [10, 13]}) + df = pd.DataFrame( + {"date": pd.to_datetime(["10/10/2000", "11/10/2000"]), "value": [10, 13]} + ) df Previous behavior: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 6e8c4273a0550..340e1ce9ee1ef 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -49,10 +49,8 @@ except that we match on nearest key rather than equal keys. .. ipython:: python - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) left right @@ -62,13 +60,13 @@ recent value otherwise. .. ipython:: python - pd.merge_asof(left, right, on='a') + pd.merge_asof(left, right, on="a") We can also match rows ONLY with prior data, and not an exact match. .. ipython:: python - pd.merge_asof(left, right, on='a', allow_exact_matches=False) + pd.merge_asof(left, right, on="a", allow_exact_matches=False) In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. @@ -76,36 +74,44 @@ This also illustrates using the ``by`` parameter to group data before merging. .. 
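ipython:: python

    # Minimal sketch with made-up data (the realistic trades/quotes frames
    # follow): each left row matches the nearest earlier-or-equal right key.
    left = pd.DataFrame({"t": [1, 5, 10], "v": ["a", "b", "c"]})
    right = pd.DataFrame({"t": [2, 6], "w": [20, 60]})
    pd.merge_asof(left, right, on="t")

The full example below applies the same idea to trade and quote data.

..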
ipython:: python - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', - 'GOOG', 'AAPL', 'GOOG', 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) .. ipython:: python @@ -118,9 +124,7 @@ that forward filling happens automatically taking the most recent non-NaN value. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker') + pd.merge_asof(trades, quotes, on="time", by="ticker") This returns a merged DataFrame with the entries in the same order as the original left passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. @@ -131,13 +135,14 @@ Method ``.rolling()`` is now time-series aware ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`). -See the full documentation :ref:`here `. +See the full documentation :ref:`here `. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, freq='s')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -151,20 +156,26 @@ Specifying an offset allows a more intuitive specification of the rolling freque .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. .. 
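ipython:: python

    # Sketch, assuming a regular secondly index: an integer window counts
    # rows, while an offset window ("2s") spans wall-clock time.
    s = pd.Series(range(4), index=pd.date_range("2016-01-01", periods=4, freq="s"))
    s.rolling(2).sum()
    s.rolling("2s").sum()

With the irregular index constructed next, only the offset form adapts to the gaps.

..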
ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) dft dft.rolling(2).sum() @@ -173,7 +184,7 @@ Using the time-specification generates variable windows for this sparse data. .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the default of the index) in a DataFrame. @@ -182,7 +193,7 @@ default of the index) in a DataFrame. dft = dft.reset_index() dft - dft.rolling('2s', on='foo').sum() + dft.rolling("2s", on="foo").sum() .. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: @@ -199,8 +210,8 @@ they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :is .. ipython:: python - data = '0,1,2\n3,4,5' - names = ['a', 'b', 'a'] + data = "0,1,2\n3,4,5" + names = ["a", "b", "a"] **Previous behavior**: @@ -235,17 +246,22 @@ converting to ``Categorical`` after parsing. See the io :ref:`docs here ` (:issue:`10008`, :issue:`13156`) @@ -388,7 +404,7 @@ Google BigQuery enhancements Fine-grained NumPy errstate ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported. Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as ``NaN`` s. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner, only around where these operations are actually used in the pandas code base. (:issue:`13109`, :issue:`13145`) +Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported. pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as ``NaN`` s. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner, only around where these operations are actually used in the pandas code base. (:issue:`13109`, :issue:`13145`) After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. @@ -415,7 +431,7 @@ The ``pd.get_dummies`` function now returns dummy-encoded columns as small integ .. ipython:: python - pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + pd.get_dummies(["a", "b", "a", "c"]).dtypes .. _whatsnew_0190.enhancements.to_numeric_downcast: @@ -427,9 +443,9 @@ Downcast values to smallest possible dtype in ``to_numeric`` .. 
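ipython:: python

    # Hedged aside (not in the diff): without ``downcast``, to_numeric
    # defaults to a 64-bit result dtype.
    pd.to_numeric(["1", 2, 3]).dtype

Passing ``downcast`` then shrinks the result, as shown next.

..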
ipython:: python - s = ['1', 2, 3] - pd.to_numeric(s, downcast='unsigned') - pd.to_numeric(s, downcast='integer') + s = ["1", 2, 3] + pd.to_numeric(s, downcast="unsigned") + pd.to_numeric(s, downcast="integer") .. _whatsnew_0190.dev_api: @@ -447,7 +463,8 @@ The following are now part of this API: import pprint from pandas.api import types - funcs = [f for f in dir(types) if not f.startswith('_')] + + funcs = [f for f in dir(types) if not f.startswith("_")] pprint.pprint(funcs) .. note:: @@ -472,16 +489,16 @@ Other enhancements .. ipython:: python - df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), - 'a': np.arange(5)}, - index=pd.MultiIndex.from_arrays([[1, 2, 3, 4, 5], - pd.date_range('2015-01-01', - freq='W', - periods=5) - ], names=['v', 'd'])) + df = pd.DataFrame( + {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}, + index=pd.MultiIndex.from_arrays( + [[1, 2, 3, 4, 5], pd.date_range("2015-01-01", freq="W", periods=5)], + names=["v", "d"], + ), + ) df - df.resample('M', on='date').sum() - df.resample('M', level='d').sum() + df.resample("M", on="date").sum() + df.resample("M", level="d").sum() - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) @@ -507,10 +524,9 @@ Other enhancements .. ipython:: python - df = pd.DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, - index=['row1', 'row2']) + df = pd.DataFrame({"A": [2, 7], "B": [3, 5], "C": [4, 8]}, index=["row1", "row2"]) df - df.sort_values(by='row2', axis=1) + df.sort_values(by="row2", axis=1) - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) - :meth:`~DataFrame.to_html` now has a ``border`` argument to control the value in the opening ```` tag. The default is the value of the ``html.border`` option, which defaults to 1. This also affects the notebook HTML repr, but since Jupyter's CSS includes a border-width attribute, the visual effect is the same. (:issue:`11563`). @@ -583,12 +599,12 @@ Arithmetic operators align both ``index`` (no changes). .. ipython:: python - s1 = pd.Series([1, 2, 3], index=list('ABC')) - s2 = pd.Series([2, 2, 2], index=list('ABD')) + s1 = pd.Series([1, 2, 3], index=list("ABC")) + s2 = pd.Series([2, 2, 2], index=list("ABD")) s1 + s2 - df1 = pd.DataFrame([1, 2, 3], index=list('ABC')) - df2 = pd.DataFrame([2, 2, 2], index=list('ABD')) + df1 = pd.DataFrame([1, 2, 3], index=list("ABC")) + df2 = pd.DataFrame([2, 2, 2], index=list("ABD")) df1 + df2 Comparison operators @@ -661,8 +677,8 @@ Logical operators align both ``.index`` of left and right hand side. .. ipython:: python - s1 = pd.Series([True, False, True], index=list('ABC')) - s2 = pd.Series([True, True, True], index=list('ABD')) + s1 = pd.Series([True, False, True], index=list("ABC")) + s2 = pd.Series([True, True, True], index=list("ABD")) s1 & s2 .. note:: @@ -679,8 +695,8 @@ Logical operators align both ``.index`` of left and right hand side. .. 
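ipython:: python

    # Illustration only: for ``&`` on Series, labels missing after alignment
    # are treated as False rather than NaN.
    a = pd.Series([True, True], index=["x", "y"])
    b = pd.Series([True, True], index=["y", "z"])
    a & b

``DataFrame`` logical operators align on both ``.index`` and ``.columns``, as below.

..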
ipython:: python - df1 = pd.DataFrame([True, False, True], index=list('ABC')) - df2 = pd.DataFrame([True, True, True], index=list('ABD')) + df1 = pd.DataFrame([True, False, True], index=list("ABC")) + df2 = pd.DataFrame([True, True, True], index=list("ABD")) df1 & df2 Flexible comparison methods @@ -691,8 +707,8 @@ which has the different ``index``. .. ipython:: python - s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - s2 = pd.Series([2, 2, 2], index=['b', 'c', 'd']) + s1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s2 = pd.Series([2, 2, 2], index=["b", "c", "d"]) s1.eq(s2) s1.ge(s2) @@ -749,7 +765,7 @@ This will now convert integers/floats with the default unit of ``ns``. .. ipython:: python - pd.to_datetime([1, 'foo'], errors='coerce') + pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: @@ -768,9 +784,9 @@ Merging will now preserve the dtype of the join keys (:issue:`8596`) .. ipython:: python - df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 = pd.DataFrame({"key": [1], "v1": [10]}) df1 - df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) df2 **Previous behavior**: @@ -796,16 +812,16 @@ We are able to preserve the join keys .. ipython:: python - pd.merge(df1, df2, how='outer') - pd.merge(df1, df2, how='outer').dtypes + pd.merge(df1, df2, how="outer") + pd.merge(df1, df2, how="outer").dtypes Of course if you have missing values that are introduced, then the resulting dtype will be upcast, which is unchanged from previous. .. ipython:: python - pd.merge(df1, df2, how='outer', on='key') - pd.merge(df1, df2, how='outer', on='key').dtypes + pd.merge(df1, df2, how="outer", on="key") + pd.merge(df1, df2, how="outer", on="key").dtypes .. _whatsnew_0190.api.describe: @@ -889,7 +905,7 @@ As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: .. ipython:: python - pi = pd.PeriodIndex(['2016-08-01'], freq='D') + pi = pd.PeriodIndex(["2016-08-01"], freq="D") pi pd.api.types.is_integer_dtype(pi) pd.api.types.is_period_dtype(pi) @@ -916,7 +932,7 @@ These result in ``pd.NaT`` without providing ``freq`` option. .. ipython:: python - pd.Period('NaT') + pd.Period("NaT") pd.Period(None) @@ -955,7 +971,7 @@ of integers (:issue:`13988`). .. ipython:: python - pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi.values @@ -985,7 +1001,7 @@ Previous behavior: .. ipython:: python - pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + pd.Index(["a", "b"]) + pd.Index(["a", "c"]) Note that numeric Index objects already performed element-wise operations. For example, the behavior of adding two integer Indexes is unchanged. @@ -1011,8 +1027,10 @@ DatetimeIndex objects resulting in a TimedeltaIndex: .. ipython:: python - (pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - - pd.DatetimeIndex(['2016-01-02', '2016-01-03'])) + ( + pd.DatetimeIndex(["2016-01-01", "2016-01-02"]) + - pd.DatetimeIndex(["2016-01-02", "2016-01-03"]) + ) .. _whatsnew_0190.api.difference: @@ -1073,8 +1091,9 @@ Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex` .. ipython:: python pd.Index([1, 2, 3]).unique() - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - tz='Asia/Tokyo').unique() + pd.DatetimeIndex( + ["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo" + ).unique() .. _whatsnew_0190.api.multiindex: @@ -1086,8 +1105,8 @@ in ``MultiIndex`` levels (:issue:`13743`, :issue:`13854`). .. 
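ipython:: python

    # Background sketch (assumed data): a Categorical keeps its declared
    # category order regardless of the order of the values themselves.
    c = pd.Categorical(["a", "b"], categories=["b", "a", "c"])
    c.categories

The example below carries such a categorical into a ``MultiIndex`` level.

..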
ipython:: python - cat = pd.Categorical(['a', 'b'], categories=list("bac")) - lvl1 = ['foo', 'bar'] + cat = pd.Categorical(["a", "b"], categories=list("bac")) + lvl1 = ["foo", "bar"] midx = pd.MultiIndex.from_arrays([cat, lvl1]) midx @@ -1113,9 +1132,9 @@ As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes .. ipython:: python - df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat}) - df_grouped = df.groupby(by=['A', 'C']).first() - df_set_idx = df.set_index(['A', 'C']) + df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat}) + df_grouped = df.groupby(by=["A", "C"]).first() + df_set_idx = df.set_index(["A", "C"]) **Previous behavior**: @@ -1163,7 +1182,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument .. ipython:: python - data = 'A,B\n0,1\n2,3\n4,5\n6,7' + data = "A,B\n0,1\n2,3\n4,5\n6,7" **Previous behavior**: @@ -1248,7 +1267,7 @@ Operators now preserve dtypes .. code-block:: python - s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s = pd.SparseSeries([1.0, 0.0, 2.0, 0.0], fill_value=0) s s.astype(np.int64) @@ -1372,7 +1391,7 @@ Deprecations - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) - ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. Datetime-likes now have a ``.is_leap_year`` property (:issue:`13727`) -- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion (:issue:`13564`). +- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion (:issue:`13564`). - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq`` (:issue:`13874`) - ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`) diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst index 9e6b884e08587..6ff3fb6900a99 100644 --- a/doc/source/whatsnew/v0.19.1.rst +++ b/doc/source/whatsnew/v0.19.1.rst @@ -8,7 +8,7 @@ Version 0.19.1 (November 3, 2016) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release from 0.19.0 and includes some small regression fixes, @@ -29,7 +29,7 @@ Performance improvements - Fixed performance regression in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) -- Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`). +- Improved performance in certain types of ``loc`` indexing with a MultiIndex (:issue:`14551`). .. 
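code-block:: python

    # Hedged illustration of the access pattern referenced above: a scalar
    # ``.loc`` lookup on the first level of a MultiIndex.
    mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]])
    s = pd.Series(range(4), index=mi)
    s.loc["a"]

..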
_whatsnew_0191.bug_fixes: diff --git a/doc/source/whatsnew/v0.19.2.rst b/doc/source/whatsnew/v0.19.2.rst index 924c95f21ceff..bba89d78be869 100644 --- a/doc/source/whatsnew/v0.19.2.rst +++ b/doc/source/whatsnew/v0.19.2.rst @@ -8,7 +8,7 @@ Version 0.19.2 (December 24, 2016) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.19.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 09980b52b6b3a..2cb8e13e9a18a 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -26,7 +26,7 @@ Highlights include: .. warning:: - Pandas has changed the internal structure and layout of the code base. + pandas has changed the internal structure and layout of the code base. This can affect imports that are not from the top-level ``pandas.*`` namespace, please see the changes :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -243,7 +243,7 @@ The default is to infer the compression type from the extension (``compression=' UInt64 support improved ^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has significantly improved support for operations involving unsigned, +pandas has significantly improved support for operations involving unsigned, or purely non-negative, integers. Previously, handling these integers would result in improper rounding or data-type casting, leading to incorrect results. Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`) @@ -333,7 +333,7 @@ You must enable this by setting the ``display.html.table_schema`` option to ``Tr SciPy sparse matrix from/to SparseDataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. +pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. See the :ref:`documentation ` for more information. (:issue:`4343`) All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. @@ -459,7 +459,7 @@ Selecting via a scalar value that is contained *in* the intervals. Other enhancements ^^^^^^^^^^^^^^^^^^ -- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`) +- ``DataFrame.rolling()`` now accepts the parameter ``closed='right'|'left'|'both'|'neither'`` to choose the rolling window-endpoint closedness. See the :ref:`documentation ` (:issue:`13965`) - Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here `. - ``Series.str.replace()`` now accepts a callable, as replacement, which is passed to ``re.sub`` (:issue:`15055`) - ``Series.str.replace()`` now accepts a compiled regular expression as a pattern (:issue:`15446`) @@ -988,7 +988,7 @@ A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ` will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, see :ref:`here `. These are equivalent in function, but a MultiIndexed ``DataFrame`` enjoys more support in pandas. -See the section on :ref:`Windowed Binary Operations ` for more information. 
(:issue:`15677`) +See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`) .. ipython:: python @@ -1167,7 +1167,7 @@ Other API changes - ``.loc`` has compat with ``.ix`` for accepting iterators, and NamedTuples (:issue:`15120`) - ``interpolate()`` and ``fillna()`` will raise a ``ValueError`` if the ``limit`` keyword argument is not greater than 0. (:issue:`9217`) - ``pd.read_csv()`` will now issue a ``ParserWarning`` whenever there are conflicting values provided by the ``dialect`` parameter and the user (:issue:`14898`) -- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than than one byte (:issue:`11592`) +- ``pd.read_csv()`` will now raise a ``ValueError`` for the C engine if the quote character is larger than one byte (:issue:`11592`) - ``inplace`` arguments now require a boolean value, else a ``ValueError`` is thrown (:issue:`14189`) - ``pandas.api.types.is_datetime64_ns_dtype`` will now report ``True`` on a tz-aware dtype, similar to ``pandas.api.types.is_datetime64_any_dtype`` - ``DataFrame.asof()`` will return a null filled ``Series`` instead the scalar ``NaN`` if a match is not found (:issue:`15118`) @@ -1201,7 +1201,7 @@ Modules privacy has changed Some formerly public python/c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API. Furthermore, the ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are now considered to be PRIVATE. -If indicated, a deprecation warning will be issued if you reference theses modules. (:issue:`12588`) +If indicated, a deprecation warning will be issued if you reference these modules. (:issue:`12588`) .. csv-table:: :header: "Previous Location", "New Location", "Deprecated" @@ -1315,7 +1315,7 @@ The recommended methods of indexing are: - ``.loc`` if you want to *label* index - ``.iloc`` if you want to *positionally* index. -Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code :ref:`here `. +Using ``.ix`` will now show a ``DeprecationWarning`` with a link to some examples of how to convert code `here `__. .. ipython:: python @@ -1355,7 +1355,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` is deprecated and will be removed in a future version. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:`13563`). .. 
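code-block:: python

    # Sketch with assumed data (not from the diff): 3-D values represented
    # as a DataFrame with a two-level MultiIndex, per the recommendation.
    mi = pd.MultiIndex.from_product([["item1", "item2"], [0, 1]])
    pd.DataFrame({"v": [1, 2, 3, 4]}, index=mi)

..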
code-block:: ipython @@ -1663,11 +1663,11 @@ Indexing - Bug in ``.reset_index()`` when an all ``NaN`` level of a ``MultiIndex`` would fail (:issue:`6322`) - Bug in ``.reset_index()`` when raising error for index name already present in ``MultiIndex`` columns (:issue:`16120`) - Bug in creating a ``MultiIndex`` with tuples and not passing a list of names; this will now raise ``ValueError`` (:issue:`15110`) -- Bug in the HTML display with with a ``MultiIndex`` and truncation (:issue:`14882`) +- Bug in the HTML display with a ``MultiIndex`` and truncation (:issue:`14882`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) - Bug in ``pd.concat()`` where the names of ``MultiIndex`` of resulting ``DataFrame`` are not handled correctly when ``None`` is presented in the names of ``MultiIndex`` of input ``DataFrame`` (:issue:`15787`) - Bug in ``DataFrame.sort_index()`` and ``Series.sort_index()`` where ``na_position`` doesn't work with a ``MultiIndex`` (:issue:`14784`, :issue:`16604`) -- Bug in in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) +- Bug in ``pd.concat()`` when combining objects with a ``CategoricalIndex`` (:issue:`16111`) - Bug in indexing with a scalar and a ``CategoricalIndex`` (:issue:`16123`) IO diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst index 7f84c6b3f17bd..430a39d2d2e97 100644 --- a/doc/source/whatsnew/v0.20.2.rst +++ b/doc/source/whatsnew/v0.20.2.rst @@ -8,7 +8,7 @@ Version 0.20.2 (June 4, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.20.3.rst b/doc/source/whatsnew/v0.20.3.rst index 888d0048ca9f3..ff28f6830783e 100644 --- a/doc/source/whatsnew/v0.20.3.rst +++ b/doc/source/whatsnew/v0.20.3.rst @@ -8,7 +8,7 @@ Version 0.20.3 (July 7, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 926bcaa21ac3a..1bbbbdc7e5410 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -50,7 +50,7 @@ Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, su dtypes, including extension dtypes such as datetime with timezones. This functionality depends on either the `pyarrow `__ or `fastparquet `__ library. -For more details, see see :ref:`the IO docs on Parquet `. +For more details, see :ref:`the IO docs on Parquet `. .. _whatsnew_0210.enhancements.infer_objects: @@ -900,13 +900,13 @@ New behavior: No automatic Matplotlib converters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas no longer registers our ``date``, ``time``, ``datetime``, +pandas no longer registers our ``date``, ``time``, ``datetime``, ``datetime64``, and ``Period`` converters with matplotlib when pandas is imported. Matplotlib plot methods (``plt.plot``, ``ax.plot``, ...), will not nicely format the x-axis for ``DatetimeIndex`` or ``PeriodIndex`` values. 
You must explicitly register these methods: -Pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these +pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these converters on first-use (:issue:`17710`). .. note:: diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index f930dfac869cd..090a988d6406a 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -8,7 +8,7 @@ Version 0.21.1 (December 12, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.21.x series and includes some small regression fixes, @@ -34,7 +34,7 @@ Highlights include: Restore Matplotlib datetime converter registration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas implements some matplotlib converters for nicely formatting the axis +pandas implements some matplotlib converters for nicely formatting the axis labels on plots with ``datetime`` or ``Period`` values. Prior to pandas 0.21.0, these were implicitly registered with matplotlib, as a side effect of ``import pandas``. diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 75949a90d09a6..ec9769c22e76b 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -1,14 +1,14 @@ .. _whatsnew_0220: -v0.22.0 (December 29, 2017) ---------------------------- +Version 0.22.0 (December 29, 2017) +---------------------------------- {{ header }} .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a major release from 0.21.1 and includes a single, API-breaking change. @@ -20,7 +20,7 @@ release note (singular!). Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The +pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The summary is that * The sum of an empty or all-*NA* ``Series`` is now ``0`` @@ -96,7 +96,7 @@ returning ``1`` instead. These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well. Finally, a few less obvious places in pandas are affected by this change. -Grouping by a categorical +Grouping by a Categorical ^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a ``Categorical`` and summing now returns ``0`` instead of @@ -119,7 +119,7 @@ instead of ``NaN``. .. ipython:: python - grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + grouper = pd.Categorical(["a", "a"], categories=["a", "b"]) pd.Series([1, 2]).groupby(grouper).sum() To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, @@ -159,15 +159,14 @@ sum and ``1`` for product. .. ipython:: python - s = pd.Series([1, 1, np.nan, np.nan], - index=pd.date_range('2017', periods=4)) - s.resample('2d').sum() + s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) + s.resample("2d").sum() To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. .. ipython:: python - s.resample('2d').sum(min_count=1) + s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was @@ -190,7 +189,7 @@ entirely valid. .. 
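ipython:: python

    # Hedged sketch: ``min_count`` sets how many valid values a sum needs
    # before returning a number instead of the new default of 0.
    s = pd.Series([np.nan, np.nan])
    s.sum()
    s.sum(min_count=1)

Upsampling interacts with ``min_count`` the same way, as the next example shows.

..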
ipython:: python - idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) pd.Series([1, 2], index=idx).resample("12H").sum() Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index b9e1b5060d1da..f4caea9d363eb 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -64,7 +64,7 @@ A ``DataFrame`` can now be written to and subsequently read back via JSON while new_df new_df.dtypes -Please note that the string `index` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. +Please note that the string ``index`` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. .. ipython:: python :okwarning: @@ -86,8 +86,8 @@ Please note that the string `index` is not supported with the round trip format, .. _whatsnew_0230.enhancements.assign_dependent: -``.assign()`` accepts dependent arguments -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Method ``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 `_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the @@ -189,7 +189,7 @@ resetting indexes. See the :ref:`Sorting by Indexes and Values Extending pandas with custom types (experimental) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy +pandas now supports storing array-like objects that aren't necessarily 1-D NumPy arrays as columns in a DataFrame or values in a Series. This allows third-party libraries to implement extensions to NumPy's types, similar to how pandas implemented categoricals, datetimes with timezones, periods, and intervals. @@ -244,7 +244,7 @@ documentation. If you build an extension array, publicize it on our .. _whatsnew_0230.enhancements.categorical_grouping: -New ``observed`` keyword for excluding unobserved categories in ``groupby`` +New ``observed`` keyword for excluding unobserved categories in ``GroupBy`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a categorical includes the unobserved categories in the output. @@ -360,8 +360,8 @@ Fill all consecutive outside values in both directions .. _whatsnew_0210.enhancements.get_dummies_dtype: -``get_dummies`` now supports ``dtype`` argument -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Function ``get_dummies`` now supports ``dtype`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) @@ -388,8 +388,8 @@ See the :ref:`documentation here `. (:issue:`19365`) .. _whatsnew_0230.enhancements.ran_inf: -``.rank()`` handles ``inf`` values when ``NaN`` are present -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Method ``.rank()`` handles ``inf`` values when ``NaN`` are present +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their ranks. Now ranks are calculated properly. 
(:issue:`6945`) @@ -457,7 +457,7 @@ These bugs were squashed: Previously, :meth:`Series.str.cat` did not -- in contrast to most of ``pandas`` -- align :class:`Series` on their index before concatenation (see :issue:`18657`). The method has now gained a keyword ``join`` to control the manner of alignment, see examples below and :ref:`here `. -In v.0.23 `join` will default to None (meaning no alignment), but this default will change to ``'left'`` in a future version of pandas. +In v.0.23 ``join`` will default to None (meaning no alignment), but this default will change to ``'left'`` in a future version of pandas. .. ipython:: python :okwarning: @@ -553,7 +553,7 @@ Other enhancements - :class:`~pandas.tseries.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). - :class:`DataFrame` and :class:`Series` now support matrix multiplication (``@``) operator (:issue:`10259`) for Python>=3.5 - Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from - the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ + the pandas-gbq library version 0.4.0. Adds intersphinx mapping to pandas-gbq library. (:issue:`20564`) - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`) - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) @@ -587,13 +587,13 @@ If installed, we now require: .. _whatsnew_0230.api_breaking.dict_insertion_order: -Instantiation from dicts preserves dict insertion order for python 3.6+ +Instantiation from dicts preserves dict insertion order for Python 3.6+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Until Python 3.6, dicts in Python had no formally defined ordering. For Python version 3.6 and later, dicts are ordered by insertion order, see `PEP 468 `_. -Pandas will use the dict's insertion order, when creating a ``Series`` or +pandas will use the dict's insertion order, when creating a ``Series`` or ``DataFrame`` from a dict and you're using Python version 3.6 or higher. (:issue:`19884`) @@ -643,7 +643,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` was deprecated in the 0.20.x release, showing as a ``DeprecationWarning``. Using ``Panel`` will now show a ``FutureWarning``. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:`13563`, :issue:`18324`). .. code-block:: ipython @@ -836,7 +836,7 @@ Build changes Index division by zero fills correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and ``0 / 0`` with ``np.nan``. This matches existing ``Series`` behavior. 
(:issue:`19322`, :issue:`19347`) Previous behavior: @@ -884,7 +884,7 @@ Extraction of matching patterns from strings By default, extracting matching patterns from strings with :func:`str.extract` used to return a ``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was -extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +extracted). As of pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless ``expand`` is set to ``False``. Finally, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to ``False``), but now raises a ``ValueError``. (:issue:`11386`) @@ -974,7 +974,7 @@ automatically so that the printed data frame fits within the current terminal width (``pd.options.display.max_columns=0``) (:issue:`17023`). If Python runs as a Jupyter kernel (such as the Jupyter QtConsole or a Jupyter notebook, as well as in many IDEs), this value cannot be inferred automatically and is thus -set to `20` as in previous versions. In a terminal, this results in a much +set to ``20`` as in previous versions. In a terminal, this results in a much nicer output: .. image:: ../_static/print_df_new.png @@ -998,7 +998,7 @@ Datetimelike API changes - Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError`` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) -- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) +- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mismatched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) - :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) @@ -1011,7 +1011,7 @@ Datetimelike API changes - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). 
- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with ``freq=None``, addition or subtraction of integer-dtyped array or ``Index`` will raise ``NullFrequencyError`` instead of ``TypeError`` (:issue:`19895`) -- :class:`Timestamp` constructor now accepts a `nanosecond` keyword or positional argument (:issue:`18898`) +- :class:`Timestamp` constructor now accepts a ``nanosecond`` keyword or positional argument (:issue:`18898`) - :class:`DatetimeIndex` will now raise an ``AttributeError`` when the ``tz`` attribute is set after instantiation (:issue:`3746`) - :class:`DatetimeIndex` with a ``pytz`` timezone will now return a consistent ``pytz`` timezone (:issue:`18595`) @@ -1049,7 +1049,7 @@ Other API changes - :class:`DateOffset` objects render more simply, e.g. ```` instead of ```` (:issue:`19403`) - ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) - ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`) -- :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) +- :func:`Series.str.replace` now takes an optional ``regex`` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`). - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`) @@ -1175,7 +1175,7 @@ Performance improvements Documentation changes ~~~~~~~~~~~~~~~~~~~~~ -Thanks to all of the contributors who participated in the Pandas Documentation +Thanks to all of the contributors who participated in the pandas Documentation Sprint, which took place on March 10th. We had about 500 participants from over 30 locations across the world. You should notice that many of the :ref:`API docstrings ` have greatly improved. @@ -1234,7 +1234,7 @@ Categorical - Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`) - Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. 
This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`) - Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`) -- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in `indices` as missing value markers, rather than the last element of the Series (:issue:`20664`) +- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in ``indices`` as missing value markers, rather than the last element of the Series (:issue:`20664`) Datetimelike ^^^^^^^^^^^^ @@ -1273,7 +1273,7 @@ Timedelta - Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`, :issue:`19834`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors, for example ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - Bug in :func:`Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) -- Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mis-match (:issue:`19333`) +- Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mismatch (:issue:`19333`) - Bug in indexing a :class:`TimedeltaIndex` with a ``np.timedelta64`` object which was raising a ``TypeError`` (:issue:`20393`) @@ -1316,7 +1316,7 @@ Numeric Strings ^^^^^^^ -- Bug in :func:`Series.str.get` with a dictionary in the values and the index not in the keys, raising `KeyError` (:issue:`20671`) +- Bug in :func:`Series.str.get` with a dictionary in the values and the index not in the keys, raising ``KeyError`` (:issue:`20671`) Indexing @@ -1365,11 +1365,11 @@ MultiIndex - Bug in indexing where nested indexers having only numpy arrays are handled incorrectly (:issue:`19686`) -I/O -^^^ +IO +^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) -- :meth:`DataFrame.to_html` now has an option to add an id to the leading `
<table>` tag (:issue:`8496`) +- :meth:`DataFrame.to_html` now has an option to add an id to the leading ``<table>
          `` tag (:issue:`8496`) - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) @@ -1378,7 +1378,7 @@ I/O - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) - Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`) - Bug in :func:`DataFrame.to_latex()` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`) -- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the `index_names=False` option would result in incorrect output (:issue:`18326`) +- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`) - Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`) - Bug in :func:`DataFrame.to_latex()` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) @@ -1403,7 +1403,7 @@ Plotting - :func:`DataFrame.plot` now supports multiple columns to the ``y`` argument (:issue:`19699`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) @@ -1412,7 +1412,7 @@ Groupby/resample/rolling - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) -- Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) +- Bug in :func:`DataFrame.groupby` passing the ``on=`` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate ` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`) - Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`) diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index 03b7d9db6bc63..b51368c87f991 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -74,10 +74,10 @@ In addition, ordering comparisons will raise a ``TypeError`` in the future. a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). 
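The ``DataFrame.to_html`` id enhancement noted in the 0.23.0 IO entries above can be exercised as follows; a minimal sketch, assuming the ``table_id`` parameter that implements it:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": [1, 2]})
   html = df.to_html(table_id="my-table")
   # the leading <table> element now carries the requested id
   assert 'id="my-table"' in html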
- Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values - in nested levels in JSON, and to not drop keys with value as `None` (:issue:`21158`, :issue:`21356`). + in nested levels in JSON, and to not drop keys with value as ``None`` (:issue:`21158`, :issue:`21356`). - Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) -- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when the individual categories are iterable and ``value`` is an iterable (:issue:`21097`, :issue:`19788`) - Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) - Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) @@ -106,7 +106,7 @@ Bug fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue:`21078`) +- Bug in :meth:`Series.str.replace()` where the method throws ``TypeError`` on Python 3.5.2 (:issue:`21078`) - Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) diff --git a/doc/source/whatsnew/v0.23.2.rst b/doc/source/whatsnew/v0.23.2.rst index 9f24092d1d4ae..99650e8291d3d 100644 --- a/doc/source/whatsnew/v0.23.2.rst +++ b/doc/source/whatsnew/v0.23.2.rst @@ -11,7 +11,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. note:: - Pandas 0.23.2 is first pandas release that's compatible with + pandas 0.23.2 is the first pandas release that's compatible with Python 3.7 (:issue:`20552`) .. warning:: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 45399792baecf..ce784231a47d2 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -38,7 +38,7 @@ Enhancements Optional integer NA support ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +pandas has gained the ability to hold integer dtypes with missing values. This long-requested feature is enabled through the use of :ref:`extension types `. .. note:: @@ -277,8 +277,8 @@ For earlier versions this can be done using the following. .. _whatsnew_0240.enhancements.read_html: -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Function ``read_html`` enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. Now it understands them, treating them as sequences of cells with the same @@ -376,7 +376,7 @@ Other enhancements - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. See the :ref:`section on writing HTML ` in the IO docs for example usage.
(:issue:`2679`) - :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`) -- The :meth:`~DataFrame.shift` method now accepts `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) +- The :meth:`~DataFrame.shift` method now accepts ``fill_value`` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) - :func:`DataFrame.to_csv` and :func:`Series.to_csv` now support the ``compression`` keyword when a file handle is passed. (:issue:`21227`) @@ -384,7 +384,7 @@ Other enhancements - :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) - Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) - :func:`DataFrame.to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.8.0 + reflect changes from the `pandas-gbq library version 0.8.0 `__. Adds a ``credentials`` argument, which enables the use of any kind of `google-auth credentials @@ -419,7 +419,7 @@ Other enhancements - :meth:`Index.difference`, :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference` now have an optional ``sort`` parameter to control whether the results should be sorted if possible (:issue:`17839`, :issue:`24471`) - :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. -- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) +- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) - :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) @@ -432,7 +432,7 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas 0.24.0 includes a number of API breaking changes. +pandas 0.24.0 includes a number of API breaking changes. .. _whatsnew_0240.api_breaking.deps: @@ -474,8 +474,8 @@ and replaced it with references to ``pyarrow`` (:issue:`21639` and :issue:`23053 .. 
_whatsnew_0240.api_breaking.csv_line_terminator: -`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``os.linesep`` is used for ``line_terminator`` of ``DataFrame.to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'`` for the default line terminator (:issue:`20353`). @@ -510,7 +510,7 @@ even when ``'\n'`` was passed in ``line_terminator``. *New behavior* on Windows: -Passing ``line_terminator`` explicitly, set thes ``line terminator`` to that character. +Passing ``line_terminator`` explicitly sets the line terminator to that character. .. code-block:: ipython @@ -556,8 +556,8 @@ You must pass in the ``line_terminator`` explicitly, even in this case. .. _whatsnew_0240.bug_fixes.nan_with_str_dtype: -Proper handling of `np.NaN` in a string data-typed column with the Python engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Proper handling of ``np.NaN`` in a string data-typed column with the Python engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ There was a bug in :func:`read_excel` and :func:`read_csv` with the Python engine, where missing values turned to ``'nan'`` with ``dtype=str`` and @@ -1198,7 +1198,7 @@ Other API changes - :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) +- :class:`DateOffset` attribute ``_cacheable`` and method ``_should_cache`` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). - :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`). - :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather than a ``ValueError``, if a searched-for key is not found in its categories (:issue:`23466`). @@ -1217,7 +1217,7 @@ Extension type changes **Equality and hashability** -Pandas now requires that extension dtypes be hashable (i.e. the respective +pandas now requires that extension dtypes be hashable (i.e. the respective ``ExtensionDtype`` objects; hashability is not a requirement for the values of the corresponding ``ExtensionArray``). The base class implements a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should @@ -1317,7 +1317,7 @@ Deprecations - Timezone converting a tz-aware ``datetime.datetime`` or :class:`Timestamp` with :class:`Timestamp` and the ``tz`` argument is now deprecated.
Instead, use :meth:`Timestamp.tz_convert` (:issue:`23579`) - :func:`pandas.api.types.is_period` is deprecated in favor of ``pandas.api.types.is_period_dtype`` (:issue:`23917`) - :func:`pandas.api.types.is_datetimetz` is deprecated in favor of ``pandas.api.types.is_datetime64tz`` (:issue:`23917`) -- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`) +- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments ``start``, ``end``, and ``periods`` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`) - Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). - The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`) - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). @@ -1371,7 +1371,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). .. _whatsnew_0240.deprecations.integer_tz: -Passing integer data and a timezone to datetimeindex +Passing integer data and a timezone to DatetimeIndex ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The behavior of :class:`DatetimeIndex` when passed integer data and @@ -1534,7 +1534,7 @@ Performance improvements - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) - Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) -- Improved performance of :func:`concat` for `Series` objects (:issue:`23404`) +- Improved performance of :func:`concat` for ``Series`` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) - Fixed a performance regression on Windows with Python 3.7 of :func:`read_csv` (:issue:`23516`) @@ -1602,7 +1602,7 @@ Datetimelike - Bug in :class:`DataFrame` when creating a new column from an ndarray of :class:`Timestamp` objects with timezones creating an object-dtype column, rather than datetime with timezone (:issue:`23932`) - Bug in :class:`Timestamp` constructor which would drop the frequency of an input :class:`Timestamp` (:issue:`22311`) - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) -- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) +- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and ``dtype=object`` would incorrectly raise a ``ValueError`` (:issue:`23524`) - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a 
timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) - Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`) - Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`) @@ -1622,7 +1622,7 @@ Timedelta - Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) - Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) - Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) - Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) - Bug in :class:`TimedeltaIndex` incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) - Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) @@ -1647,7 +1647,7 @@ Timezones - Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) - Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) - Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show ``first`` and ``last`` result (:issue:`21328`) - Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware ``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) - Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) - Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) @@ -1693,7 +1693,7 @@ Numeric - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) - Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2\ :sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) - Calls such as :meth:`DataFrame.round` with a non-unique :meth:`CategoricalIndex` now return expected data. Previously, data would be improperly duplicated (:issue:`21809`). 
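A short sketch of the :meth:`Series.agg` entry above, which allows NumPy's NaN-aware reducers to be passed straight through:

.. code-block:: python

   import numpy as np
   import pandas as pd

   s = pd.Series([1.0, np.nan, 2.0])
   s.agg(np.nansum)  # 3.0: the NaN-aware reducer is applied directly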
-- Added ``log10``, `floor` and `ceil` to the list of supported functions in :meth:`DataFrame.eval` (:issue:`24139`, :issue:`24353`) +- Added ``log10``, ``floor`` and ``ceil`` to the list of supported functions in :meth:`DataFrame.eval` (:issue:`24139`, :issue:`24353`) - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) - Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`) - Reduction methods like :meth:`Series.sum` now accept the default value of ``keepdims=False`` when called from a NumPy ufunc, rather than raising a ``TypeError``. Full support for ``keepdims`` has not been implemented (:issue:`24356`). @@ -1769,8 +1769,8 @@ MultiIndex - :class:`MultiIndex` has gained the :meth:`MultiIndex.from_frame`, it allows constructing a :class:`MultiIndex` object from a :class:`DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) -I/O -^^^ +IO +^^ - Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) - Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`) @@ -1827,7 +1827,7 @@ Plotting - Bug in :func:`DataFrame.plot.bar` caused bars to use multiple colors instead of a single one (:issue:`20585`) - Bug in validating color parameter caused extra color to be appended to the given color array. This happened to multiple plotting functions using matplotlib. (:issue:`20726`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`) @@ -1859,7 +1859,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :func:`pandas.concat` when joining only `Series` the `names` argument of `concat` is no longer ignored (:issue:`23490`) +- Bug in :func:`pandas.concat` when joining only ``Series`` the ``names`` argument of ``concat`` is no longer ignored (:issue:`23490`) - Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) - Bug in :meth:`DataFrame.where` with an empty DataFrame and empty ``cond`` having non-bool dtype (:issue:`21947`) @@ -1868,7 +1868,7 @@ Reshaping - :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) - Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) - Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another 
key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) - Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) - Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) @@ -1885,7 +1885,7 @@ Reshaping - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) -- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a missleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) +- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) - Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`) - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`) - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`) @@ -1925,7 +1925,7 @@ Build changes Other ^^^^^ -- Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before Pandas. (:issue:`24113`) +- Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before pandas. (:issue:`24113`) .. _whatsnew_0.24.0.contributors: diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index aead8c48eb9b7..dd859dabc9c64 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_0241: -Whats new in 0.24.1 (February 3, 2019) --------------------------------------- +What's new in 0.24.1 (February 3, 2019) +--------------------------------------- .. warning:: @@ -33,7 +33,7 @@ This change will allow ``sort=True`` to mean "always sort" in a future release. The same change applies to :meth:`Index.difference` and :meth:`Index.symmetric_difference`, which would not sort the result when the values could not be compared. -The `sort` option for :meth:`Index.intersection` has changed in three ways. +The ``sort`` option for :meth:`Index.intersection` has changed in three ways. 1. The default has changed from ``True`` to ``False``, to restore the pandas 0.23.4 and earlier behavior of not sorting by default. @@ -55,7 +55,7 @@ Fixed regressions - Fixed regression in :class:`Index.intersection` incorrectly sorting the values by default (:issue:`24959`). - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). 
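The change to the ``sort`` option of :meth:`Index.intersection` described earlier in these 0.24.1 notes can be sketched as follows (a minimal illustration; per the full release notes, sorting is requested with ``sort=None`` rather than ``sort=True``):

.. code-block:: python

   import pandas as pd

   idx1 = pd.Index([2, 1, 0])
   idx2 = pd.Index([1, 0, 4])

   idx1.intersection(idx2)             # no longer sorted by default
   idx1.intersection(idx2, sort=None)  # sorts the result, when possible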
- Fixed regression in :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` where passing ``None`` failed to remove the axis name (:issue:`25034`) -- Fixed regression in :func:`to_timedelta` with `box=False` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) +- Fixed regression in :func:`to_timedelta` with ``box=False`` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) - Fixed regression where custom hashable types could not be used as column keys in :meth:`DataFrame.set_index` (:issue:`24969`) .. _whatsnew_0241.bug_fixes: diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index d1a893f99cff4..36684d465373c 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_0242: -Whats new in 0.24.2 (March 12, 2019) ------------------------------------- +What's new in 0.24.2 (March 12, 2019) +------------------------------------- .. warning:: @@ -51,7 +51,6 @@ Bug fixes - Bug where calling :meth:`Series.replace` on categorical data could return a ``Series`` with incorrect dimensions (:issue:`24971`) - -- **Reshaping** diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3cd920158f774..37b661b87068d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -14,7 +14,7 @@ What's new in 0.25.0 (July 18, 2019) .. warning:: - `Panel` has been fully removed. For N-D labeled data structures, please + ``Panel`` has been fully removed. For N-D labeled data structures, please use `xarray `_ .. warning:: @@ -33,10 +33,10 @@ Enhancements .. _whatsnew_0250.enhancements.agg_relabel: -Groupby aggregation with relabeling +GroupBy aggregation with relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has added special groupby behavior, known as "named aggregation", for naming the +pandas has added special groupby behavior, known as "named aggregation", for naming the output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`). .. ipython:: python @@ -53,7 +53,7 @@ output columns when applying multiple aggregation functions to specific columns Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer +aggregation function to apply. pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. .. ipython:: python @@ -85,7 +85,7 @@ See :ref:`groupby.aggregate.named` for more. .. _whatsnew_0250.enhancements.multiple_lambdas: -Groupby aggregation with multiple lambdas +GroupBy aggregation with multiple lambdas ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can now provide multiple lambda functions to a list-like aggregation in @@ -161,7 +161,7 @@ To restore the previous behaviour of a single threshold, set .. _whatsnew_0250.enhancements.json_normalize_with_max_level: -Json normalize with max_level param support +JSON normalize with max_level param support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`json_normalize` normalizes the provided input dict to all @@ -308,7 +308,7 @@ would be reassigned as -1. (:issue:`19387`) .. 
_whatsnew_0250.api_breaking.groupby_apply_first_group_once: -``Groupby.apply`` on ``DataFrame`` evaluates first group only once +``GroupBy.apply`` on ``DataFrame`` evaluates first group only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The implementation of :meth:`DataFrameGroupBy.apply() ` @@ -422,10 +422,10 @@ of ``object`` dtype. :attr:`Series.str` will now infer the dtype data *within* t .. _whatsnew_0250.api_breaking.groupby_categorical: -Categorical dtypes are preserved during groupby +Categorical dtypes are preserved during GroupBy ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) +Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. pandas now will preserve these dtypes. (:issue:`18502`) .. ipython:: python @@ -483,7 +483,7 @@ values are coerced to floating point, which may result in loss of precision. See :ref:`indexing.set_ops` for more. -``DataFrame`` groupby ffill/bfill no longer return group labels +``DataFrame`` GroupBy ffill/bfill no longer return group labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The methods ``ffill``, ``bfill``, ``pad`` and ``backfill`` of @@ -513,7 +513,7 @@ are returned. (:issue:`21521`) df.groupby("a").ffill() -``DataFrame`` describe on an empty categorical / object column will return top and freq +``DataFrame`` describe on an empty Categorical / object column will return top and freq ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When calling :meth:`DataFrame.describe` with an empty categorical / object @@ -540,19 +540,19 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 .. ipython:: python - df.describe() + df.describe() ``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has until now mostly defined string representations in a Pandas objects's +pandas has until now mostly defined string representations in a pandas objects' ``__str__``/``__unicode__``/``__bytes__`` methods, and called ``__str__`` from the ``__repr__`` method, if a specific ``__repr__`` method is not found. This is not needed for Python3. -In Pandas 0.25, the string representations of Pandas objects are now generally +In pandas 0.25, the string representations of pandas objects are now generally defined in ``__repr__``, and calls to ``__str__`` in general now pass the call on to the ``__repr__``, if a specific ``__str__`` method doesn't exist, as is standard for Python. -This change is backward compatible for direct usage of Pandas, but if you subclass -Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, +This change is backward compatible for direct usage of pandas, but if you subclass +pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). .. 
_whatsnew_0250.api_breaking.interval_indexing: @@ -881,7 +881,7 @@ Other API changes - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) -- Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +- Most pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) @@ -1085,7 +1085,6 @@ Conversion - Bug in :func:`DataFrame.astype()` when passing a dict of columns and types the ``errors`` parameter was ignored. (:issue:`25905`) - -- Strings ^^^^^^^ @@ -1139,8 +1138,8 @@ MultiIndex - Bug in which incorrect exception raised by :class:`Timedelta` when testing the membership of :class:`MultiIndex` (:issue:`24570`) - -I/O -^^^ +IO +^^ - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) @@ -1167,7 +1166,7 @@ I/O - Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). -- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. `PeriodIndex`) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) +- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. 
``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) - Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) - Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`) - Fixed bug in :meth:`DataFrame.to_parquet` which would raise a ``ValueError`` when the dataframe had no columns (:issue:`27339`) @@ -1182,9 +1181,8 @@ Plotting - Fixed bug causing plots of :class:`PeriodIndex` timeseries to fail if the frequency is a multiple of the frequency rule code (:issue:`14763`) - Fixed bug when plotting a :class:`DatetimeIndex` with ``datetime.timezone.utc`` timezone (:issue:`17173`) - -- -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) @@ -1262,7 +1260,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) - Use actual class name in repr of empty objects of a ``Series`` subclass (:issue:`27001`). -- Bug in :class:`DataFrame` where passing an object array of timezone-aware `datetime` objects would incorrectly raise ``ValueError`` (:issue:`13287`) +- Bug in :class:`DataFrame` where passing an object array of timezone-aware ``datetime`` objects would incorrectly raise ``ValueError`` (:issue:`13287`) .. _whatsnew_0.250.contributors: diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 944021ca0fcae..cc24ba5d6557c 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -6,13 +6,13 @@ What's new in 0.25.1 (August 21, 2019) These are the changes in pandas 0.25.1. See :ref:`release` for a full changelog including other versions of pandas. -I/O and LZMA -~~~~~~~~~~~~ +IO and LZMA +~~~~~~~~~~~ -Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue:`27575`). -Pandas will now warn, rather than raising an `ImportError` if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`. -A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python. -For example, on MacOS installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. +Some users may unknowingly have an incomplete Python installation lacking the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`). +pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. +A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python. 
+For example, on MacOS installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. .. _whatsnew_0251.bug_fixes: @@ -52,7 +52,7 @@ Conversion Interval ^^^^^^^^ -- Bug in :class:`IntervalIndex` where `dir(obj)` would raise ``ValueError`` (:issue:`27571`) +- Bug in :class:`IntervalIndex` where ``dir(obj)`` would raise ``ValueError`` (:issue:`27571`) Indexing ^^^^^^^^ @@ -67,8 +67,8 @@ Missing - Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type e.g. ``type(pandas.Series())`` (:issue:`27482`) -I/O -^^^ +IO +^^ - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) @@ -82,20 +82,20 @@ Plotting :meth:`pandas.plotting.deregister_matplotlib_converters` (:issue:`27481`). - Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`). -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed regression in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Bug in windowing over read-only arrays (:issue:`27766`) -- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`) +- Fixed segfault in ``pandas.core.groupby.DataFrameGroupBy.quantile`` when an invalid quantile was passed (:issue:`27470`) Reshaping ^^^^^^^^^ - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`) -- Bug :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing `tolerance` kwarg (:issue:`27642`) +- Bug where :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing ``tolerance`` kwarg (:issue:`27642`) - Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. (:issue:`27500`) - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`) - Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index c0c68ce4b1f44..ab6aaebe4ed06 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -8,7 +8,7 @@ including other versions of pandas. .. note:: - Pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). + pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). .. _whatsnew_0252.bug_fixes: @@ -21,14 +21,14 @@ Indexing - Fix regression in :meth:`DataFrame.reindex` not following the ``limit`` argument (:issue:`28631`); a short sketch follows below.
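The sketch referenced in the ``DataFrame.reindex`` entry above; ``limit`` caps how far a fill method propagates when reindexing (a minimal, illustrative example):

.. code-block:: python

   import pandas as pd

   s = pd.Series([10.0, 20.0], index=[0, 1])
   # forward-fill, but propagate each value into at most one new label:
   # label 2 receives 20.0, labels 3 and 4 remain NaN
   s.reindex(range(5), method="ffill", limit=1)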
- Fix regression in :meth:`RangeIndex.get_indexer` for decreasing :class:`RangeIndex` where target values may be improperly identified as missing/present (:issue:`28678`) -I/O -^^^ +IO +^^ - Fix regression in notebook display where ``<th>
          `` tags were missing for :attr:`DataFrame.index` values (:issue:`28204`). - Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) - Fix :meth:`~DataFrame.to_csv` with ``ExtensionArray`` with list-like values (:issue:`28840`). -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst index f7f54198a0f82..e028c08e1e85c 100644 --- a/doc/source/whatsnew/v0.25.3.rst +++ b/doc/source/whatsnew/v0.25.3.rst @@ -11,7 +11,7 @@ including other versions of pandas. Bug fixes ~~~~~~~~~ -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`DataFrameGroupBy.quantile` where NA values in the grouping could cause segfaults or incorrect results (:issue:`28882`) diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst index 7ccb141260f18..7447a10fa1d6b 100644 --- a/doc/source/whatsnew/v0.5.0.rst +++ b/doc/source/whatsnew/v0.5.0.rst @@ -9,7 +9,7 @@ Version 0.5.0 (October 24, 2011) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 New features diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index f984b9ad71b63..253ca4d4188e5 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -8,14 +8,14 @@ Version 0.6.0 (November 25, 2011) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 New features ~~~~~~~~~~~~ - :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` - :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (:issue:`313`) -- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to to DataFrame (:issue:`296`) +- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to DataFrame (:issue:`296`) - :ref:`Added ` ``Series.isin`` function which checks if each value is contained in a passed sequence (:issue:`289`) - :ref:`Added ` ``float_format`` option to ``Series.to_string`` - :ref:`Added ` ``skip_footer`` (:issue:`291`) and ``converters`` (:issue:`343`) options to ``read_csv`` and ``read_table`` @@ -52,7 +52,7 @@ New features Performance enhancements ~~~~~~~~~~~~~~~~~~~~~~~~ - VBENCH Cythonized ``cache_readonly``, resulting in substantial micro-performance enhancements throughout the code base (:issue:`361`) -- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than `np.apply_along_axis` (:issue:`309`) +- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than ``np.apply_along_axis`` (:issue:`309`) - VBENCH Improved performance of ``MultiIndex.from_tuples`` - VBENCH Special Cython matrix iterator for applying arbitrary reduction operations - VBENCH + DOCUMENT Add ``raw`` option to ``DataFrame.apply`` for getting better performance when diff --git a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst index 8eea0a07f1f79..139c6e2d1cb0c 100644 --- a/doc/source/whatsnew/v0.6.1.rst +++ b/doc/source/whatsnew/v0.6.1.rst @@ -16,28 +16,28 @@ New features - Add PyQt table widget to sandbox 
(:issue:`435`) - DataFrame.align can :ref:`accept Series arguments ` and an :ref:`axis option ` (:issue:`461`) -- Implement new :ref:`SparseArray ` and `SparseList` +- Implement new :ref:`SparseArray ` and ``SparseList`` data structures. SparseSeries now derives from SparseArray (:issue:`463`) - :ref:`Better console printing options ` (:issue:`453`) - Implement fast :ref:`data ranking ` for Series and DataFrame, fast versions of scipy.stats.rankdata (:issue:`428`) -- Implement `DataFrame.from_items` alternate +- Implement ``DataFrame.from_items`` alternate constructor (:issue:`444`) - DataFrame.convert_objects method for :ref:`inferring better dtypes ` for object columns (:issue:`302`) -- Add :ref:`rolling_corr_pairwise ` function for +- Add :ref:`rolling_corr_pairwise ` function for computing Panel of correlation matrices (:issue:`189`) - Add :ref:`margins ` option to :ref:`pivot_table ` for computing subgroup aggregates (:issue:`114`) - Add ``Series.from_csv`` function (:issue:`482`) -- :ref:`Can pass ` DataFrame/DataFrame and +- :ref:`Can pass ` DataFrame/DataFrame and DataFrame/Series to rolling_corr/rolling_cov (GH #462) - MultiIndex.get_level_values can :ref:`accept the level name ` Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Improve memory usage of `DataFrame.describe` (do not copy data +- Improve memory usage of ``DataFrame.describe`` (do not copy data unnecessarily) (PR #425) - Optimize scalar value lookups in the general case by 25% or more in Series diff --git a/doc/source/whatsnew/v0.7.0.rst b/doc/source/whatsnew/v0.7.0.rst index a193b8049e951..2fe686d8858a2 100644 --- a/doc/source/whatsnew/v0.7.0.rst +++ b/doc/source/whatsnew/v0.7.0.rst @@ -20,7 +20,7 @@ New features ``DataFrame.append`` (:issue:`468`, :issue:`479`, :issue:`273`) - :ref:`Can ` pass multiple DataFrames to - `DataFrame.append` to concatenate (stack) and multiple Series to + ``DataFrame.append`` to concatenate (stack) and multiple Series to ``Series.append`` too - :ref:`Can` pass list of dicts (e.g., a @@ -282,7 +282,7 @@ Performance improvements - Substantially improve performance of multi-GroupBy aggregation when a Python function is passed, reuse ndarray object in Cython (:issue:`496`) - Can store objects indexed by tuples and floats in HDFStore (:issue:`492`) -- Don't print length by default in Series.to_string, add `length` option (:issue:`489`) +- Don't print length by default in Series.to_string, add ``length`` option (:issue:`489`) - Improve Cython code for multi-groupby to aggregate without having to sort the data (:issue:`93`) - Improve MultiIndex reindexing speed by storing tuples in the MultiIndex, diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index 5ed48c0d8d6d9..4ca31baf560bb 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -23,7 +23,8 @@ New features .. code-block:: python from pandas.tools.plotting import scatter_matrix - scatter_matrix(df, alpha=0.2) # noqa F821 + + scatter_matrix(df, alpha=0.2) # noqa F821 - Add ``stacked`` argument to Series and DataFrame's ``plot`` method for @@ -31,12 +32,12 @@ New features .. code-block:: python - df.plot(kind='bar', stacked=True) # noqa F821 + df.plot(kind="bar", stacked=True) # noqa F821 .. code-block:: python - df.plot(kind='barh', stacked=True) # noqa F821 + df.plot(kind="barh", stacked=True) # noqa F821 - Add log x and y :ref:`scaling options ` to @@ -52,9 +53,9 @@ Reverted some changes to how NA values (represented typically as ``NaN`` or .. 
ipython:: python - series = pd.Series(['Steve', np.nan, 'Joe']) - series == 'Steve' - series != 'Steve' + series = pd.Series(["Steve", np.nan, "Joe"]) + series == "Steve" + series != "Steve" In comparisons, NA / NaN will always come through as ``False`` except with ``!=`` which is ``True``. *Be very careful* with boolean arithmetic, especially @@ -63,7 +64,7 @@ filter into boolean array operations if you are worried about this: .. ipython:: python - mask = series == 'Steve' + mask = series == "Steve" series[mask & series.notnull()] While propagating NA in comparisons may seem like the right behavior to some @@ -82,15 +83,18 @@ Series, to be more consistent with the ``groupby`` behavior with DataFrame: .. ipython:: python :okwarning: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df - grouped = df.groupby('A')['C'] + grouped = df.groupby("A")["C"] grouped.describe() - grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values + grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values .. _whatsnew_0.7.3.contributors: diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 2a49315cc3b12..781054fc4de7c 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -69,19 +69,19 @@ Time Series changes and improvements series. Replaces now deprecated DateRange class - New ``PeriodIndex`` and ``Period`` classes for representing :ref:`time spans ` and performing **calendar logic**, - including the `12 fiscal quarterly frequencies `. + including the ``12 fiscal quarterly frequencies ``. This is a partial port of, and a substantial enhancement to, elements of the scikits.timeseries code base. Support for conversion between PeriodIndex and DatetimeIndex -- New Timestamp data type subclasses `datetime.datetime`, providing the same +- New Timestamp data type subclasses ``datetime.datetime``, providing the same interface while enabling working with nanosecond-resolution data. Also provides :ref:`easy time zone conversions `. - Enhanced support for :ref:`time zones `. Add - `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All + ``tz_convert`` and ``tz_localize`` methods to TimeSeries and DataFrame. All timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time zone set will be localized to local time. Time zone conversions are therefore essentially free. User needs to know very little about pytz library now; only - time zone names as as strings are required. Time zone-aware timestamps are + time zone names as strings are required. Time zone-aware timestamps are equal if and only if their UTC timestamps match. Operations between time zone-aware time series with different time zones will result in a UTC-indexed time series. 
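A small sketch of the time zone semantics described above, using current API names (``tz_convert`` and ``Index.union``); the exact representation of the UTC tzinfo object varies between versions:

.. code-block:: python

   import pandas as pd

   rng = pd.date_range("2012-01-01", periods=3, freq="D", tz="UTC")
   ts = pd.Series([1.0, 2.0, 3.0], index=rng)

   # converting the zone keeps the UTC instants, so the indexes compare equal
   ts_eastern = ts.tz_convert("US/Eastern")
   bool((ts_eastern.index == ts.index).all())  # True

   # combining indexes with different time zones yields a UTC-based index
   shifted = pd.date_range("2012-01-02", periods=3, freq="D", tz="US/Eastern")
   ts.index.union(shifted).tz  # UTC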
@@ -91,7 +91,7 @@ Time Series changes and improvements matplotlib-based plotting code - New ``date_range``, ``bdate_range``, and ``period_range`` :ref:`factory functions ` -- Robust **frequency inference** function `infer_freq` and ``inferred_freq`` +- Robust **frequency inference** function ``infer_freq`` and ``inferred_freq`` property of DatetimeIndex, with option to infer frequency on construction of DatetimeIndex - to_datetime function efficiently **parses array of strings** to @@ -159,7 +159,8 @@ New plotting methods .. code-block:: python import pandas as pd - fx = pd.read_pickle('data/fx_prices') + + fx = pd.read_pickle("data/fx_prices") import matplotlib.pyplot as plt ``Series.plot`` now supports a ``secondary_y`` option: @@ -168,20 +169,21 @@ New plotting methods plt.figure() - fx['FR'].plot(style='g') + fx["FR"].plot(style="g") - fx['IT'].plot(style='k--', secondary_y=True) + fx["IT"].plot(style="k--", secondary_y=True) Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot types. For example, ``'kde'`` is a new option: .. ipython:: python - s = pd.Series(np.concatenate((np.random.randn(1000), - np.random.randn(1000) * 0.5 + 3))) + s = pd.Series( + np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3)) + ) plt.figure() s.hist(density=True, alpha=0.2) - s.plot(kind='kde') + s.plot(kind="kde") See :ref:`the plotting page ` for much more. @@ -205,7 +207,8 @@ with code using scalar values because you are handing control over to NumPy: .. ipython:: python import datetime - rng = pd.date_range('1/1/2000', periods=10) + + rng = pd.date_range("1/1/2000", periods=10) rng[5] isinstance(rng[5], datetime.datetime) rng_asarray = np.asarray(rng) @@ -251,7 +254,7 @@ type. See `matplotlib documentation .. ipython:: python - rng = pd.date_range('1/1/2000', periods=10) + rng = pd.date_range("1/1/2000", periods=10) rng np.asarray(rng) converted = np.asarray(rng, dtype=object) diff --git a/doc/source/whatsnew/v0.9.0.rst b/doc/source/whatsnew/v0.9.0.rst index 565b965c116db..44ded51e31fda 100644 --- a/doc/source/whatsnew/v0.9.0.rst +++ b/doc/source/whatsnew/v0.9.0.rst @@ -8,7 +8,7 @@ Version 0.9.0 (October 7, 2012) This is a major release from 0.8.1 and includes several new features and enhancements along with a large number of bug fixes. New features include -vectorized unicode encoding/decoding for `Series.str`, `to_latex` method to +vectorized unicode encoding/decoding for ``Series.str``, ``to_latex`` method to DataFrame, more flexible parsing of boolean values, and enabling the download of options data from Yahoo! Finance. @@ -41,9 +41,11 @@ API changes import io - data = ('0,0,1\n' - '1,1,0\n' - '0,1,0') + data = """ + 0,0,1 + 1,1,0 + 0,1,0 + """ df = pd.read_csv(io.StringIO(data), header=None) df @@ -59,7 +61,7 @@ API changes s1 = pd.Series([1, 2, 3]) s1 - s2 = pd.Series(s1, index=['foo', 'bar', 'baz']) + s2 = pd.Series(s1, index=["foo", "bar", "baz"]) s2 - Deprecated ``day_of_year`` API removed from PeriodIndex, use ``dayofyear`` diff --git a/doc/source/whatsnew/v0.9.1.rst b/doc/source/whatsnew/v0.9.1.rst index 3b2924d175cdf..6b05e5bcded7e 100644 --- a/doc/source/whatsnew/v0.9.1.rst +++ b/doc/source/whatsnew/v0.9.1.rst @@ -15,7 +15,7 @@ DataFrame. New features ~~~~~~~~~~~~ - - `Series.sort`, `DataFrame.sort`, and `DataFrame.sort_index` can now be + - ``Series.sort``, ``DataFrame.sort``, and ``DataFrame.sort_index`` can now be specified in a per-column manner to support multiple sort orders (:issue:`928`) .. 
code-block:: ipython @@ -34,8 +34,8 @@ New features 1 1 0 0 5 1 0 0 - - `DataFrame.rank` now supports additional argument values for the - `na_option` parameter so missing values can be assigned either the largest + - ``DataFrame.rank`` now supports additional argument values for the + ``na_option`` parameter so missing values can be assigned either the largest or the smallest rank (:issue:`1508`, :issue:`2159`) .. ipython:: python @@ -51,10 +51,10 @@ New features df.rank(na_option='bottom') - - DataFrame has new `where` and `mask` methods to select values according to a + - DataFrame has new ``where`` and ``mask`` methods to select values according to a given boolean mask (:issue:`2109`, :issue:`2151`) - DataFrame currently supports slicing via a boolean vector the same length as the DataFrame (inside the `[]`). + DataFrame currently supports slicing via a boolean vector the same length as the DataFrame (inside the ``[]``). The returned DataFrame has the same number of columns as the original, but is sliced on its index. .. ipython:: python @@ -67,8 +67,8 @@ New features If a DataFrame is sliced with a DataFrame based boolean condition (with the same size as the original DataFrame), then a DataFrame the same size (index and columns) as the original is returned, with - elements that do not meet the boolean condition as `NaN`. This is accomplished via - the new method `DataFrame.where`. In addition, `where` takes an optional `other` argument for replacement. + elements that do not meet the boolean condition as ``NaN``. This is accomplished via + the new method ``DataFrame.where``. In addition, ``where`` takes an optional ``other`` argument for replacement. .. ipython:: python @@ -78,8 +78,8 @@ New features df.where(df>0,-df) - Furthermore, `where` now aligns the input boolean condition (ndarray or DataFrame), such that partial selection - with setting is possible. This is analogous to partial setting via `.ix` (but on the contents rather than the axis labels) + Furthermore, ``where`` now aligns the input boolean condition (ndarray or DataFrame), such that partial selection + with setting is possible. This is analogous to partial setting via ``.ix`` (but on the contents rather than the axis labels) .. ipython:: python @@ -87,7 +87,7 @@ New features df2[ df2[1:4] > 0 ] = 3 df2 - `DataFrame.mask` is the inverse boolean operation of `where`. + ``DataFrame.mask`` is the inverse boolean operation of ``where``. .. ipython:: python @@ -103,9 +103,9 @@ New features - Added option to disable pandas-style tick locators and formatters - using `series.plot(x_compat=True)` or `pandas.plot_params['x_compat'] = - True` (:issue:`2205`) - - Existing TimeSeries methods `at_time` and `between_time` were added to + using ``series.plot(x_compat=True)`` or ``pandas.plot_params['x_compat'] = + True`` (:issue:`2205`) + - Existing TimeSeries methods ``at_time`` and ``between_time`` were added to DataFrame (:issue:`2149`) - DataFrame.dot can now accept ndarrays (:issue:`2042`) - DataFrame.drop now supports non-unique indexes (:issue:`2101`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4f0ca97310d85..6512e4cce02a9 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -18,7 +18,7 @@ including other versions of pandas. New deprecation policy ~~~~~~~~~~~~~~~~~~~~~~ -Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to +Starting with pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to version releases. 
Briefly, * Deprecations will be introduced in minor releases (e.g. 1.1.0, 1.2.0, 2.1.0, ...) @@ -46,7 +46,7 @@ We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the routine using `Numba `__ instead of Cython. Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and the data set is larger (1 million rows or greater). For more details, see -:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`) +:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`) .. _whatsnew_100.custom_window: @@ -57,7 +57,7 @@ We've added a :func:`pandas.api.indexers.BaseIndexer` class that allows users to window bounds are created during ``rolling`` operations. Users can define their own ``get_window_bounds`` method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate the start and end indices used for each window during the rolling aggregation. For more details and example usage, see -the :ref:`custom window rolling documentation ` +the :ref:`custom window rolling documentation ` .. _whatsnew_100.to_markdown: @@ -196,7 +196,7 @@ You can use the alias ``"boolean"`` as well. .. _whatsnew_100.convert_dtypes: -``convert_dtypes`` method to ease use of supported extension dtypes +Method ``convert_dtypes`` to ease use of supported extension dtypes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to encourage use of the extension dtypes ``StringDtype``, @@ -250,7 +250,7 @@ Other enhancements - :func:`read_excel` now can read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation `. Closes :issue:`8540`. - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) -- DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) +- DataFrame constructor preserve ``ExtensionArray`` dtype with ``ExtensionArray`` (:issue:`11363`) - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) @@ -610,7 +610,7 @@ When :class:`Categorical` contains ``np.nan``, Default dtype of empty :class:`pandas.Series` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Initialising an empty :class:`pandas.Series` without specifying a dtype will raise a `DeprecationWarning` now +Initialising an empty :class:`pandas.Series` without specifying a dtype will raise a ``DeprecationWarning`` now (:issue:`17261`). The default dtype will change from ``float64`` to ``object`` in future releases so that it is consistent with the behaviour of :class:`DataFrame` and :class:`Index`. @@ -676,7 +676,7 @@ depending on how the results are cast back to the original dtype. Increased minimum version for Python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). +pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). .. 
_whatsnew_100.api_breaking.deps: @@ -749,7 +749,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Build changes ^^^^^^^^^^^^^ -Pandas has added a `pyproject.toml `_ file and will no longer include +pandas has added a `pyproject.toml `_ file and will no longer include cythonized files in the source distribution uploaded to PyPI (:issue:`28341`, :issue:`20775`). If you're installing a built distribution (wheel) or via conda, this shouldn't have any effect on you. If you're building pandas from source, you should no longer need to install Cython into your build environment before calling ``pip install pandas``. @@ -763,7 +763,7 @@ Other API changes - :class:`core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) -- In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). +- In order to improve tab-completion, pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). To see which attributes are excluded, see an object's ``_deprecations`` attribute, for example ``pd.DataFrame._deprecations`` (:issue:`28805`). - The returned dtype of :func:`unique` now matches the input dtype. (:issue:`27874`) - Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`). @@ -974,7 +974,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - The 'outer' method on Numpy ufuncs, e.g. 
``np.subtract.outer`` operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`) - Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`) - Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`) -- Changed the default value for the `raw` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`) +- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`) - Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`) - Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`) - Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`) @@ -1058,7 +1058,7 @@ Datetimelike - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) - Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) -- Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) +- Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype ``'timedelta64[ns]'`` (:issue:`28049`) - Bug in :func:`core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) @@ -1066,7 +1066,7 @@ Datetimelike - Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) -- Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) +- Bug in :func:`pandas.to_datetime` failing for ``deques`` when using ``cache=True`` (the default) (:issue:`29403`) - Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a 
:class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) - Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`) - Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`) @@ -1082,23 +1082,21 @@ Timedelta ^^^^^^^^^ - Bug in subtracting a :class:`TimedeltaIndex` or :class:`TimedeltaArray` from a ``np.datetime64`` object (:issue:`29558`) - -- Timezones ^^^^^^^^^ - -- Numeric ^^^^^^^ - Bug in :meth:`DataFrame.quantile` with zero-column :class:`DataFrame` incorrectly raising (:issue:`23925`) - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth:`DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`) -- Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) +- Bug in :class:`DataFrame` logical operations (``&``, ``|``, ``^``) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) - Bug in :meth:`Series.var` not computing the right value with a nullable integer dtype series not passing through ddof argument (:issue:`29128`) -- Improved error message when using `frac` > 1 and `replace` = False (:issue:`27451`) +- Improved error message when using ``frac`` > 1 and ``replace`` = False (:issue:`27451`) - Bug in numeric indexes resulted in it being possible to instantiate an :class:`Int64Index`, :class:`UInt64Index`, or :class:`Float64Index` with an invalid dtype (e.g. 
datetime-like) (:issue:`29539`) - Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) - Bug in :class:`NumericIndex` construction that caused indexing to fail when integers in the ``np.uint64`` range were used (:issue:`28023`) @@ -1113,7 +1111,6 @@ Numeric Conversion ^^^^^^^^^^ -- - Strings @@ -1137,8 +1134,8 @@ Indexing - Bug in assignment using a reverse slicer (:issue:`26939`) - Bug in :meth:`DataFrame.explode` would duplicate frame in the presence of duplicates in the index (:issue:`28010`) -- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) -- Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) +- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a ``Period`` (:issue:`28323`) (:issue:`28337`) +- Fix assignment of column via ``.loc`` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) - :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) - Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`) @@ -1152,18 +1149,17 @@ Indexing Missing ^^^^^^^ -- - MultiIndex ^^^^^^^^^^ - Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) -- Series and MultiIndex `.drop` with `MultiIndex` raise exception if labels not in given in level (:issue:`8594`) +- Series and MultiIndex ``.drop`` with ``MultiIndex`` raise exception if labels not in given in level (:issue:`8594`) - -I/O -^^^ +IO +^^ - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) - Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) @@ -1171,7 +1167,7 @@ I/O - Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. 
(:issue:`25099`) - Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`) - Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`) -- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with `engine='fastparquet'` if the file did not already exist (:issue:`28326`) +- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with ``engine='fastparquet'`` if the file did not already exist (:issue:`28326`) - Bug in :func:`read_hdf` closing stores that it didn't open when Exceptions are raised (:issue:`28699`) - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) @@ -1183,9 +1179,9 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) -- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) +- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by ``record_path`` would raise a ``TypeError`` (:issue:`30148`) - :func:`read_excel` now accepts binary data (:issue:`15914`) -- Bug in :meth:`read_csv` in which encoding handling was limited to just the string `utf-16` for the C engine (:issue:`24130`) +- Bug in :meth:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`) Plotting ^^^^^^^^ @@ -1203,7 +1199,7 @@ Plotting - Allow :meth:`DataFrame.plot.scatter` to plot ``objects`` and ``datetime`` type data (:issue:`18755`, :issue:`30391`) - Bug in :meth:`DataFrame.hist`, ``xrot=0`` does not work with ``by`` and subplots (:issue:`30288`). -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) @@ -1236,7 +1232,7 @@ Reshaping - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). -- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) +- Better error message in :func:`get_dummies` when ``columns`` isn't a list-like value (:issue:`28383`) - Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. 
(:issue:`25760`, :issue:`28956`) - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ``ValueError`` (:issue:`28664`) - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) @@ -1244,8 +1240,8 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) -- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) -- Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) +- Improved error message and docstring in :func:`cut` and :func:`qcut` when ``labels=True`` (:issue:`13318`) +- Bug in missing ``fill_na`` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) Sparse ^^^^^^ diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index c3f144e2f0cb3..3f7c6e85e14ca 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -47,7 +47,7 @@ Fixed regressions .. --------------------------------------------------------------------------- -Indexing with Nullable Boolean Arrays +Indexing with nullable boolean arrays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Previously indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``, however this is now permitted with ``NA`` being treated as ``False``. (:issue:`31503`) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 85b29a58a1f15..e054ac830ce41 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_110: -What's new in 1.1.0 (??) ------------------------- +What's new in 1.1.0 (July 28, 2020) +----------------------------------- These are the changes in pandas 1.1.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -17,7 +17,7 @@ Enhancements KeyErrors raised by loc specify missing labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, if labels were missing for a loc call, a KeyError was raised stating that this was no longer supported. +Previously, if labels were missing for a ``.loc`` call, a KeyError was raised stating that this was no longer supported. Now the error message also includes a list of the missing labels (max 10 items, display width 80 characters). See :issue:`34272`. @@ -42,8 +42,8 @@ For example, the below now works: .. _whatsnew_110.period_index_partial_string_slicing: -Nonmonotonic PeriodIndex Partial String Slicing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Non-monotonic PeriodIndex partial string slicing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`) @@ -66,10 +66,10 @@ For example: .. 
_whatsnew_110.dataframe_or_series_comparing: -Comparing two `DataFrame` or two `Series` and summarizing the differences -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Comparing two ``DataFrame`` or two ``Series`` and summarizing the differences +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing two `DataFrame` or two `Series` (:issue:`30429`) +We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing two ``DataFrame`` or two ``Series`` (:issue:`30429`) .. ipython:: python @@ -116,23 +116,21 @@ compatibility (:issue:`3729`) .. ipython:: python - # Default `dropna` is set to True, which will exclude NaNs in keys + # Default ``dropna`` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() - # In order to allow NaN in keys, set `dropna` to False + # In order to allow NaN in keys, set ``dropna`` to False df_dropna.groupby(by=["b"], dropna=False).sum() The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. -.. versionadded:: 1.1.0 - .. _whatsnew_110.key_sorting: Sorting with keys ^^^^^^^^^^^^^^^^^ -We've added a ``key`` argument to the DataFrame and Series sorting methods, including +We've added a ``key`` argument to the :class:`DataFrame` and :class:`Series` sorting methods, including :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied column-by-column to each column used for sorting, before sorting is performed (:issue:`27237`). @@ -157,8 +155,8 @@ method, we get s.sort_values(key=lambda x: x.str.lower()) -When applied to a `DataFrame`, they key is applied per-column to all columns or a subset if -`by` is specified, e.g. +When applied to a ``DataFrame``, the key is applied per-column to all columns or a subset if +``by`` is specified, e.g. .. ipython:: python @@ -217,14 +215,14 @@ For example: Grouper and resample now supports the arguments origin and offset ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Grouper` and :class:`DataFrame.resample` now supports the arguments ``origin`` and ``offset``. It let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) +:class:`Grouper` and :meth:`DataFrame.resample` now support the arguments ``origin`` and ``offset``. These let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like ``30D``) or that divide a day (like ``90s`` or ``1min``). But it can create inconsistencies with some frequencies that do not meet this criterion. To change this behavior you can now specify a fixed timestamp with the argument ``origin``.
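As a rough sketch of the anchoring rule (a hypothetical helper, not the actual pandas implementation): bin edges fall at ``origin + k * freq``, and the first edge is the largest one not after the start of the series.

.. code-block:: python

    import pandas as pd

    def first_bin_edge(first_ts, freq, origin):
        # Hypothetical helper: bin edges are origin + k * freq; take the
        # largest edge that does not fall after the first timestamp.
        step = pd.Timedelta(freq)
        k = (pd.Timestamp(first_ts) - pd.Timestamp(origin)) // step
        return pd.Timestamp(origin) + k * step

    # Anchored at midnight of 2000-01-01 rather than at the series start:
    first_bin_edge("2000-10-01 23:30:00", "17min", "2000-01-01")

With ``origin='epoch'`` the same rule applies, counted from ``1970-01-01``.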
-Two arguments are now deprecated (more information in the documentation of :class:`DataFrame.resample`): +Two arguments are now deprecated (more information in the documentation of :meth:`DataFrame.resample`): - ``base`` should be replaced by ``offset``. -- ``loffset`` should be replaced by directly adding an offset to the index DataFrame after being resampled. +- ``loffset`` should be replaced by directly adding an offset to the index :class:`DataFrame` after being resampled. Small example of the use of ``origin``: @@ -250,7 +248,7 @@ Resample using a fixed origin: ts.resample('17min', origin='epoch').sum() ts.resample('17min', origin='2000-01-01').sum() -If needed you can adjust the bins with the argument ``offset`` (a Timedelta) that would be added to the default ``origin``. +If needed you can adjust the bins with the argument ``offset`` (a :class:`Timedelta`) that would be added to the default ``origin``. For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. @@ -276,17 +274,25 @@ change, as ``fsspec`` will still bring in the same packages as before. Other enhancements ^^^^^^^^^^^^^^^^^^ -- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) -- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) -- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) -- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) -- Add :meth:`ExtensionArray.argmax` and :meth:`ExtensionArray.argmin` (:issue:`24382`) +- Compatibility with matplotlib 3.3.0 (:issue:`34850`) +- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`) +- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) +- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). +- Added :meth:`DataFrame.value_counts` (:issue:`5377`) +- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. +- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) +- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`) +- :class:`~pandas.io.formats.style.Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) +- :meth:`~pandas.io.formats.style.Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) +- When writing directly to a sqlite connection :meth:`DataFrame.to_sql` now supports the ``multi`` method (:issue:`29921`) +- :class:`pandas.errors.OptionError` is now exposed in ``pandas.errors`` (:issue:`27553`) +- Added :meth:`api.extensions.ExtensionArray.argmax` and :meth:`api.extensions.ExtensionArray.argmin` (:issue:`24382`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) -- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). 
+- :class:`Series.str` now has a ``fullmatch`` method that matches a regular expression against the entire string in each row of the :class:`Series`, similar to ``re.fullmatch`` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) -- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). +- :meth:`Index.union` will now raise ``RuntimeWarning`` for :class:`MultiIndex` objects if the objects inside are unsortable. Pass ``sort=False`` to suppress this warning (:issue:`33015`) +- Added :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that return a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). @@ -300,48 +306,52 @@ Other enhancements - :meth:`melt` has gained an ``ignore_index`` (default ``True``) argument that, if set to ``False``, prevents the method from dropping the index (:issue:`17440`). - :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`, such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) -- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) +- :meth:`~pandas.core.groupby.DataFrameGroupBy.transform` and :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) -- :class:`~pandas.core.groupby.generic.DataFrameGroupBy` and :class:`~pandas.core.groupby.generic.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) +- :class:`~pandas.core.groupby.DataFrameGroupBy` and :class:`~pandas.core.groupby.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) - :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) -- The ``ExtensionArray`` class has now an :meth:`~pandas.arrays.ExtensionArray.equals` - method, similarly to :meth:`Series.equals` (:issue:`27081`). -- The minimum suppported dta version has increased to 105 in :meth:`~pandas.io.stata.read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). -- :meth:`~pandas.core.frame.DataFrame.to_stata` supports compression using the ``compression`` +- Added :class:`api.extension.ExtensionArray.equals` to the extension array interface, similar to :meth:`Series.equals` (:issue:`27081`) +- The minimum supported dta version has increased to 105 in :func:`read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`).
+- :meth:`~DataFrame.to_stata` supports compression using the ``compression`` keyword argument. Compression can either be inferred or explicitly set using a string or a dictionary containing both the method and any additional arguments that are passed to the compression library. Compression was also added to the low-level Stata-file writers :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). -- :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). -- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). -- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) -- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). +- :meth:`Series.plot` and :meth:`DataFrame.plot` now accept ``xlabel`` and ``ylabel`` parameters to present labels on the x and y axes (:issue:`9093`). +- Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable (:issue:`11704`) +- Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) -- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). -- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). +- :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- :func:`read_json` now accepts an ``nrows`` parameter (:issue:`33916`). - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). -- :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). -- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). +- :func:`read_gbq` now allows disabling the progress bar (:issue:`33360`). +- :func:`read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). +- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ``ddof`` to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`).
- :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) -- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). -- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) +- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similar to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). +- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`) +- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable Boolean dtype (:issue:`34859`) - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) +- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) +- :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) .. --------------------------------------------------------------------------- -.. _whatsnew_110.api: +.. _whatsnew_110.notable_bug_fixes: -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Notable bug fixes +~~~~~~~~~~~~~~~~~ -``MultiIndex.get_indexer`` interprets `method` argument differently +These are bug fixes that might have notable behavior changes. + +``MultiIndex.get_indexer`` interprets ``method`` argument correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`). @@ -401,15 +411,13 @@ And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'` df.reindex(mi_2, method='pad') -- - -.. _whatsnew_110.api_breaking.indexing_raises_key_errors: +.. _whatsnew_110.notable_bug_fixes.indexing_raises_key_errors: -Failed Label-Based Lookups Always Raise KeyError +Failed label-based lookups always raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Label lookups ``series[key]``, ``series.loc[key]`` and ``frame.loc[key]`` -used to raises either ``KeyError`` or ``TypeError`` depending on the type of +used to raise either ``KeyError`` or ``TypeError`` depending on the type of key and type of :class:`Index`. These now consistently raise ``KeyError`` (:issue:`31867`) .. ipython:: python @@ -473,11 +481,14 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss ... KeyError: Timestamp('1970-01-01 00:00:00') -.. 
_whatsnew_110.api_breaking.indexing_int_multiindex_raises_key_errors: + +Similarly, :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) + +.. _whatsnew_110.notable_bug_fixes.indexing_int_multiindex_raises_key_errors: Failed Integer Lookups on MultiIndex Raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Indexing with integers with a :class:`MultiIndex` that has a integer-dtype +Indexing with integers with a :class:`MultiIndex` that has an integer-dtype first level incorrectly failed to raise ``KeyError`` when one or more of those integer keys is not present in the first level of the index (:issue:`33539`) @@ -505,12 +516,14 @@ those integer keys is not present in the first level of the index (:issue:`33539 :meth:`DataFrame.merge` preserves right frame's row order ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) +:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`) .. ipython:: python - left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) - right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) + left_df = pd.DataFrame({'animal': ['dog', 'pig'], + 'max_speed': [40, 11]}) + right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], + 'max_speed': [80, 11]}) left_df right_df @@ -531,12 +544,12 @@ those integer keys is not present in the first level of the index (:issue:`33539 .. --------------------------------------------------------------------------- -.. _whatsnew_110.api_breaking.assignment_to_multiple_columns: +.. _whatsnew_110.notable_bug_fixes.assignment_to_multiple_columns: Assignment to multiple columns of a DataFrame when some columns do not exist ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns would be constructed with the right values. (:issue:`13658`) +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns will be constructed with the right values. (:issue:`13658`) .. ipython:: python @@ -562,7 +575,7 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns df[['a', 'c']] = 1 df -.. _whatsnew_110.api_breaking.groupby_consistency: +.. _whatsnew_110.notable_bug_fixes.groupby_consistency: Consistency across groupby reductions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -609,7 +622,7 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() -The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a `DataFrame` instead of a `Series`. (:issue:`32599`) +The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. 
(:issue:`32599`) *Previous behavior*: @@ -628,7 +641,44 @@ The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index df.groupby("a", as_index=False).size() -.. _whatsnew_110.api_breaking.apply_applymap_first_once: +.. _whatsnew_110.api_breaking.groupby_results_lost_as_index_false: + +:meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost results with ``as_index=False`` when relabeling columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously :meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was +set to ``False`` and the result columns were relabeled. In this case the result values were replaced with +the previous index (:issue:`32240`). + +.. ipython:: python + + df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [2]: grouped = df.groupby("key", as_index=False) + In [3]: result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + In [4]: result + Out[4]: + min_val + 0 x + 1 y + 2 z + +*New behavior*: + +.. ipython:: python + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + result + + +.. _whatsnew_110.notable_bug_fixes.apply_applymap_first_once: apply and applymap on ``DataFrame`` evaluates first row/column only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -666,43 +716,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) -.. _whatsnew_110.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` - will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) -- Added :meth:`DataFrame.value_counts` (:issue:`5377`) -- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) -- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) -- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. -- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) -- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). -- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`31126`) -- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`33610`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) - now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. 
- Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) -- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) -- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) -- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) -- Combining a ``Categorical`` with integer categories and which contains missing values - with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` - will now result in a float column instead of an object dtyped column (:issue:`33607`) -- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. -- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` - (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) -- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) -- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). - - Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -773,7 +786,7 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -Development Changes +Development changes ^^^^^^^^^^^^^^^^^^^ - The minimum version of Cython is now the most recent bug-fix version (0.29.16) (:issue:`33334`). @@ -784,47 +797,47 @@ Development Changes Deprecations ~~~~~~~~~~~~ -- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) +- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated and will raise in a future version. 
Either convert the list to a tuple, or pass the slice directly instead (:issue:`31333`) -- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) +- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include ``datetime64`` and ``datetime64tz`` columns in a future version (:issue:`29941`) - Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) -- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) +- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` and will raise in a future version (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) - Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, :issue:`34741`). -- Indexing a series with a multi-dimensional indexer like ``[:, None]`` to return an ndarray now raises a ``FutureWarning``. Convert to a NumPy array before indexing instead (:issue:`27837`) +- Indexing a :class:`Series` with a multi-dimensional indexer like ``[:, None]`` to return an ``ndarray`` now raises a ``FutureWarning``. Convert to a NumPy array before indexing instead (:issue:`27837`) - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) -- Passing any arguments but the first one to :func:`read_html` as - positional arguments is deprecated since version 1.1. All other +- Passing any arguments but the first one to :func:`read_html` as + positional arguments is deprecated. All other arguments should be given as keyword arguments (:issue:`27573`). -- Passing any arguments but `path_or_buf` (the first one) to - :func:`read_json` as positional arguments is deprecated since - version 1.1. All other arguments should be given as keyword - arguments (:issue:`27573`). +- Passing any arguments but ``path_or_buf`` (the first one) to + :func:`read_json` as positional arguments is deprecated. All + other arguments should be given as keyword arguments (:issue:`27573`). -- Passing any arguments but the first 2 to :func:`read_excel` as - positional arguments is deprecated since version 1.1. All other +- Passing any arguments but the first two to :func:`read_excel` as + positional arguments is deprecated. All other arguments should be given as keyword arguments (:issue:`27573`). 
-- :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use `:func:pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) +- :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) - :meth:`Index.get_value` is deprecated and will be removed in a future version (:issue:`19728`) -- :meth:`Series.dt.week` and `Series.dt.weekofyear` are deprecated and will be removed in a future version, use :meth:`Series.dt.isocalendar().week` instead (:issue:`33595`) -- :meth:`DatetimeIndex.week` and `DatetimeIndex.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeIndex.isocalendar().week` instead (:issue:`33595`) -- :meth:`DatetimeArray.week` and `DatetimeArray.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeArray.isocalendar().week` instead (:issue:`33595`) +- :meth:`Series.dt.week` and :meth:`Series.dt.weekofyear` are deprecated and will be removed in a future version, use :meth:`Series.dt.isocalendar().week` instead (:issue:`33595`) +- :meth:`DatetimeIndex.week` and ``DatetimeIndex.weekofyear`` are deprecated and will be removed in a future version, use ``DatetimeIndex.isocalendar().week`` instead (:issue:`33595`) +- :meth:`DatetimeArray.week` and ``DatetimeArray.weekofyear`` are deprecated and will be removed in a future version, use ``DatetimeArray.isocalendar().week`` instead (:issue:`33595`) - :meth:`DateOffset.__call__` is deprecated and will be removed in a future version, use ``offset + other`` instead (:issue:`34171`) +- :meth:`~pandas.tseries.offsets.BusinessDay.apply_index` is deprecated and will be removed in a future version. Use ``offset + other`` instead (:issue:`34580`) - :meth:`DataFrame.tshift` and :meth:`Series.tshift` are deprecated and will be removed in a future version, use :meth:`DataFrame.shift` and :meth:`Series.shift` instead (:issue:`11631`) - Indexing an :class:`Index` object with a float key is deprecated, and will raise an ``IndexError`` in the future. You can manually convert to an integer key instead (:issue:`34191`). -- The ``squeeze`` keyword in the ``groupby`` function is deprecated and will be removed in a future version (:issue:`32380`) -- The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use `per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) +- The ``squeeze`` keyword in :meth:`~DataFrame.groupby` is deprecated and will be removed in a future version (:issue:`32380`) +- The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use ``per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) - :meth:`DatetimeIndex.to_perioddelta` is deprecated and will be removed in a future version. Use ``index - index.to_period(freq).to_timestamp()`` instead (:issue:`34853`) -- :meth:`util.testing.assert_almost_equal` now accepts both relative and absolute - precision through the ``rtol``, and ``atol`` parameters, thus deprecating the - ``check_less_precise`` parameter. (:issue:`13357`). 
-- :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`) +- :meth:`DataFrame.melt` accepting a ``value_name`` that already exists is deprecated, and will be removed in a future version (:issue:`34731`) +- The ``center`` keyword in the :meth:`DataFrame.expanding` function is deprecated and will be removed in a future version (:issue:`20647`) + + .. --------------------------------------------------------------------------- @@ -837,7 +850,7 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) -- Performance improvement in arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=1`` (:issue:`33600`) +- Performance improvement in arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=1`` (:issue:`33600`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) @@ -847,14 +860,14 @@ Performance improvements :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.GroupBy.first` and :meth:`~pandas.core.groupby.groupby.GroupBy.last` (:issue:`34178`) -- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). +- Performance improvement in :func:`factorize` for nullable (integer and Boolean) dtypes (:issue:`33064`). - Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`) - Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`) -- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). +- Performance improvement in reductions (``sum``, ``prod``, ``min``, ``max``) for nullable (integer and Boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) -- Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) -- Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) +- Performance improvement in arithmetic operations (``sub``, ``add``, ``mul``, ``div``) for :class:`MultiIndex` (:issue:`34297`) +- Performance improvement in ``DataFrame[bool_indexer]`` when ``bool_indexer`` is a ``list`` (:issue:`33924`) - Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added in various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`) ..
--------------------------------------------------------------------------- @@ -868,75 +881,93 @@ Bug fixes Categorical ^^^^^^^^^^^ +- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) +- Combining a :class:`Categorical` that has integer categories and contains missing values with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` will now result in a float column instead of an object dtype column (:issue:`33607`) - Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`) - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`) - Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`) - :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`) -- Repr of :class:`Categorical` was not distinguishing between int and str (:issue:`33676`) +- Repr of :class:`Categorical` was not distinguishing between ``int`` and ``str`` (:issue:`33676`) Datetimelike ^^^^^^^^^^^^ -- Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`) +- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) +- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) +- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`.
Previously an ``AttributeError`` was raised (:issue:`33327`) +- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) +- Bug in :class:`Timestamp` where constructing a :class:`Timestamp` from ambiguous epoch time and calling constructor again changed the :meth:`Timestamp.value` property (:issue:`24329`) - :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`) - Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`) - Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`) -- :class:`Timestamp` raising confusing error message when year, month or day is missing (:issue:`31200`) -- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`) +- :class:`Timestamp` raised a confusing error message when year, month or day is missing (:issue:`31200`) +- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtype inputs (:issue:`32668`) - Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`) - Bug where :meth:`PeriodIndex` raised when passed a :class:`Series` of strings (:issue:`26109`) -- Bug in :class:`Timestamp` arithmetic when adding or subtracting a ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`) -- Bug in :meth:`DatetimeIndex.to_period` not infering the frequency when called with no arguments (:issue:`33358`) -- Bug in :meth:`DatetimeIndex.tz_localize` incorrectly retaining ``freq`` in some cases where the original freq is no longer valid (:issue:`30511`) +- Bug in :class:`Timestamp` arithmetic when adding or subtracting an ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`) +- Bug in :meth:`DatetimeIndex.to_period` not inferring the frequency when called with no arguments (:issue:`33358`) +- Bug in :meth:`DatetimeIndex.tz_localize` incorrectly retaining ``freq`` in some cases where the original ``freq`` is no longer valid (:issue:`30511`) - Bug in :meth:`DatetimeIndex.intersection` losing ``freq`` and timezone in some cases (:issue:`33604`) - Bug in :meth:`DatetimeIndex.get_indexer` where incorrect output would be returned for mixed datetime-like targets (:issue:`33741`) - Bug in :class:`DatetimeIndex` addition and subtraction with some types of :class:`DateOffset` objects incorrectly retaining an invalid ``freq`` attribute (:issue:`33779`) - Bug in :class:`DatetimeIndex` where setting the ``freq`` attribute on an index could silently change the ``freq`` attribute on another index viewing the same data (:issue:`33552`) -- :meth:`DataFrame.min`/:meth:`DataFrame.max` not returning consistent result with :meth:`Series.min`/:meth:`Series.max` when called on objects initialized with empty :func:`pd.to_datetime` +- :meth:`DataFrame.min` and :meth:`DataFrame.max` were not returning consistent results with :meth:`Series.min` and :meth:`Series.max` when called on objects initialized with empty :func:`pd.to_datetime` - Bug in :meth:`DatetimeIndex.intersection` and :meth:`TimedeltaIndex.intersection` with results not having the correct ``name`` attribute (:issue:`33904`) - Bug in 
:meth:`DatetimeArray.__setitem__`, :meth:`TimedeltaArray.__setitem__`, :meth:`PeriodArray.__setitem__` incorrectly allowing values with ``int64`` dtype to be silently cast (:issue:`33717`) - Bug in subtracting :class:`TimedeltaIndex` from :class:`Period` incorrectly raising ``TypeError`` in some cases where it should succeed and ``IncompatibleFrequency`` in some cases where it should raise ``TypeError`` (:issue:`33883`) -- Bug in constructing a Series or Index from a read-only NumPy array with non-ns +- Bug in constructing a :class:`Series` or :class:`Index` from a read-only NumPy array with non-ns resolution which converted to object dtype instead of coercing to ``datetime64[ns]`` dtype when within the timestamp bounds (:issue:`34843`). - The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) +- Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of ``datetime64[ns, tz]`` dtype (:issue:`35038`) +- ``OutOfBoundsDatetime`` issues an improved error message when a timestamp is out of implementation bounds. (:issue:`32967`) +- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) +- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) +- Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) Timedelta ^^^^^^^^^ - Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`) -- Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta`` incorrectly returning ``NaT`` (:issue:`31869`) -- Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`) +- Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta` incorrectly returning ``NaT`` (:issue:`31869`) +- :class:`Timedelta` now understands ``µs`` as an identifier for microsecond (:issue:`32899`) - :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`) -- Bug in comparing a :class:`Timedelta`` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) +- Bug in comparing a :class:`Timedelta` object against an ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) - Bug in :func:`timedelta_range` that produced an extra point on an edge case (:issue:`30353`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that produced an extra point on an edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`) -- Bug in :class:`Timedelta` and `pandas.to_timedelta` that ignored `unit`-argument for string input (:issue:`12136`) +- Bug in :class:`Timedelta` and :func:`pandas.to_timedelta` that ignored the ``unit`` argument for string input (:issue:`12136`) Timezones ^^^^^^^^^ - Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g.
``UTC``) would not be parsed correctly (:issue:`33133`) -- Numeric ^^^^^^^ - Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31271`) -- Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) -- Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) +- Bug in :func:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) +- Bug in :func:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) +- Bug in arithmetic operations between :class:`DataFrame` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) +- Bug in :meth:`Index.difference` giving incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) +- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) +- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raise a ``ValueError`` if ``limit_direction`` is ``'forward'`` or ``'both'`` and ``method`` is ``'backfill'`` or ``'bfill'`` or ``limit_direction`` is ``'backward'`` or ``'both'`` and ``method`` is ``'pad'`` or ``'ffill'`` (:issue:`34746`) Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) - Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`) - Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`) +- The :class:`DataFrame` constructor no longer accepts a list of :class:`DataFrame` objects. Because of changes to NumPy, :class:`DataFrame` objects are now consistently treated as 2D objects, so a list of :class:`DataFrame` objects is considered 3D, and no longer acceptable for the :class:`DataFrame` constructor (:issue:`32289`). +- Bug in :class:`DataFrame` when initializing a frame with lists and assigning ``columns`` with a nested list for a ``MultiIndex`` (:issue:`32173`) +- Improved error message for invalid construction of list when creating a new index (:issue:`35190`) + Strings ^^^^^^^ @@ -944,15 +975,16 @@ Strings - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). - Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. (:issue:`31746`) - Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`) - +- :func:`pandas.api.types.is_string_dtype` no longer incorrectly identifies categorical series as string.
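A small illustration of the ``is_string_dtype`` fix noted above; treat this as a sketch of the corrected behaviour rather than an exhaustive specification:

.. code-block:: python

    import pandas as pd
    from pandas.api.types import is_string_dtype

    cat = pd.Series(["a", "b"], dtype="category")
    obj = pd.Series(["a", "b"])  # object dtype holding strings

    is_string_dtype(cat)  # now False; previously reported True
    is_string_dtype(obj)  # True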
Interval ^^^^^^^^ - Bug in :class:`IntervalArray` incorrectly allowing the underlying data to be changed when setting values (:issue:`32782`) -- Indexing ^^^^^^^^ + +- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. Previously an ``AttributeError`` was raised (:issue:`33610`) - Bug in slicing on a :class:`DatetimeIndex` with a partial-timestamp dropping high-resolution indices near the end of a year, quarter, or month (:issue:`31064`) - Bug in :meth:`PeriodIndex.get_loc` treating higher-resolution strings differently from :meth:`PeriodIndex.get_value` (:issue:`31172`) - Bug in :meth:`Series.at` and :meth:`DataFrame.at` not matching ``.loc`` behavior when looking up an integer in a :class:`Float64Index` (:issue:`31329`) @@ -963,39 +995,41 @@ Indexing - Bug in :meth:`DataFrame.at` when either columns or index is non-unique (:issue:`33041`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) -- Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) +- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) - Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. ``np.dtype`` (:issue:`32684`) -- Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`) +- Bug in :class:`Index` constructor where an unhelpful error message was raised for NumPy scalars (:issue:`33017`) - Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`) -- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`) - Bug in :class:`Interval` where a :class:`Timedelta` could not be added or subtracted from a :class:`Timestamp` interval (:issue:`32023`) -- Bug in :meth:`DataFrame.copy` _item_cache not invalidated after copy causes post-copy value updates to not be reflected (:issue:`31784`) +- Bug in :meth:`DataFrame.copy` not invalidating ``_item_cache`` after copy caused post-copy value updates to not be reflected (:issue:`31784`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` throwing an error when a ``datetime64[ns, tz]`` value is provided (:issue:`32395`) -- Bug in `Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) -- Bug in :meth:`DataFrame.iloc` when slicing a single column-:class:`DataFrame`` with ``ExtensionDtype`` (e.g.
``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) -- Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:33573`) +- Bug in :meth:`Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) +- Bug in :meth:`DataFrame.iloc` when slicing a single column :class:`DataFrame` with ``ExtensionDtype`` (e.g. ``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) +- Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:`33573`) - Bug in :meth:`Series.__setitem__` with an :class:`IntervalIndex` and a list-like key of integers (:issue:`33473`) - Bug in :meth:`Series.__getitem__` allowing missing labels with ``np.ndarray``, :class:`Index`, :class:`Series` indexers but not ``list``, these now all raise ``KeyError`` (:issue:`33646`) - Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` where index was assumed to be monotone increasing (:issue:`33756`) -- Indexing with a list of strings representing datetimes failed on :class:`DatetimeIndex` or :class:`PeriodIndex`(:issue:`11278`) +- Indexing with a list of strings representing datetimes failed on :class:`DatetimeIndex` or :class:`PeriodIndex` (:issue:`11278`) - Bug in :meth:`Series.at` when used with a :class:`MultiIndex` would raise an exception on valid inputs (:issue:`26989`) - Bug in :meth:`DataFrame.loc` with dictionary of values changes columns with dtype of ``int`` to ``float`` (:issue:`34573`) -- Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an IndexingError when accessing a None value (:issue:`34318`) +- Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an ``IndexingError`` when accessing a ``None`` value (:issue:`34318`) - Bug in :meth:`DataFrame.reset_index` and :meth:`Series.reset_index` would not preserve data types on an empty :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` (:issue:`19602`) - Bug in :class:`Series` and :class:`DataFrame` indexing with a ``time`` key on a :class:`DatetimeIndex` with ``NaT`` entries (:issue:`35114`) Missing ^^^^^^^ -- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). -- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`) -- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nulllable boolean dtype and with ``skipna=False`` (:issue:`33253`) -- Clarified documentation on interpolate with method =akima. The ``der`` parameter must be scalar or None (:issue:`33426`) -- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa.
Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) -- Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) -- passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``"<NA>"`` (:issue:`34740`) +- Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). +- Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`) +- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nullable Boolean dtype and with ``skipna=False`` (:issue:`33253`) +- Clarified documentation on interpolate with ``method=akima``. The ``der`` parameter must be scalar or ``None`` (:issue:`33426`) +- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns led to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` is identical to using these methods with :meth:`DataFrame.fillna` (:issue:`12918`, :issue:`29146`) +- Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ``ValueError``. The method is now independent of the type of the column names (:issue:`33956`) +- Passing :class:`NA` into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``"<NA>"`` (:issue:`34740`) +- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) MultiIndex ^^^^^^^^^^ + +- :meth:`DataFrame.swaplevel` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously an ``AttributeError`` was raised (:issue:`31126`) - Bug in :meth:`DataFrame.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) .. ipython:: python
(:issue:`30320`) -- `read_csv` will now raise a ``ValueError`` when the arguments `header` and `prefix` both are not `None`. (:issue:`27394`) +- Bug in :func:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`) +- :func:`read_csv` will now raise a ``ValueError`` when the arguments ``header`` and ``prefix`` both are not ``None``. (:issue:`27394`) - Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`) - Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). -- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) -- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- Bug in :func:`read_csv` was raising ``TypeError`` when ``sep=None`` was used in combination with ``comment`` keyword (:issue:`31396`) +- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a :class:`DataFrame` in Python 3 from fixed format written in Python 2 (:issue:`31750`) - :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) -- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`) -- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) -- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) -- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) -- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) -- Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns -- Bug in :meth:`read_sas` was raising an ``AttributeError`` when reading files from Google Cloud Storage (issue:`33069`) +- :func:`read_csv` will raise a ``ValueError`` when the column names passed in ``parse_dates`` are missing in the :class:`DataFrame` (:issue:`31251`) +- Bug in :func:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) +- Bug in :func:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) +- Bug in :func:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) +- Bug in :func:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) +- Bug in :func:`read_csv` was raising an ``IndexError`` when ``header=None`` and two extra data columns +- Bug in :func:`read_sas` was raising an ``AttributeError`` when reading files from Google Cloud Storage (:issue:`33069`) - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) -- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells.
(:issue:`32207`) -- Bug in :meth:`read_json` was raising ``TypeError`` when reading a list of booleans into a Series. (:issue:`31464`) -- Bug in :func:`pandas.io.json.json_normalize` where location specified by `record_path` doesn't point to an array. (:issue:`26284`) +- Bug in :func:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`) +- Bug in :func:`read_json` was raising ``TypeError`` when reading a ``list`` of Booleans into a :class:`Series`. (:issue:`31464`) +- Bug in :func:`pandas.io.json.json_normalize` where location specified by ``record_path`` doesn't point to an array. (:issue:`26284`) - :func:`pandas.read_hdf` has a more explicit error message when loading an unsupported HDF file (:issue:`9539`) -- Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) -- Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) -- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) -- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) -- :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) -- `TypeError` exceptions raised by :meth:`read_csv` and :meth:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`) -- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) -- Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) -- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) -- Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) -- :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. +- Bug in :func:`read_feather` was raising an ``ArrowIOError`` when reading an s3 or http file path (:issue:`29055`) +- Bug in :meth:`~DataFrame.to_excel` could not handle the column name ``render`` and was raising a ``KeyError`` (:issue:`34331`) +- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the ``%`` character and no parameters were present (:issue:`34211`) +- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator.
(:issue:`31544`) +- :meth:`HDFStore.keys` now has an optional ``include`` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) +- ``TypeError`` exceptions raised by :func:`read_csv` and :func:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`) +- Bug in :func:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) +- Bug in :func:`ujson.encode` was raising an ``OverflowError`` with numbers larger than ``sys.maxsize`` (:issue:`34395`) +- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the ``min_itemsize`` parameter is set (:issue:`11238`) +- :meth:`~HDFStore.create_table` now raises an error when the ``column`` argument was not specified in ``data_columns`` on input (:issue:`28156`) +- :func:`read_json` can now read a line-delimited JSON file from a file URL when ``lines`` and ``chunksize`` are set. - Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) -- Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) -- Bug in "meth"`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) +- Bug where capitalised file extensions were not decompressed by ``read_*`` functions (:issue:`35164`) +- Bug in :func:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` is given as a ``list`` (:issue:`31783`) +- Bug in :func:`read_excel` where datetime values are used in the header in a :class:`MultiIndex` (:issue:`34748`) +- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in the keyword argument ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in the keyword argument ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- Bug in :meth:`DataFrame.to_records` was incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) Plotting ^^^^^^^^ -- :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`). +- :meth:`DataFrame.plot` for line/bar now accepts colors provided as a dictionary (:issue:`8193`).
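A hedged sketch of the dictionary-of-colors usage mentioned in the entry above; the column names and colors are arbitrary examples, and matplotlib must be installed for plotting:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"sales": [3, 2, 5], "returns": [1, 1, 2]})

    # Colors can now be supplied per column as a dictionary
    ax = df.plot.bar(color={"sales": "green", "returns": "red"})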
- Bug in :meth:`DataFrame.plot.hist` where weights are not working for multiple columns (:issue:`33173`) -- Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`) +- Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``boxprops`` (:issue:`30346`) - Bug in :meth:`DataFrame.hist` where the order of ``column`` argument was ignored (:issue:`29235`) -- Bug in :meth:`DataFrame.plot.scatter` that when adding multiple plots with different ``cmap``, colorbars alway use the first ``cmap`` (:issue:`33389`) -- Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument `c` was assigned to a column containing color names (:issue:`34316`) +- Bug in :meth:`DataFrame.plot.scatter` where, when adding multiple plots with different ``cmap``, colorbars always used the first ``cmap`` (:issue:`33389`) +- Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument ``c`` was assigned to a column containing color names (:issue:`34316`) - Bug in :meth:`pandas.plotting.bootstrap_plot` was causing cluttered axes and overlapping labels (:issue:`34905`) +- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) -- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Using a :class:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :class:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) +- :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` (and similarly for :meth:`~DataFrameGroupBy.median`, :meth:`~DataFrameGroupBy.std` and :meth:`~DataFrameGroupBy.var`) now raise a ``TypeError`` if a non-accepted keyword argument is passed into it.
Previously an ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupBy.median`) (:issue:`31485`) +- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted, has duplicates, and the applied ``func`` does not mutate passed in objects (:issue:`30667`) +- Bug in :meth:`DataFrameGroupBy.transform` produces an incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`GroupBy.transform` was returning the wrong result when grouping by multiple keys of which some were categorical and others not (:issue:`32494`) -- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`) -- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) +- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by columns contain NaNs (:issue:`32841`) +- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean :class:`Series` (:issue:`32894`) - Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`) -- Bug in :meth:`SeriesGroupBy.quantile` raising on nullable integers (:issue:`33136`) +- Bug in :meth:`SeriesGroupBy.quantile` was raising on nullable integers (:issue:`33136`) - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is equally indexed - to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical - indices. In particular, the result index shape might change if a copy of the input would be returned. - The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) -- Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`) -- Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) +- Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of :class:`SeriesGroupBy` previously. The behaviour now allows only ``str`` and callables, and raises ``TypeError`` otherwise.
(:issue:`34422`) +- Bug in :meth:`DataFrame.groupby` lost the name of the :class:`Index` when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raised ``TypeError`` for non-numeric types rather than dropping the columns (:issue:`27892`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) -- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) +- Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). +- Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping ^^^^^^^^^ -- Bug effecting all numeric and boolean reduction methods not returning subclassed data type. (:issue:`25596`) -- Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) -- Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` can take tuple names in MultiIndexed data (:issue:`19966`) +- Bug affecting all numeric and Boolean reduction methods not returning subclassed data type. (:issue:`25596`) +- Bug in :meth:`DataFrame.pivot_table` when only :class:`MultiIndex` columns are set (:issue:`17038`) +- :meth:`DataFrame.unstack` and :meth:`Series.unstack` can now take tuple names in :class:`MultiIndex` data (:issue:`19966`) - Bug in :meth:`DataFrame.pivot_table` when ``margin`` is ``True`` and only ``column`` is defined (:issue:`31016`) -- Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) -- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) +- Fixed incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) +- Bug in :func:`crosstab` when inputs are two :class:`Series` and have tuple names, the output will keep a dummy :class:`MultiIndex` as columns.
(:issue:`18321`) - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) -- Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) -- :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) +- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30880`) +- Bug where :meth:`Index.astype` would lose the :attr:`name` attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) +- :meth:`Series.append` will now raise a ``TypeError`` when passed a :class:`DataFrame` or a sequence containing :class:`DataFrame` (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`) -- Bug on inplace operation of a Series that was adding a column to the DataFrame from where it was originally dropped from (using inplace=True) (:issue:`30484`) +- Bug on inplace operation of a :class:`Series` that was adding a column to the :class:`DataFrame` from where it was originally dropped from (using ``inplace=True``) (:issue:`30484`) - Bug in :meth:`DataFrame.apply` where callback was called with :class:`Series` parameter even though ``raw=True`` requested. (:issue:`32423`) - Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`) -- Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) -- :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) -- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) +- Bug in :func:`concat` where passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) +- :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`) +- Bug in :meth:`DataFrame.unstack` when :class:`MultiIndex` columns and :class:`MultiIndex` rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) +- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Bug in :meth:`DataFrame.corrwith()`, :meth:`DataFrame.memory_usage()`, :meth:`DataFrame.dot()`, :meth:`DataFrame.idxmin()`, :meth:`DataFrame.idxmax()`, :meth:`DataFrame.duplicated()`, :meth:`DataFrame.isin()`, :meth:`DataFrame.count()`, :meth:`Series.explode()`, :meth:`Series.asof()` and :meth:`DataFrame.asof()` not returning subclassed types.
(:issue:`31331`) -- Bug in :func:`concat` was not allowing for concatenation of ``DataFrame`` and ``Series`` with duplicate keys (:issue:`33654`) -- Bug in :func:`cut` raised an error when non-unique labels (:issue:`33141`) +- Bug in :func:`concat` was not allowing for concatenation of :class:`DataFrame` and :class:`Series` with duplicate keys (:issue:`33654`) +- Bug in :func:`cut` raised an error when the argument ``labels`` contains duplicates (:issue:`33141`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) -- Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`) -- Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) -- Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) -- Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) -- Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`) +- Bug in :meth:`DataFrame.aggregate` and :meth:`Series.aggregate` was causing a recursive loop in some cases (:issue:`34224`) +- Fixed bug in :func:`melt` where melting :class:`MultiIndex` columns with ``col_level > 0`` would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) +- Bug in :meth:`Series.where` with an empty :class:`Series` and empty ``cond`` having non-bool dtype (:issue:`34592`) +- Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements with ``S`` dtype (:issue:`34529`) Sparse ^^^^^^ - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) -- Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) -- Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) +- Bug in :meth:`Series.sum` with ``SparseArray`` raised a ``TypeError`` (:issue:`25777`) +- Bug where :class:`DataFrame` containing an all-sparse :class:`SparseArray` was filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute.
Previously it used ``fill_value``'s string representation (:issue:`34352`) - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) - Bug in :meth:`arrays.SparseArray` was returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`) @@ -1140,36 +1180,26 @@ ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) -- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) +- Fixed bug in :func:`concat` when concatenating :class:`DataFrame` objects with non-overlapping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) - Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`) - Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). - Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`) - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) -- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) -- Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable Boolean dtypes (:issue:`34051`) +- Fixed bug where the constructor of :class:`DataFrame` with ``dtype='string'`` would fail (:issue:`27953`, :issue:`33623`) - Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`) -- Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). +- Fixed bug in :meth:`IntegerArray.astype` to correctly copy the mask as well (:issue:`34931`). Other ^^^^^ -- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` - instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) + - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) -- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) -- Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) -- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) -- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). 
-- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) -- Getting a missing attribute in a query/eval string raises the correct ``AttributeError`` (:issue:`32408`) +- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if the ``left`` argument is a different subclass with ``check_series_type=True`` (:issue:`32670`). +- Getting a missing attribute in a :meth:`DataFrame.query` or :meth:`DataFrame.eval` string raises the correct ``AttributeError`` (:issue:`32408`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) -- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) -- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) -- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) -- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) -- Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) -- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) +- Bug in :meth:`DataFrame.equals` and :meth:`Series.equals` in allowing subclasses to be equal (:issue:`34402`). .. --------------------------------------------------------------------------- @@ -1178,3 +1208,4 @@ Other Contributors ~~~~~~~~~~~~ +.. contributors:: v1.0.5..v1.1.0|HEAD diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst new file mode 100644 index 0000000000000..77ea67f76f655 --- /dev/null +++ b/doc/source/whatsnew/v1.1.1.rst @@ -0,0 +1,56 @@ +.. _whatsnew_111: + +What's new in 1.1.1 (August 20, 2020) +------------------------------------- + +These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_111.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) +- Fixed regression in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) +- Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) +- Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) +- Fixed regression in ``.groupby(..).rolling(..)`` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`) +- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) +- Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression where :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) +- Fixed regression where :func:`pandas.merge_asof` would raise an ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) +- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- Bug in :class:`~pandas.io.formats.style.Styler` whereby ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtype`` was set to ``False`` (:issue:`35715`) +- Bug in :func:`to_timedelta` failing when ``arg`` is a :class:`Series` with ``Int64`` dtype containing null values (:issue:`35574`) +- Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when ``data`` and ``index`` have mismatched lengths (:issue:`33437`) + +..
+
+.. _whatsnew_111.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.1.0..v1.1.1
diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst
new file mode 100644
index 0000000000000..81b8e7df11625
--- /dev/null
+++ b/doc/source/whatsnew/v1.1.2.rst
@@ -0,0 +1,64 @@
+.. _whatsnew_112:
+
+What's new in 1.1.2 (September 8, 2020)
+---------------------------------------
+
+These are the changes in pandas 1.1.2. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_112.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`)
+- Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`)
+- Fix regression in :meth:`DataFrame.append` mixing tz-aware and tz-naive datetime columns (:issue:`35460`)
+- Performance regression for :meth:`RangeIndex.format` (:issue:`35712`)
+- Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`)
+- Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`)
+- Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`)
+- Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`)
+- Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`)
+- Fixed regression in :meth:`Series.groupby.rolling` where the number of levels of a :class:`MultiIndex` in the input was compressed to one (:issue:`36018`)
+- Fixed regression in :class:`DataFrameGroupBy` on an empty :class:`DataFrame` (:issue:`36197`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_112.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`)
+- Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`)
+- Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`)
+- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` not respecting the ``errors`` argument when set to ``"ignore"`` for extension dtypes (:issue:`35471`)
+- Bug in :meth:`DatetimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`)
+- Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`)
+- Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`)
+- Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`)
+- Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`)
+- Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`)
+- Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where the package name is different from the import name (:issue:`35948`)
+- Bug where setting an empty :class:`DataFrame` column to a :class:`Series` did not preserve the name of the index in the frame (:issue:`31368`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_112.other:
+
+Other
+~~~~~
+- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values; the ``dropna`` keyword, which was unintentionally exposed in the public API of :meth:`factorize` in version 1.1, has been removed (:issue:`35667`)
+- :meth:`DataFrame.plot` and :meth:`Series.plot` raise a ``UserWarning`` about usage of ``FixedFormatter`` and ``FixedLocator`` (:issue:`35684` and :issue:`35945`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_112.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.1.1..v1.1.2
diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst
new file mode 100644
index 0000000000000..e752eb54d0c15
--- /dev/null
+++ b/doc/source/whatsnew/v1.1.3.rst
@@ -0,0 +1,78 @@
+.. _whatsnew_113:
+
+What's new in 1.1.3 (October 5, 2020)
+-------------------------------------
+
+These are the changes in pandas 1.1.3. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+Enhancements
+~~~~~~~~~~~~
+
+Added support for new Python version
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas 1.1.3 now supports Python 3.9 (:issue:`36296`).
+
+Development Changes
+^^^^^^^^^^^^^^^^^^^
+
+- The minimum version of Cython is now the most recent bug-fix version (0.29.21) (:issue:`36296`).
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_113.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in :meth:`DataFrame.agg`, :meth:`DataFrame.apply`, :meth:`Series.agg`, and :meth:`Series.apply` where an internal suffix was exposed to the users when no relabelling was applied (:issue:`36189`)
+- Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`)
+- Fixed regression where adding a :func:`timedelta_range` to a :class:`Timestamp` raised a ``ValueError`` (:issue:`35897`)
+- Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`)
+- Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`)
+- Fixed regression where modulo of :class:`Index`, :class:`Series` and :class:`DataFrame` evaluated with ``numexpr`` used C rather than Python semantics (:issue:`36047`, :issue:`36526`)
+- Fixed regression in :func:`read_excel` with ``engine="odf"`` causing an ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`)
+- Fixed regression in :meth:`DataFrame.replace` where replacement was inconsistent when using a float (:issue:`35376`)
+- Fixed regression in :meth:`Series.loc` on a :class:`Series` with a :class:`MultiIndex` containing :class:`Timestamp` raising ``InvalidIndexError`` (:issue:`35858`)
+- Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`, :issue:`36377`)
+- Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and a user-function returning a string (:issue:`35940`)
+- Fixed regression where setting an empty :class:`DataFrame` column to a :class:`Series` did not preserve the name of the index in the frame (:issue:`36527`)
+- Fixed regression in :class:`Period` returning an incorrect ordinal for values over the maximum timestamp (:issue:`36430`)
+- Fixed regression where :func:`read_table` raised a ``ValueError`` when ``delim_whitespace`` was set to ``True`` (:issue:`35958`)
+- Fixed regression in :meth:`Series.dt.normalize` where normalizing pre-epoch dates shifted the result by one day (:issue:`36294`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_113.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`)
+- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating the ``na`` parameter (:issue:`36241`)
+- Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`)
+- Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts a column to categorical dtype (:issue:`36383`)
+- Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`)
+- Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`)
+- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`)
+- Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_113.other:
+
+Other
+~~~~~
+- Reverted an enhancement added in pandas 1.1.0 where :func:`timedelta_range` infers a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_113.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.1.2..v1.1.3
diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst
new file mode 100644
index 0000000000000..6353dbfafc9f1
--- /dev/null
+++ b/doc/source/whatsnew/v1.1.4.rst
@@ -0,0 +1,55 @@
+.. _whatsnew_114:
+
+What's new in 1.1.4 (October 30, 2020)
+--------------------------------------
+
+These are the changes in pandas 1.1.4. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in :func:`read_csv` raising a ``ValueError`` when ``names`` was of type ``dict_keys`` (:issue:`36928`)
+- Fixed regression in :func:`read_csv` with more than 1M rows and specifying an ``index_col`` argument (:issue:`37094`)
+- Fixed regression where attempting to mutate a :class:`DateOffset` object would no longer raise an ``AttributeError`` (:issue:`36940`)
+- Fixed regression where :meth:`DataFrame.agg` would fail with :exc:`TypeError` when passed positional arguments to be passed on to the aggregation function (:issue:`36948`).
+- Fixed regression in :class:`RollingGroupby` with ``sort=False`` not being respected (:issue:`36889`)
+- Fixed regression in :meth:`Series.astype` converting ``None`` to ``"nan"`` when casting to string (:issue:`36904`)
+- Fixed regression in :meth:`Series.rank` method failing for read-only data (:issue:`37290`)
+- Fixed regression in :class:`RollingGroupby` causing a segmentation fault with an :class:`Index` of dtype object (:issue:`36727`)
+- Fixed regression in :meth:`DataFrame.resample(...).apply(...)` raising an ``AttributeError`` when the input was a :class:`DataFrame` and only a :class:`Series` was evaluated (:issue:`36951`)
+- Fixed regression in ``DataFrame.groupby(..).std()`` with nullable integer dtype (:issue:`37415`)
+- Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
+- Fixed regression where slicing :class:`DatetimeIndex` raised :exc:`AssertionError` on irregular time series with ``pd.NaT`` or on unsorted indices (:issue:`36953` and :issue:`35509`)
+- Fixed regression in certain offsets (``pd.offsets.Day()`` and below) no longer being hashable (:issue:`37267`)
+- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)
+- Fixed regression in setitem with :meth:`DataFrame.iloc` which raised an error when trying to set a value while filtering with a boolean list (:issue:`36741`)
+- Fixed regression in setitem with a :class:`Series` getting aligned before setting the values (:issue:`37427`)
+- Fixed regression in :attr:`MultiIndex.is_monotonic_increasing` returning wrong results with ``NaN`` in at least one of the levels (:issue:`37220`)
+- Fixed regression in inplace arithmetic operation on a :class:`Series` not updating the parent :class:`DataFrame` (:issue:`36373`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`)
+- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` raising a ``ValueError`` when the target was read-only (:issue:`37174`)
+- Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`)
+- Bug in :meth:`DataFrame.info` raising a ``KeyError`` when the :class:`DataFrame` had integer column names (:issue:`37245`)
+- Bug in :meth:`DataFrameGroupBy.apply` dropping a :class:`CategoricalIndex` when it was grouped on (:issue:`35792`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_114.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.1.3..v1.1.4
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst
new file mode 100644
index 0000000000000..002e1f85f4127
--- /dev/null
+++ b/doc/source/whatsnew/v1.1.5.rst
@@ -0,0 +1,56 @@
+.. _whatsnew_115:
+
+What's new in 1.1.5 (December 07, 2020)
+---------------------------------------
+
+These are the changes in pandas 1.1.5. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_115.regressions:
+
+Fixed regressions
+~~~~~~~~~~~~~~~~~
+- Fixed regression in addition of a timedelta-like scalar to a :class:`DatetimeIndex` raising incorrectly (:issue:`37295`)
+- Fixed regression in :meth:`Series.groupby` raising when the :class:`Index` of the :class:`Series` had a tuple as its name (:issue:`37755`)
+- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when a one-dimensional tuple was given to select from a :class:`MultiIndex` (:issue:`37711`)
+- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with a NumPy-dtyped operand (:issue:`37910`)
+- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`)
+- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`)
+- Fixed regression in :class:`Index` constructor raising an ``AttributeError`` when passed a :class:`SparseArray` with datetime64 values (:issue:`35843`)
+- Fixed regression in :meth:`DataFrame.unstack` with columns with integer dtype (:issue:`37115`)
+- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
+- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
+- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
+- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
+- Fixed performance regression in ``df.groupby(..).rolling(..)`` (:issue:`38038`)
+- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)
+- Fixed regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` where ``None`` was considered a non-NA value (:issue:`38286`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_115.bug_fixes:
+
+Bug fixes
+~~~~~~~~~
+- Bug in PyTables methods in Python 3.9 (:issue:`38041`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_115.other:
+
+Other
+~~~~~
+- Only set ``-Werror`` as a compiler flag in the CI jobs (:issue:`33315`, :issue:`33314`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_115.contributors:
+
+Contributors
+~~~~~~~~~~~~
+
+.. contributors:: v1.1.4..v1.1.5|HEAD
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
new file mode 100644
index 0000000000000..af9219bc25931
--- /dev/null
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -0,0 +1,869 @@
+.. _whatsnew_120:
+
+What's new in 1.2.0 (??)
+------------------------
+
+These are the changes in pandas 1.2.0. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. warning::
+
+   The packages ``xlrd`` for reading Excel files and ``xlwt`` for
+   writing Excel files are no longer maintained. These are the only engines in pandas
+   that support the xls format.
+
+   Previously, the default argument ``engine=None`` to ``pd.read_excel``
+   would result in using the ``xlrd`` engine in many cases. If
+   ``openpyxl`` is installed,
+   many of these cases will now default to using the ``openpyxl`` engine.
+   See the :func:`read_excel` documentation for more details. Attempting to read
+   ``.xls`` files or specifying ``engine="xlrd"`` to ``pd.read_excel`` will not
+   raise a warning. However, users should be aware that ``xlrd`` is already
+   broken with certain package configurations, for example with Python 3.9
+   when ``defusedxml`` is installed, and
+   is anticipated to be unusable in the future.
+
+   Attempting to use the ``xlwt`` engine will raise a ``FutureWarning``
+   unless the option :attr:`io.excel.xls.writer` is set to ``"xlwt"``.
+   While this option is now deprecated and will also raise a ``FutureWarning``,
+   it can be globally set and the warning suppressed. Users are recommended to
+   write ``.xlsx`` files using the ``openpyxl`` engine instead.
+
+.. ---------------------------------------------------------------------------
+
+Enhancements
+~~~~~~~~~~~~
+
+.. _whatsnew_120.duplicate_labels:
+
+Optionally disallow duplicate labels
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`Series` and :class:`DataFrame` can now be created with the ``allows_duplicate_labels=False`` flag to
+control whether the index or columns can contain duplicate labels (:issue:`28394`). This can be used to
+prevent accidental introduction of duplicate labels, which can affect downstream operations.
+
+By default, duplicates continue to be allowed.
+
+.. code-block:: ipython
+
+    In [1]: pd.Series([1, 2], index=['a', 'a'])
+    Out[1]:
+    a    1
+    a    2
+    Length: 2, dtype: int64
+
+    In [2]: pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False)
+    ...
+    DuplicateLabelError: Index has duplicates.
+          positions
+    label
+    a        [0, 1]
+
+pandas will propagate the ``allows_duplicate_labels`` property through many operations.
+
+.. code-block:: ipython
+
+    In [3]: a = (
+       ...:     pd.Series([1, 2], index=['a', 'b'])
+       ...:     .set_flags(allows_duplicate_labels=False)
+       ...: )
+
+    In [4]: a
+    Out[4]:
+    a    1
+    b    2
+    Length: 2, dtype: int64
+
+    # An operation introducing duplicates
+    In [5]: a.reindex(['a', 'b', 'a'])
+    ...
+    DuplicateLabelError: Index has duplicates.
+          positions
+    label
+    a        [0, 2]
+
+    [1 rows x 1 columns]
+
+.. warning::
+
+   This is an experimental feature. Currently, many methods fail to
+   propagate the ``allows_duplicate_labels`` value. In future versions
+   it is expected that every method taking or returning one or more
+   DataFrame or Series objects will propagate ``allows_duplicate_labels``.
+
+See :ref:`duplicates` for more.
+
+The ``allows_duplicate_labels`` flag is stored in the new :attr:`DataFrame.flags`
+attribute. This stores global attributes that apply to the *pandas object*. This
+differs from :attr:`DataFrame.attrs`, which stores information that applies to
+the dataset.
+
+Passing arguments to fsspec backends
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Many read/write functions have acquired the ``storage_options`` optional argument,
+to pass a dictionary of parameters to the storage backend. This allows, for
+example, passing credentials to S3 and GCS storage. The details of what
+parameters can be passed to which backends can be found in the documentation
+of the individual storage backends (detailed from the fsspec docs for
+`builtin implementations`_ and linked to `external ones`_). See
+Section :ref:`io.remote`.
+
+:issue:`35655` added fsspec support (including ``storage_options``)
+for reading Excel files.
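+
+For example, credentials for a private S3 bucket can be supplied directly (a
+minimal sketch assuming the usual ``import pandas as pd``; the bucket name and
+credential placeholders are hypothetical, and the ``key``/``secret`` parameters
+are those understood by ``s3fs``):
+
+.. code-block:: python
+
+    # storage_options is forwarded to the fsspec-backed filesystem
+    # (here s3fs), which handles the actual S3 access.
+    df = pd.read_csv(
+        "s3://my-bucket/data.csv",
+        storage_options={"key": "<access key>", "secret": "<secret key>"},
+    )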
+
+.. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
+.. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
+
+.. _whatsnew_120.binary_handle_to_csv:
+
+Support for binary file handles in ``to_csv``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`)
+with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`).
+If pandas does not automatically detect whether the file handle is opened in binary or text mode,
+it is necessary to provide ``mode="wb"``.
+
+For example:
+
+.. ipython:: python
+
+    import io
+
+    data = pd.DataFrame([0, 1, 2])
+    buffer = io.BytesIO()
+    data.to_csv(buffer, encoding="utf-8", compression="gzip")
+
+Support for short caption and table position in ``to_latex``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.to_latex` now allows one to specify
+a floating table position (:issue:`35281`)
+and a short caption (:issue:`36267`).
+
+The keyword ``position`` has been added to set the position.
+
+.. ipython:: python
+
+    data = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    table = data.to_latex(position='ht')
+    print(table)
+
+Usage of the keyword ``caption`` has been extended.
+Besides taking a single string as an argument,
+one can optionally provide a tuple ``(full_caption, short_caption)``
+to add a short caption macro.
+
+.. ipython:: python
+
+    data = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    table = data.to_latex(caption=('the full long caption', 'short caption'))
+    print(table)
+
+.. _whatsnew_120.read_csv_table_precision_default:
+
+Change in default floating precision for ``read_csv`` and ``read_table``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For the C parsing engine, the functions :func:`read_csv` and :func:`read_table` previously defaulted to a parser that
+could read floating point numbers slightly incorrectly with respect to the last bit in precision.
+The option ``float_precision="high"`` has always been available to avoid this issue.
+Beginning with this version, the default is now to use the more accurate parser by making
+``float_precision=None`` correspond to the high precision parser, and the new option
+``float_precision="legacy"`` to use the legacy parser. The change to using the higher precision
+parser by default should have no impact on performance (:issue:`17154`).
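+
+The legacy behaviour remains available on request (a minimal sketch assuming
+the usual ``import pandas as pd``; any CSV containing a long floating point
+literal would do):
+
+.. code-block:: python
+
+    import io
+
+    # With the new default, the value below is parsed at full precision;
+    # float_precision="legacy" opts back into the previous parser.
+    csv = "a\n0.12345678901234567\n"
+    pd.read_csv(io.StringIO(csv), float_precision="legacy")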
+
+.. _whatsnew_120.floating:
+
+Experimental nullable data types for float data
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`.
+These are extension data types dedicated to floating point data that can hold the
+``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`).
+
+While the default float data type already supports missing values using ``np.nan``,
+these new data types use ``pd.NA`` (and its corresponding behaviour) as the missing
+value indicator, in line with the already existing nullable integer
+and boolean data types.
+
+One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is
+comparison operations:
+
+.. ipython:: python
+
+    # the default numpy float64 dtype
+    s1 = pd.Series([1.5, None])
+    s1
+    s1 > 1
+
+.. ipython:: python
+
+    # the new nullable float64 dtype
+    s2 = pd.Series([1.5, None], dtype="Float64")
+    s2
+    s2 > 1
+
+See the :ref:`missing_data.NA` doc section for more details on the behaviour
+when using the ``pd.NA`` missing value indicator.
+
+As shown above, the dtype can be specified using the "Float64" or "Float32"
+string (capitalized to distinguish it from the default "float64" data type).
+Alternatively, you can also use the dtype object:
+
+.. ipython:: python
+
+    pd.Series([1.5, None], dtype=pd.Float32Dtype())
+
+Operations with the existing integer or boolean nullable data types that
+give float results will now also use the nullable floating data types (:issue:`38178`).
+
+.. warning::
+
+   Experimental: the new floating data types are currently experimental, and their
+   behaviour or API may still change without warning. Especially the behaviour
+   regarding NaN (distinct from NA missing values) is subject to change.
+
+.. _whatsnew_120.index_name_preservation:
+
+Index/column name preservation when aggregating
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When aggregating using :func:`concat` or the :class:`DataFrame` constructor, pandas
+will now attempt to preserve index and column names whenever possible (:issue:`35847`).
+In the case where all inputs share a common name, this name will be assigned to the
+result. When the input names do not all agree, the result will be unnamed. Here is an
+example where the index name is preserved:
+
+.. ipython:: python
+
+    idx = pd.Index(range(5), name='abc')
+    ser = pd.Series(range(5, 10), index=idx)
+    pd.concat({'x': ser[1:], 'y': ser[:-1]}, axis=1)
+
+The same is true for :class:`MultiIndex`, but the logic is applied separately on a
+level-by-level basis.
+
+.. _whatsnew_120.groupby_ewm:
+
+Groupby supports EWM operations directly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`.DataFrameGroupBy` now supports exponentially weighted window operations directly (:issue:`16037`).
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)})
+    df
+    df.groupby('A').ewm(com=1.0).mean()
+
+Additionally ``mean`` supports execution via `Numba <https://numba.pydata.org/>`__ with
+the ``engine`` and ``engine_kwargs`` arguments. Numba must be installed as an optional dependency
+to use this feature.
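+
+A minimal sketch of the Numba-accelerated path (assumes the optional ``numba``
+package is installed; ``nopython`` is a standard Numba compilation option):
+
+.. code-block:: python
+
+    # Same grouped EWM mean as above, but compiled and executed via Numba.
+    df = pd.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': range(4)})
+    df.groupby('A').ewm(com=1.0).mean(
+        engine='numba',
+        engine_kwargs={'nopython': True},
+    )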
+
+.. _whatsnew_120.enhancements.other:
+
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+- Added ``day_of_week`` (compatibility alias ``dayofweek``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`)
+- Added ``day_of_year`` (compatibility alias ``dayofyear``) property to :class:`Timestamp`, :class:`.DatetimeIndex`, :class:`Period`, :class:`PeriodIndex` (:issue:`9605`)
+- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a Series or DataFrame (:issue:`28394`)
+- :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`)
+- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
+- :meth:`io.sql.get_schema` now supports a ``schema`` keyword argument that will add a schema into the create table statement (:issue:`28486`)
+- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
+- :meth:`DataFrame.hist` now supports time series (datetime) data (:issue:`32590`)
+- :meth:`.Styler.set_table_styles` now allows the direct styling of rows and columns and can be chained (:issue:`35607`)
+- :class:`.Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
+- :meth:`.Rolling.mean` and :meth:`.Rolling.sum` use Kahan summation to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
+- :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`)
+- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
+- Calling a NumPy ufunc on a ``DataFrame`` with extension types now preserves the extension types when possible (:issue:`23743`).
+- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
+- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
+- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
+- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
+  nullable dtypes that use ``pd.NA`` as missing value indicator where possible
+  for the resulting DataFrame (default is False, and only applicable for
+  ``engine="pyarrow"``) (:issue:`31242`)
+- Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`)
+- :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
+- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
+- :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
+- :class:`DataFrame` now supports the ``divmod`` operation (:issue:`37165`)
+- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)
+- :class:`.Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`)
+- :class:`.DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`)
+- :class:`Window` now supports all SciPy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`)
+- :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
+- :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
+- Added support for the ``min_count`` keyword for :meth:`DataFrame.groupby` and :meth:`DataFrame.resample` for functions ``min``, ``max``, ``first`` and ``last`` (:issue:`37821`, :issue:`37768`)
+- Improved error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`)
+- Improved numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`)
+- Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`)
+- Implemented method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`)
+- When :func:`read_csv`, :func:`read_sas` and :func:`read_json` are called with ``chunksize``/``iterator`` they can be used in a ``with`` statement as they return context-managers (:issue:`38225`)
+
+.. ---------------------------------------------------------------------------
+
+.. _whatsnew_120.notable_bug_fixes:
+
+Notable bug fixes
+~~~~~~~~~~~~~~~~~
+
+These are bug fixes that might have notable behavior changes.
+
+Consistency of DataFrame Reductions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+:meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True`` now
+determine whether to exclude object-dtype columns on a column-by-column basis,
+instead of checking if *all* object-dtype columns can be considered boolean.
+
+This prevents pathological behavior where applying the reduction on a subset
+of columns could result in a larger Series result. See :issue:`37799`.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object)
+    df["C"] = pd.Series([True, True])
+
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [5]: df.all(bool_only=True)
+    Out[5]:
+    C    True
+    dtype: bool
+
+    In [6]: df[["B", "C"]].all(bool_only=True)
+    Out[6]:
+    B    False
+    C    True
+    dtype: bool
+
+*New behavior*:
+
+.. ipython:: python
+
+    In [5]: df.all(bool_only=True)
+
+    In [6]: df[["B", "C"]].all(bool_only=True)
+
+
+Other DataFrame reductions with ``numeric_only=None`` will also avoid
+this pathological behavior (:issue:`37827`):
+
+.. ipython:: python
+
+    df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
+
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [3]: df.mean()
+    Out[3]: Series([], dtype: float64)
+
+    In [4]: df[["A"]].mean()
+    Out[4]:
+    A    1.0
+    dtype: float64
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.mean()
+
+    df[["A"]].mean()
+
+Moreover, DataFrame reductions with ``numeric_only=None`` will now be
+consistent with their Series counterparts. In particular, for
+reductions where the Series method raises ``TypeError``, the
+DataFrame reduction will now consider that column non-numeric
+instead of casting to a NumPy array which may have different semantics (:issue:`36076`,
+:issue:`28949`, :issue:`21020`).
+
+.. ipython:: python
+
+    ser = pd.Series([0, 1], dtype="category", name="A")
+    df = ser.to_frame()
+
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [5]: df.any()
+    Out[5]:
+    A    True
+    dtype: bool
+
+*New behavior*:
+
+.. ipython:: python
+
+    df.any()
+
+
+.. _whatsnew_120.api_breaking.python:
+
+Increased minimum version for Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`).
+
+.. _whatsnew_120.api_breaking.deps:
+
+Increased minimum versions for dependencies
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Some minimum supported versions of dependencies were updated (:issue:`35214`).
+If installed, we now require:
+
++-----------------+-----------------+----------+---------+
+| Package         | Minimum Version | Required | Changed |
++=================+=================+==========+=========+
+| numpy           | 1.16.5          | X        | X       |
++-----------------+-----------------+----------+---------+
+| pytz            | 2017.3          | X        | X       |
++-----------------+-----------------+----------+---------+
+| python-dateutil | 2.7.3           | X        |         |
++-----------------+-----------------+----------+---------+
+| bottleneck      | 1.2.1           |          |         |
++-----------------+-----------------+----------+---------+
+| numexpr         | 2.6.8           |          | X       |
++-----------------+-----------------+----------+---------+
+| pytest (dev)    | 5.0.1           |          | X       |
++-----------------+-----------------+----------+---------+
+| mypy (dev)      | 0.782           |          | X       |
++-----------------+-----------------+----------+---------+
+
+For optional libraries the general recommendation is to use the latest version.
+The following table lists the lowest version per library that is currently being tested
+throughout the development of pandas. Optional libraries below the lowest tested version
+may still work, but are not considered supported.
+ ++-----------------+-----------------+---------+ +| Package | Minimum Version | Changed | ++=================+=================+=========+ +| beautifulsoup4 | 4.6.0 | | ++-----------------+-----------------+---------+ +| fastparquet | 0.3.2 | | ++-----------------+-----------------+---------+ +| fsspec | 0.7.4 | | ++-----------------+-----------------+---------+ +| gcsfs | 0.6.0 | | ++-----------------+-----------------+---------+ +| lxml | 4.3.0 | X | ++-----------------+-----------------+---------+ +| matplotlib | 2.2.3 | X | ++-----------------+-----------------+---------+ +| numba | 0.46.0 | | ++-----------------+-----------------+---------+ +| openpyxl | 2.6.0 | X | ++-----------------+-----------------+---------+ +| pyarrow | 0.15.0 | X | ++-----------------+-----------------+---------+ +| pymysql | 0.7.11 | X | ++-----------------+-----------------+---------+ +| pytables | 3.5.1 | X | ++-----------------+-----------------+---------+ +| s3fs | 0.4.0 | | ++-----------------+-----------------+---------+ +| scipy | 1.2.0 | | ++-----------------+-----------------+---------+ +| sqlalchemy | 1.2.8 | X | ++-----------------+-----------------+---------+ +| xarray | 0.12.3 | X | ++-----------------+-----------------+---------+ +| xlrd | 1.2.0 | X | ++-----------------+-----------------+---------+ +| xlsxwriter | 1.0.2 | X | ++-----------------+-----------------+---------+ +| xlwt | 1.3.0 | X | ++-----------------+-----------------+---------+ +| pandas-gbq | 0.12.0 | | ++-----------------+-----------------+---------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + +.. _whatsnew_120.api.other: + +Other API changes +^^^^^^^^^^^^^^^^^ + +- Sorting in descending order is now stable for :meth:`Series.sort_values` and :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses. This will affect sort order when sorting a DataFrame on multiple columns, sorting with a key function that produces duplicates, or requesting the sorting index when using :meth:`Index.sort_values`. When using :meth:`Series.value_counts`, the count of missing values is no longer necessarily last in the list of duplicate counts. Instead, its position corresponds to the position in the original Series. When using :meth:`Index.sort_values` for DateTime-like :class:`Index` subclasses, NaTs ignored the ``na_position`` argument and were sorted to the beginning. Now they respect ``na_position``, the default being ``last``, same as other :class:`Index` subclasses. (:issue:`35992`) +- Passing an invalid ``fill_value`` to :meth:`Categorical.take`, :meth:`.DatetimeArray.take`, :meth:`TimedeltaArray.take`, or :meth:`PeriodArray.take` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- Passing an invalid ``fill_value`` to :meth:`Series.shift` with a ``CategoricalDtype`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- Passing an invalid value to :meth:`IntervalIndex.insert` or :meth:`CategoricalIndex.insert` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- Attempting to reindex a Series with a :class:`CategoricalIndex` with an invalid ``fill_value`` now raises a ``TypeError`` instead of a ``ValueError`` (:issue:`37733`) +- :meth:`CategoricalIndex.append` with an index that contains non-category values will now cast instead of raising ``TypeError`` (:issue:`38098`) + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_120.deprecations: + +Deprecations +~~~~~~~~~~~~ +- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) +- Deprecated parameter ``dtype`` of method :meth:`~Index.copy` for all :class:`Index` subclasses. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) +- Deprecated parameters ``levels`` and ``codes`` in :meth:`MultiIndex.copy`. Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) +- Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) +- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) +- The method :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) +- Deprecated indexing :class:`DataFrame` rows with a single datetime-like string as ``df[string]`` + (given the ambiguity whether it is indexing the rows or selecting a column), use + ``df.loc[string]`` instead (:issue:`36179`) +- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`.DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) +- Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) +- The default value of ``regex`` for :meth:`Series.str.replace` will change from ``True`` to ``False`` in a future release. In addition, single character regular expressions will *not* be treated as literal strings when ``regex=True`` is set. (:issue:`24804`) +- Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) +- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) +- Using "outer" ufuncs on DataFrames to return 4d ndarray is now deprecated. Convert to an ndarray first (:issue:`23743`) +- Deprecated slice-indexing on timezone-aware :class:`DatetimeIndex` with naive ``datetime`` objects, to match scalar indexing behavior (:issue:`36148`) +- :meth:`Index.ravel` returning a ``np.ndarray`` is deprecated, in the future this will return a view on the same index (:issue:`19956`) +- Deprecate use of strings denoting units with 'M', 'Y' or 'y' in :func:`~pandas.to_timedelta` (:issue:`36666`) +- :class:`Index` methods ``&``, ``|``, and ``^`` behaving as the set operations :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference`, respectively, are deprecated and in the future will behave as pointwise boolean operations matching :class:`Series` behavior. 
Use the named set methods instead (:issue:`36758`) +- :meth:`Categorical.is_dtype_equal` and :meth:`CategoricalIndex.is_dtype_equal` are deprecated, will be removed in a future version (:issue:`37545`) +- :meth:`Series.slice_shift` and :meth:`DataFrame.slice_shift` are deprecated, use :meth:`Series.shift` or :meth:`DataFrame.shift` instead (:issue:`37601`) +- Partial slicing on unordered :class:`.DatetimeIndex` objects with keys that are not in the index is deprecated and will be removed in a future version (:issue:`18531`) +- The ``how`` keyword in :meth:`PeriodIndex.astype` is deprecated and will be removed in a future version, use ``index.to_timestamp(how=how)`` instead (:issue:`37982`) +- Deprecated :meth:`Index.asi8` for :class:`Index` subclasses other than :class:`.DatetimeIndex`, :class:`.TimedeltaIndex`, and :class:`PeriodIndex` (:issue:`37877`) +- The ``inplace`` parameter of :meth:`Categorical.remove_unused_categories` is deprecated and will be removed in a future version (:issue:`37643`) +- The ``null_counts`` parameter of :meth:`DataFrame.info` is deprecated and replaced by ``show_counts``. It will be removed in a future version (:issue:`37999`) + +.. --------------------------------------------------------------------------- + + +.. _whatsnew_120.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`) +- Performance improvement in :meth:`.GroupBy.agg` with the ``numba`` engine (:issue:`35759`) +- Performance improvements when creating :meth:`Series.map` from a huge dictionary (:issue:`34717`) +- Performance improvement in :meth:`.GroupBy.transform` with the ``numba`` engine (:issue:`36240`) +- :class:`.Styler` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) +- Performance improvement in :func:`to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) +- Performance improvement in setting values on an :class:`IntervalArray` (:issue:`36310`) +- The internal index method :meth:`~Index._shallow_copy` now makes the new index and original index share cached attributes, + avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`) +- Performance improvement in :meth:`.RollingGroupby.count` (:issue:`35625`) +- Small performance decrease to :meth:`.Rolling.min` and :meth:`.Rolling.max` for fixed windows (:issue:`36567`) +- Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) +- Faster ``dir`` calls when the object has many index labels, e.g. ``dir(ser)`` (:issue:`37450`) +- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) +- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) +- Performance improvement in :meth:`DataFrame.groupby` for ``float`` ``dtype`` (:issue:`28303`), changes of the underlying hash-function can lead to changes in float based indexes sort ordering for ties (e.g. 
:meth:`Index.value_counts`) +- Performance improvement in :meth:`pd.isin` for inputs with more than 1e6 elements (:issue:`36611`) +- Performance improvement for :meth:`DataFrame.__setitem__` with list-like indexers (:issue:`37954`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_120.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ +- :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) +- Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) +- Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) +- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) +- Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with timezone-aware ``datetime64`` categories incorrectly dropping the timezone information instead of casting to object dtype (:issue:`38136`) + +Datetimelike +^^^^^^^^^^^^ +- Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`) +- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) +- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) +- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) +- Bug in :meth:`.DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`.DatetimeIndex` (:issue:`35690`) +- Bug in :meth:`.DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) +- Bug in :meth:`.DatetimeIndex.searchsorted`, :meth:`.TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or :class:`Period` dtype placement of ``NaT`` values being inconsistent with NumPy (:issue:`36176`, :issue:`36254`) +- Inconsistency in :class:`.DatetimeArray`, :class:`.TimedeltaArray`, and :class:`.PeriodArray` method ``__setitem__`` casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) +- Bug in :meth:`.DatetimeArray.take` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37356`) +- Bug in :class:`.DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) +- :class:`Timestamp` and :class:`.DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) +- Bug in :meth:`.DatetimeIndex.equals` and :meth:`.TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) +- :meth:`Series.to_json`, :meth:`DataFrame.to_json`, and :meth:`read_json` now implement timezone parsing when orient structure is ``table`` (:issue:`35973`) +- :meth:`astype` now attempts to convert to ``datetime64[ns, tz]`` directly from ``object`` with 
inferred timezone from string (:issue:`35973`)
+- Bug in :meth:`.TimedeltaIndex.sum` and :meth:`Series.sum` with ``timedelta64`` dtype on an empty index or series returning ``NaT`` instead of ``Timedelta(0)`` (:issue:`31751`)
+- Bug in :meth:`.DatetimeArray.shift` incorrectly allowing ``fill_value`` with a mismatched timezone (:issue:`37299`)
+- Bug in adding a :class:`.BusinessDay` with nonzero ``offset`` to a non-scalar other (:issue:`37457`)
+- Bug in :func:`to_datetime` with a read-only array incorrectly raising (:issue:`34857`)
+- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` incorrectly casting integers to datetimes (:issue:`36621`)
+- Bug in :meth:`Series.isin` with ``datetime64[ns]`` dtype and :meth:`.DatetimeIndex.isin` failing to consider timezone-aware and timezone-naive datetimes as always different (:issue:`35728`)
+- Bug in :meth:`Series.isin` with ``PeriodDtype`` dtype and :meth:`PeriodIndex.isin` failing to consider arguments with different ``PeriodDtype`` as always different (:issue:`37528`)
+- The :class:`Period` constructor now correctly handles nanoseconds in the ``value`` argument (:issue:`34621` and :issue:`17053`)
+
+Timedelta
+^^^^^^^^^
+- Bug in :class:`.TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`)
+- Bug in parsing of ISO 8601 durations in :class:`Timedelta` and :func:`to_datetime` (:issue:`29773`, :issue:`36204`)
+- Bug in :func:`to_timedelta` with a read-only array incorrectly raising (:issue:`34857`)
+- Bug in :class:`Timedelta` incorrectly truncating to the sub-second portion of a string input when it has precision higher than nanoseconds (:issue:`36738`)
+
+Timezones
+^^^^^^^^^
+
+- Bug in :func:`date_range` raising an ``AmbiguousTimeError`` for valid input with ``ambiguous=False`` (:issue:`35297`)
+- Bug in :meth:`Timestamp.replace` losing fold information (:issue:`37610`)
+
+
+Numeric
+^^^^^^^
+- Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`)
+- Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`)
+- Bug in :meth:`Series.equals` where a ``ValueError`` was raised when NumPy arrays were compared to scalars (:issue:`35267`)
+- Bug where performing arithmetic operations on two :class:`Series` that each have a :class:`.DatetimeIndex` with different timezones incorrectly changed those indexes (:issue:`33671`)
+- Bug in :mod:`pandas.testing` module functions when used with ``check_exact=False`` on complex numeric types (:issue:`28235`)
+- Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`)
+- Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`)
+- Bug in :class:`.IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`)
+- Bug in :class:`MultiIndex` comparison with tuple incorrectly treating tuple as array-like (:issue:`21517`)
+- Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`)
+- Bug in :class:`DataFrame` arithmetic ops incorrectly accepting keyword arguments (:issue:`36843`)
+- Bug in :class:`.IntervalArray` comparisons with :class:`Series` not returning Series (:issue:`36908`)
+- Bug in :class:`DataFrame` allowing 
arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`) +- Bug in :meth:`DataFrame.std` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) +- Bug in :meth:`DataFrame.idxmax` and :meth:`DataFrame.idxmin` with mixed dtypes incorrectly raising ``TypeError`` (:issue:`38195`) + +Conversion +^^^^^^^^^^ + +- Bug in :meth:`DataFrame.to_dict` with ``orient='records'`` now returns python native datetime objects for datetimelike columns (:issue:`21256`) +- Bug in :meth:`Series.astype` conversion from ``string`` to ``float`` raised in presence of ``pd.NA`` values (:issue:`37626`) +- + +Strings +^^^^^^^ +- Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`) +- Bug in :func:`to_numeric` raising a ``TypeError`` when attempting to convert a string dtype Series containing only numeric strings and ``NA`` (:issue:`37262`) +- + +Interval +^^^^^^^^ + +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`) +- Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`) +- Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`) +- Bug in :meth:`IntervalArray.astype` incorrectly dropping dtype information with a :class:`CategoricalDtype` object (:issue:`37984`) +- + +Indexing +^^^^^^^^ + +- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) +- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) +- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays are returned instead of ``intp``. (:issue:`36359`) +- Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. 
(:issue:`32334`) +- Bug in :meth:`DataFrame.reset_index` was incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`) +- Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`) +- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`) +- Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`) +- Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`) +- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`) +- Bug in :meth:`DataFrame.loc` returning empty result when indexer is a slice with negative step size (:issue:`38071`) +- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raises when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`) +- Bug in :meth:`DataFrame.loc` returned requested key plus missing values when ``loc`` was applied to single level from a :class:`MultiIndex` (:issue:`27104`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`) +- Bug in :meth:`DataFrame.loc.__setitem__` expanding an empty :class:`DataFrame` with mixed dtypes (:issue:`37932`) +- Bug in :meth:`DataFrame.xs` ignored ``droplevel=False`` for columns (:issue:`19056`) +- Bug in :meth:`DataFrame.reindex` raising ``IndexingError`` wrongly for empty DataFrame with ``tolerance`` not None or ``method="nearest"`` (:issue:`27315`) +- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) +- Bug on inserting a boolean label into a :class:`DataFrame` with a numeric :class:`Index` columns incorrectly casting to integer (:issue:`36319`) +- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`) +- Bug in :meth:`MultiIndex.drop` does not raise if labels are partially found (:issue:`37820`) +- Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`) +- Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`) +- Bug in :meth:`Series.at` returning :class:`Series` with one element instead of scalar when index is a :class:`MultiIndex` with one level (:issue:`38053`) +- Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) +- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) +- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` 
+
+Interval
+^^^^^^^^
+
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Interval` dtypes would be converted to object dtypes (:issue:`34871`)
+- Bug in :meth:`IntervalIndex.take` with negative indices and ``fill_value=None`` (:issue:`37330`)
+- Bug in :meth:`IntervalIndex.putmask` with datetime-like dtype incorrectly casting to object dtype (:issue:`37968`)
+- Bug in :meth:`IntervalArray.astype` incorrectly dropping dtype information with a :class:`CategoricalDtype` object (:issue:`37984`)
+
+Indexing
+^^^^^^^^
+
+- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__getitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`)
+- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order (:issue:`35584`)
+- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where ``int64`` arrays were returned instead of ``intp`` (:issue:`36359`)
+- Bug in :meth:`DataFrame.sort_index` where the parameter ``ascending``, passed as a list on a single-level index, gave the wrong result (:issue:`32334`)
+- Bug in :meth:`DataFrame.reset_index` incorrectly raising a ``ValueError`` for input with a :class:`MultiIndex` with missing values in a level with ``Categorical`` dtype (:issue:`24206`)
+- Bug in indexing with boolean masks on datetime-like values sometimes returning a view instead of a copy (:issue:`36210`)
+- Bug in :meth:`DataFrame.__getitem__` and :meth:`DataFrame.loc.__getitem__` with :class:`IntervalIndex` columns and a numeric indexer (:issue:`26490`)
+- Bug in :meth:`Series.loc.__getitem__` with a non-unique :class:`MultiIndex` and an empty-list indexer (:issue:`13691`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`MultiIndex` and a level named ``"0"`` (:issue:`37194`)
+- Bug in :meth:`Series.__getitem__` when using an unsigned integer array as an indexer giving incorrect results or segfaulting instead of raising ``KeyError`` (:issue:`37218`)
+- Bug in :meth:`Index.where` incorrectly casting numeric values to strings (:issue:`37591`)
+- Bug in :meth:`DataFrame.loc` returning an empty result when the indexer was a slice with a negative step size (:issue:`38071`)
+- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` raising when the index was of ``object`` dtype and the given numeric label was in the index (:issue:`26491`)
+- Bug in :meth:`DataFrame.loc` returning the requested key plus missing values when ``loc`` was applied to a single level of a :class:`MultiIndex` (:issue:`27104`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing NA values (:issue:`37722`)
+- Bug in :meth:`DataFrame.loc.__setitem__` expanding an empty :class:`DataFrame` with mixed dtypes (:issue:`37932`)
+- Bug in :meth:`DataFrame.xs` ignoring ``drop_level=False`` for columns (:issue:`19056`); see the example below
+- Bug in :meth:`DataFrame.reindex` wrongly raising ``IndexingError`` for an empty DataFrame with ``tolerance`` not ``None`` or ``method="nearest"`` (:issue:`27315`)
+- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using a listlike indexer containing elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`)
+- Bug when inserting a boolean label into a :class:`DataFrame` with numeric :class:`Index` columns, incorrectly casting to integer (:issue:`36319`)
+- Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`)
+- Bug in :meth:`MultiIndex.drop` not raising if labels were only partially found (:issue:`37820`)
+- Bug in :meth:`DataFrame.loc` not raising ``KeyError`` when a missing combination was given with ``slice(None)`` for the remaining levels (:issue:`19556`)
+- Bug in :meth:`DataFrame.loc` raising ``TypeError`` when a non-integer slice was given to select values from a :class:`MultiIndex` (:issue:`25165`, :issue:`24263`)
+- Bug in :meth:`Series.at` returning a :class:`Series` with one element instead of a scalar when the index is a :class:`MultiIndex` with one level (:issue:`38053`)
+- Bug in :meth:`DataFrame.loc` returning and assigning elements in the wrong order when the indexer was ordered differently than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`)
+- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were a :class:`MultiIndex` with only one level (:issue:`29749`)
+- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising a blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`)
+- Bug in setting a new label on a :class:`DataFrame` or :class:`Series` with a :class:`CategoricalIndex` incorrectly raising ``TypeError`` when the new label was not among the index's categories (:issue:`38098`)
+- Bug in :meth:`Series.loc` and :meth:`Series.iloc` raising ``ValueError`` when inserting a listlike ``np.array``, ``list`` or ``tuple`` in an ``object`` Series of equal length (:issue:`37748`, :issue:`37486`)
+- Bug in :meth:`Series.loc` and :meth:`Series.iloc` setting all the values of an ``object`` Series with those of a listlike ``ExtensionArray`` instead of inserting it (:issue:`38271`)
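+
+A short sketch of the :meth:`DataFrame.xs` fix above (hypothetical frame):
+
+.. code-block:: python
+
+   cols = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
+   df = pd.DataFrame([[1, 2]], columns=cols)
+   df.xs("a", axis=1, drop_level=False)  # the outer column level is now kept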
+
+Missing
+^^^^^^^
+
+- Bug in :meth:`.SeriesGroupBy.transform`, which now correctly handles missing values for ``dropna=False`` (:issue:`35014`)
+- Bug in :meth:`Series.nunique` with ``dropna=True`` returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`)
+- Bug in :meth:`Series.interpolate` where the kwargs ``limit_area`` and ``limit_direction`` had no effect when using the methods ``pad`` and ``backfill`` (:issue:`31048`); see the example below
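+
+A minimal sketch of the ``limit_area`` fix above (illustrative values, assuming the usual ``pd``/``np`` imports):
+
+.. code-block:: python
+
+   s = pd.Series([np.nan, 1, np.nan, 3, np.nan])
+   s.interpolate(method="pad", limit_area="inside")  # now fills only the interior NaN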
+
+MultiIndex
+^^^^^^^^^^
+
+- Bug in :meth:`DataFrame.xs` raising ``TypeError`` with the message ``"Expected label or tuple of labels"`` when used with :class:`IndexSlice` (:issue:`35301`)
+- Bug in :meth:`DataFrame.reset_index` raising ``ValueError`` with the message ``"cannot convert float NaN to integer"`` for ``NaT`` values in the index (:issue:`36541`)
+- Bug in :meth:`DataFrame.combine_first` raising ``TypeError`` when used with a :class:`MultiIndex` containing string and ``NaN`` values (:issue:`36562`)
+- Bug in :meth:`MultiIndex.drop` dropping ``NaN`` values when a non-existent key was given as input (:issue:`18853`)
+- Bug in :meth:`MultiIndex.drop` dropping more values than expected when the index had duplicates and was not sorted (:issue:`33494`)
+
+I/O
+^^^
+
+- :func:`read_sas` no longer leaks resources on failure (:issue:`35566`)
+- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causing a ``ValueError`` when called with a filename in combination with a ``mode`` containing a ``b`` (:issue:`35058`)
+- Bug in :func:`read_csv` with ``float_precision='round_trip'`` not handling the ``decimal`` and ``thousands`` parameters (:issue:`35365`); see the example below
+- :func:`to_pickle` and :func:`read_pickle` were closing user-provided file objects (:issue:`35679`)
+- :func:`to_csv` now always passes compression arguments for ``'gzip'`` to ``gzip.GzipFile`` (:issue:`28103`)
+- :func:`to_csv` did not support zip compression for binary file objects without a filename (:issue:`35058`)
+- :func:`to_csv` and :func:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, :issue:`32392`)
+- :meth:`DataFrame.to_pickle`, :meth:`Series.to_pickle`, and :func:`read_pickle` did not support compression for file objects (:issue:`26237`, :issue:`29054`, :issue:`29570`)
+- Bug in :func:`LongTableBuilder.middle_separator` duplicating LaTeX longtable entries in the List of Tables of a LaTeX document (:issue:`34360`)
+- Bug in :func:`read_csv` with ``engine='python'`` truncating data when multiple items were present in the first row and the first element started with a BOM (:issue:`36343`)
+- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`)
+- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :func:`read_hdf` (:issue:`24839`)
+- Bug in :func:`read_table` and :func:`read_csv` when ``delim_whitespace=True`` and ``sep=default`` (:issue:`36583`)
+- Bug in :meth:`DataFrame.to_json` and :meth:`Series.to_json` with ``lines=True`` and ``orient='records'``, where the last record was not terminated with a newline character (:issue:`36888`)
+- Bug in :func:`read_parquet` with fixed offset timezones, where the string representation of a timezone was not recognized (:issue:`35997`, :issue:`36004`)
+- Bug in :meth:`DataFrame.to_html`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` ignoring the ``na_rep`` argument when ``float_format`` was also specified (:issue:`9046`, :issue:`13828`)
+- Bug in output rendering of complex numbers showing too many trailing zeros (:issue:`36799`)
+- Bug where :class:`HDFStore` threw a ``TypeError`` when exporting an empty DataFrame with ``datetime64[ns, tz]`` dtypes to a fixed HDF5 store (:issue:`20594`)
+- Bug where :class:`HDFStore` was dropping timezone information when exporting a Series with ``datetime64[ns, tz]`` dtypes to a fixed HDF5 store (:issue:`20594`)
+- :func:`read_csv` was closing user-provided binary file handles when ``engine="c"`` and an ``encoding`` was requested (:issue:`36980`)
+- Bug in :meth:`DataFrame.to_hdf` not dropping missing rows with ``dropna=True`` (:issue:`35719`)
+- Bug in :func:`read_html` raising a ``TypeError`` when supplying a ``pathlib.Path`` argument to the ``io`` parameter (:issue:`37705`)
+- :meth:`DataFrame.to_excel`, :meth:`Series.to_excel`, :meth:`DataFrame.to_markdown`, and :meth:`Series.to_markdown` now support writing to fsspec URLs such as S3 and Google Cloud Storage (:issue:`33987`)
+- Bug in :func:`read_fwf` with ``skip_blank_lines=True`` not skipping blank lines (:issue:`37758`)
+- :func:`read_json` with ``dtype=False`` now parses missing values as ``NaN`` instead of ``None`` (:issue:`28501`)
+- :func:`read_fwf` was inferring compression with ``compression=None``, which was not consistent with the other ``read_*`` functions (:issue:`37909`)
+- :meth:`DataFrame.to_html` was ignoring the ``formatters`` argument for ``ExtensionDtype`` columns (:issue:`36525`)
+- Bumped minimum xarray version to 0.12.3 to avoid reference to the removed ``Panel`` class (:issue:`27101`)
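+
+A minimal sketch of the ``float_precision='round_trip'`` fix above (illustrative data):
+
+.. code-block:: python
+
+   from io import StringIO
+
+   data = "a;b\n1,5;2,25\n"
+   pd.read_csv(StringIO(data), sep=";", decimal=",", float_precision="round_trip")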
+
+Period
+^^^^^^
+
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` where :class:`Period` dtypes would be converted to object dtypes (:issue:`34871`)
+
+Plotting
+^^^^^^^^
+
+- Bug in :meth:`DataFrame.plot` rotating xticklabels when ``subplots=True``, even if the x-axis was not an irregular time series (:issue:`29460`)
+- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes caused a ``ValueError`` (:issue:`21003`)
+- Bug in :meth:`DataFrame.plot.bar` and :meth:`Series.plot.bar` where tick positions were assigned by value order instead of using the actual value for numeric data, or a smart ordering for strings (:issue:`26186`, :issue:`11465`)
+- Bug where twinned axes lost their tick labels, which should only happen for all but the last row or column of 'externally' shared axes (:issue:`33819`)
+- Bug in :meth:`Series.plot` and :meth:`DataFrame.plot` throwing a :exc:`ValueError` when the Series or DataFrame was
+  indexed by a :class:`.TimedeltaIndex` with a fixed frequency and the x-axis lower limit was greater than the upper limit (:issue:`37454`)
+- Bug in :meth:`.DataFrameGroupBy.boxplot` raising a ``KeyError`` when ``subplots=False`` (:issue:`16748`)
+- Bug in :meth:`DataFrame.plot` and :meth:`Series.plot` overwriting matplotlib's shared y axes behaviour when no ``sharey`` parameter was passed (:issue:`37942`)
+- Bug in :meth:`DataFrame.plot` raising a ``TypeError`` with ``ExtensionDtype`` columns (:issue:`32073`)
+
+Groupby/resample/rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Bug in :meth:`.DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``; now returning ``0`` (:issue:`35028`)
+- Bug in :meth:`.DataFrameGroupBy.apply` that would sometimes throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
+- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`)
+- Bug when combining :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
+- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`)
+- Bug where subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']]``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
+- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
+- Bug in :meth:`DataFrame.groupby` not always maintaining the column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
+- Bug in :meth:`.DataFrameGroupBy.apply` raising an error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
+- Bug in :meth:`.Rolling.sum` returning wrong values when dtypes were mixed between float and integer and ``axis=1`` (:issue:`20649`, :issue:`35596`)
+- Bug in :meth:`.Rolling.count` returning ``np.nan`` with :class:`~pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in the window (:issue:`35579`)
+- Bug where :class:`pandas.core.window.Rolling` produced incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`)
+- Bug in :meth:`.DataFrameGroupBy.ffill` and :meth:`.DataFrameGroupBy.bfill` where a ``NaN`` group would return filled values instead of ``NaN`` when ``dropna=True`` (:issue:`34725`)
+- Bug in :meth:`.RollingGroupby.count` where a ``ValueError`` was raised when specifying the ``closed`` parameter (:issue:`35869`)
+- Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with a partial centered window (:issue:`36040`)
+- Bug in :meth:`.DataFrameGroupBy.rolling` returning wrong values with a time-aware window containing ``NaN``; this now raises ``ValueError`` because such windows are not monotonic (:issue:`34617`)
+- Bug in :meth:`.Rolling.__iter__` where a ``ValueError`` was not raised when ``min_periods`` was larger than ``window`` (:issue:`37156`)
+- :meth:`.Rolling.corr` now uses :meth:`.Rolling.var` instead of :meth:`.Rolling.std`, avoiding numerical issues when :meth:`.Rolling.var` is still within floating point precision while :meth:`.Rolling.std` is not (:issue:`31286`)
+- Bug in :meth:`.DataFrameGroupBy.quantile` and :meth:`.Resampler.quantile` raising ``TypeError`` when values were of type ``Timedelta`` (:issue:`29485`)
+- Bug in :meth:`.Rolling.median` and :meth:`.Rolling.quantile` returning wrong values for :class:`.BaseIndexer` subclasses with non-monotonic starting or ending points for windows (:issue:`37153`)
+- Bug in :meth:`DataFrame.groupby` dropping ``nan`` groups from the result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`); see the example below
+- Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` raising when used with ``axis=1`` (:issue:`9772`)
+- Bug in :meth:`.DataFrameGroupBy.transform` raising when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)
+- Bug in :meth:`.DataFrameGroupBy.apply` dropping values on a ``nan`` group when returning the same axes as the original frame (:issue:`38227`)
+- Bug in :meth:`.DataFrameGroupBy.quantile` that could not handle arraylike ``q`` when grouping by columns (:issue:`33795`)
+- Bug in :meth:`.DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`)
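+
+A minimal sketch of the single-column ``dropna=False`` fix above (illustrative values):
+
+.. code-block:: python
+
+   df = pd.DataFrame({"a": [1, 1, np.nan], "b": [1, 2, 3]})
+   df.groupby("a", dropna=False).sum()  # the NaN group now appears in the result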
+
+Reshaping
+^^^^^^^^^
+
+- Bug in :func:`crosstab` returning incorrect results on inputs with duplicate row names, duplicate column names, or duplicate names between row and column labels (:issue:`22529`)
+- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``; now returning ``0`` (:issue:`31422`)
+- Bug in :func:`concat` and the :class:`DataFrame` constructor where input index names were not preserved in some cases (:issue:`13475`)
+- Bug in :func:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`)
+- Bug in :meth:`DataFrame.stack` where stacking an empty :class:`DataFrame` would raise an error; it now returns an empty :class:`Series` with an empty :class:`MultiIndex` (:issue:`36113`)
+- Bug in :meth:`Series.unstack`; unstacking a :class:`Series` with a single level of :class:`Index` now raises a ``ValueError`` (:issue:`36113`)
+- Bug in :meth:`DataFrame.agg` with ``func={'name':<FUNC>}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`)
+- Bug in :meth:`Series.transform` giving incorrect results or raising when the argument ``func`` was a dictionary (:issue:`35811`)
+- Bug in :meth:`DataFrame.pivot` not preserving :class:`MultiIndex` level names for columns when rows and columns were both multiindexed (:issue:`36360`)
+- Bug in :meth:`DataFrame.pivot` modifying the ``index`` argument when ``columns`` was passed but ``values`` was not (:issue:`37635`)
+- Bug in :meth:`DataFrame.join` returning a non-deterministic level order for the resulting :class:`MultiIndex` (:issue:`36910`)
+- Bug in :meth:`DataFrame.combine_first` causing wrong alignment with dtype ``string`` and one level of a ``MultiIndex`` containing only ``NA`` (:issue:`37591`)
+- Fixed regression in :func:`merge` when merging a :class:`.DatetimeIndex` with an empty DataFrame (:issue:`36895`)
+- Bug in :meth:`DataFrame.apply` not setting the index of the return value when the ``func`` return type was ``dict`` (:issue:`37544`)
+- Bug in :func:`concat` resulting in a ``ValueError`` when at least one of the two inputs had a non-unique index (:issue:`36263`)
+- Bug in :meth:`DataFrame.merge` and :func:`merge` returning inconsistent ordering in the result for ``how=right`` and ``how=left`` (:issue:`35382`)
+- Bug in :func:`merge_ordered` that could not handle list-like ``left_by`` or ``right_by`` (:issue:`35269`)
+- Bug in :func:`merge_ordered` returning a wrong join result when the length of ``left_by`` or ``right_by`` equalled the number of rows of ``left`` or ``right`` (:issue:`38166`)
+- Bug in :func:`merge_ordered` not raising when elements in ``left_by`` or ``right_by`` did not exist in the ``left`` or ``right`` columns (:issue:`38167`)
+- Bug in :meth:`DataFrame.drop_duplicates` not validating bool dtype for the ``ignore_index`` keyword (:issue:`38274`)
+
+ExtensionArray
+^^^^^^^^^^^^^^
+
+- Fixed bug where a :class:`DataFrame` column set to a scalar extension type via a dict instantiation was considered an object type rather than the extension type (:issue:`35965`)
+- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`)
+- Fixed bug where applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returned ``None`` (:issue:`36913`)
+- Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature with those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`)
+- Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`)
+- Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`)
+
+Other
+^^^^^
+
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising an ``AssertionError`` instead of a ``ValueError`` when invalid parameter combinations were passed (:issue:`36045`)
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`)
+- Fixed metadata propagation in :meth:`Series.abs` and ufuncs called on Series and DataFrames (:issue:`28283`)
+- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly casting from ``PeriodDtype`` to object dtype
(:issue:`34871`) +- Fixed bug in metadata propagation incorrectly copying DataFrame columns as metadata when the column name overlaps with the metadata name (:issue:`37037`) +- Fixed metadata propagation in the :class:`Series.dt`, :class:`Series.str` accessors, :class:`DataFrame.duplicated`, :class:`DataFrame.stack`, :class:`DataFrame.unstack`, :class:`DataFrame.pivot`, :class:`DataFrame.append`, :class:`DataFrame.diff`, :class:`DataFrame.applymap` and :class:`DataFrame.update` methods (:issue:`28283`, :issue:`37381`) +- Fixed metadata propagation when selecting columns with ``DataFrame.__getitem__`` (:issue:`28283`) +- Bug in :meth:`Index.intersection` with non-:class:`Index` failing to set the correct name on the returned :class:`Index` (:issue:`38111`) +- Bug in :meth:`RangeIndex.intersection` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38197`) +- Bug in :meth:`Index.difference` failing to set the correct name on the returned :class:`Index` in some corner cases (:issue:`38268`) +- Bug in :meth:`Index.union` behaving differently depending on whether operand is an :class:`Index` or other list-like (:issue:`36384`) +- Bug in :meth:`Index.intersection` with non-matching numeric dtypes casting to ``object`` dtype instead of minimal common dtype (:issue:`38122`) +- Bug in :meth:`IntervalIndex.union` returning an incorrectly-typed :class:`Index` when empty (:issue:`38282`) +- Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError`` rather than a bare ``Exception`` (:issue:`35744`) +- Bug in ``dir`` where ``dir(obj)`` wouldn't show attributes defined on the instance for pandas objects (:issue:`37173`) +- Bug in :meth:`Index.drop` raising ``InvalidIndexError`` when index has duplicates (:issue:`38051`) +- Bug in :meth:`RangeIndex.difference` returning :class:`Int64Index` in some cases where it should return :class:`RangeIndex` (:issue:`38028`) +- Fixed bug in :func:`assert_series_equal` when comparing a datetime-like array with an equivalent non extension dtype array (:issue:`37609`) + + + +.. --------------------------------------------------------------------------- + +.. _whatsnew_120.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.5..v1.2.0|HEAD diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst new file mode 100644 index 0000000000000..90f611c55e710 --- /dev/null +++ b/doc/source/whatsnew/v1.3.0.rst @@ -0,0 +1,199 @@ +.. _whatsnew_130: + +What's new in 1.3.0 (??) +------------------------ + +These are the changes in pandas 1.3.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +Enhancements +~~~~~~~~~~~~ + + +.. _whatsnew_130.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_130.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ + +These are bug fixes that might have notable behavior changes. + + + +.. _whatsnew_130.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. _whatsnew_130.api.other: + +Other API changes +^^^^^^^^^^^^^^^^^ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_130.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- +- + +.. 
--------------------------------------------------------------------------- + + +.. _whatsnew_130.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_130.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- +- + +Categorical +^^^^^^^^^^^ + +- +- + +Datetimelike +^^^^^^^^^^^^ + +- +- + +Timedelta +^^^^^^^^^ + +- +- + +Timezones +^^^^^^^^^ + +- +- + +Numeric +^^^^^^^ +- Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`) +- +- + +Conversion +^^^^^^^^^^ +- +- + +Strings +^^^^^^^ + +- +- + +Interval +^^^^^^^^ + +- +- + +Indexing +^^^^^^^^ + +- +- + +Missing +^^^^^^^ + +- +- + +MultiIndex +^^^^^^^^^^ + +- +- + +I/O +^^^ + +- +- + +Period +^^^^^^ + +- +- + +Plotting +^^^^^^^^ + +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- +- + +Reshaping +^^^^^^^^^ + +- +- + +Sparse +^^^^^^ + +- +- + +ExtensionArray +^^^^^^^^^^^^^^ + +- +- + +Other +^^^^^ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_130.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/doc/sphinxext/README.rst b/doc/sphinxext/README.rst index 2be5372bc0216..8f0f4a8b2636d 100644 --- a/doc/sphinxext/README.rst +++ b/doc/sphinxext/README.rst @@ -7,7 +7,7 @@ pandas documentation. These copies originate from other projects: - ``numpydoc`` - Numpy's Sphinx extensions: this can be found at its own repository: https://github.com/numpy/numpydoc - ``ipython_directive`` and ``ipython_console_highlighting`` in the folder - `ipython_sphinxext` - Sphinx extensions from IPython: these are included + ``ipython_sphinxext`` - Sphinx extensions from IPython: these are included in IPython: https://github.com/ipython/ipython/tree/master/IPython/sphinxext .. note:: diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 9c175e4e58b45..2ec0b515ea95c 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- encoding:utf-8 -*- """ Script to generate contributor and pull request lists diff --git a/environment.yml b/environment.yml index 32ff8c91cb69c..b99b856187fb6 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: # required - - numpy>=1.15 + - numpy>=1.16.5 - python=3 - python-dateutil>=2.7.3 - pytz @@ -12,22 +12,26 @@ dependencies: - asv # building - - cython>=0.29.16 + # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. 
+ - c-compiler + - cxx-compiler + - cython>=0.29.21 # code checks - - black=19.10b0 + - black=20.8b1 - cpplint - - flake8<3.8.0 # temporary pin, GH#34150 + - flake8 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort=4.3.21 # check that imports are in the right order - - mypy=0.730 + - isort>=5.2.1 # check that imports are in the right order + - mypy=0.782 + - pre-commit - pycodestyle # used by flake8 + - pyupgrade # documentation - gitpython # obtain contributors from git for whatsnew - - gitdb2=2.0.6 # GH-32060 - - sphinx<=3.1.1 + - gitdb + - sphinx # documentation (jupyter notebooks) - nbconvert>=5.4.1 @@ -51,10 +55,12 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=5.0.1,<6.0.0rc0 + - flask + - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio + - pytest-instafail # downstream tests - seaborn @@ -92,11 +98,11 @@ dependencies: - odfpy - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - pyarrow>=0.15.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow - pyqt>=5.9.2 # pandas.read_clipboard - - pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf + - pytables>=3.5.1 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - fsspec>=0.7.4 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path @@ -105,6 +111,7 @@ dependencies: - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown + - natsort # DataFrame.sort_values - pip: - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master - git+https://github.com/numpy/numpydoc diff --git a/flake8/cython-template.cfg b/flake8/cython-template.cfg index 61562bd7701b1..3d7b288fd8055 100644 --- a/flake8/cython-template.cfg +++ b/flake8/cython-template.cfg @@ -1,4 +1,3 @@ [flake8] filename = *.pxi.in select = E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 - diff --git a/pandas/__init__.py b/pandas/__init__.py index d6584bf4f1c4f..cc5d835a52833 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -20,10 +20,9 @@ # numpy compat from pandas.compat.numpy import ( - _np_version_under1p16, - _np_version_under1p17, - _np_version_under1p18, - _is_numpy_dev, + np_version_under1p17 as _np_version_under1p17, + np_version_under1p18 as _np_version_under1p18, + is_numpy_dev as _is_numpy_dev, ) try: @@ -34,7 +33,7 @@ raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build the C extensions first." + "'python setup.py build_ext --force' to build the C extensions first." ) from e from pandas._config import ( @@ -59,6 +58,8 @@ UInt16Dtype, UInt32Dtype, UInt64Dtype, + Float32Dtype, + Float64Dtype, CategoricalDtype, PeriodDtype, IntervalDtype, @@ -101,6 +102,7 @@ to_datetime, to_timedelta, # misc + Flags, Grouper, factorize, unique, @@ -185,181 +187,61 @@ __git_version__ = v.get("full-revisionid") del get_versions, v -# GH 27101 -# TODO: remove Panel compat in 1.0 -if pandas.compat.PY37: - - def __getattr__(name): - import warnings - - if name == "Panel": - - warnings.warn( - "The Panel class is removed from pandas. 
Accessing it " - "from the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - class Panel: - pass - - return Panel - - elif name == "datetime": - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. " - "Import from datetime module instead.", - FutureWarning, - stacklevel=2, - ) - - from datetime import datetime as dt - - return dt - - elif name == "np": - - warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. " - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - import numpy as np - - return np - - elif name in {"SparseSeries", "SparseDataFrame"}: - warnings.warn( - f"The {name} class is removed from pandas. Accessing it from " - "the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - return type(name, (), {}) - - elif name == "SparseArray": - - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=2, - ) - from pandas.core.arrays.sparse import SparseArray as _SparseArray - - return _SparseArray - - raise AttributeError(f"module 'pandas' has no attribute '{name}'") - -else: - - class Panel: - pass - - class SparseDataFrame: - pass - - class SparseSeries: - pass - - class __numpy: - def __init__(self): - import numpy as np - import warnings - - self.np = np - self.warnings = warnings - - def __getattr__(self, item): - self.warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. " - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - - try: - return getattr(self.np, item) - except AttributeError as err: - raise AttributeError(f"module numpy has no attribute {item}") from err - - np = __numpy() - - class __Datetime(type): +# GH 27101 +def __getattr__(name): + import warnings + + if name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) from datetime import datetime as dt - datetime = dt - - def __getattr__(cls, item): - cls.emit_warning() - - try: - return getattr(cls.datetime, item) - except AttributeError as err: - raise AttributeError( - f"module datetime has no attribute {item}" - ) from err - - def __instancecheck__(cls, other): - return isinstance(other, cls.datetime) - - class __DatetimeSub(metaclass=__Datetime): - def emit_warning(dummy=0): - import warnings - - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. " - "Import from datetime instead.", - FutureWarning, - stacklevel=3, - ) - - def __new__(cls, *args, **kwargs): - cls.emit_warning() - from datetime import datetime as dt - - return dt(*args, **kwargs) - - datetime = __DatetimeSub + return dt - class __SparseArray(type): + elif name == "np": - from pandas.core.arrays.sparse import SparseArray as sa + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. 
" + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np - SparseArray = sa + return np - def __instancecheck__(cls, other): - return isinstance(other, cls.SparseArray) + elif name in {"SparseSeries", "SparseDataFrame"}: + warnings.warn( + f"The {name} class is removed from pandas. Accessing it from " + "the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) - class __SparseArraySub(metaclass=__SparseArray): - def emit_warning(dummy=0): - import warnings + return type(name, (), {}) - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=3, - ) + elif name == "SparseArray": - def __new__(cls, *args, **kwargs): - cls.emit_warning() - from pandas.core.arrays.sparse import SparseArray as sa + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray - return sa(*args, **kwargs) + return _SparseArray - SparseArray = __SparseArraySub + raise AttributeError(f"module 'pandas' has no attribute '{name}'") # module level doc-string diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f5e16cddeb04c..512b638fc4877 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -392,7 +392,7 @@ class option_context(ContextDecorator): """ def __init__(self, *args): - if not (len(args) % 2 == 0 and len(args) >= 2): + if len(args) % 2 != 0 or len(args) < 2: raise ValueError( "Need to invoke as option_context(pat, val, [(pat, val), ...])." ) @@ -442,8 +442,8 @@ def register_option( ValueError if `validator` is specified and `defval` is not a valid value. """ - import tokenize import keyword + import tokenize key = key.lower() @@ -460,9 +460,7 @@ def register_option( path = key.split(".") for k in path: - # NOTE: tokenize.Name is not a public constant - # error: Module has no attribute "Name" [attr-defined] - if not re.match("^" + tokenize.Name + "$", k): # type: ignore + if not re.match("^" + tokenize.Name + "$", k): raise ValueError(f"{k} is not a valid identifier") if keyword.iskeyword(k): raise ValueError(f"{k} is a python keyword") @@ -650,7 +648,7 @@ def _build_option_description(k: str) -> str: s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" if d: - rkey = d.rkey if d.rkey else "" + rkey = d.rkey or "" s += "\n (Deprecated" s += f", use `{rkey}` instead." 
s += ")" @@ -660,8 +658,8 @@ def _build_option_description(k: str) -> str: def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): """ Builds a concise listing of available options, grouped by prefix """ - from textwrap import wrap from itertools import groupby + from textwrap import wrap def pp(name: str, ks: Iterable[str]) -> List[str]: pfx = "- " + name + ".[" if name else "" diff --git a/pandas/_config/display.py b/pandas/_config/display.py index ef319f4447565..e4553a2107f87 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -22,7 +22,7 @@ def detect_console_encoding() -> str: encoding = None try: encoding = sys.stdout.encoding or sys.stdin.encoding - except (AttributeError, IOError): + except (AttributeError, OSError): pass # try again for something better diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 66865e1afb952..bc76aca93da2a 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -88,17 +88,18 @@ def _valid_locales(locales, normalize): valid_locales : list A list of valid locales. """ - if normalize: - normalizer = lambda x: locale.normalize(x.strip()) - else: - normalizer = lambda x: x.strip() - - return list(filter(can_set_locale, map(normalizer, locales))) + return [ + loc + for loc in ( + locale.normalize(loc.strip()) if normalize else loc.strip() + for loc in locales + ) + if can_set_locale(loc) + ] def _default_locale_getter(): - raw_locales = subprocess.check_output(["locale -a"], shell=True) - return raw_locales + return subprocess.check_output(["locale -a"], shell=True) def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter): diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6b6ead795584f..734b3d5c09cbf 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1,11 +1,12 @@ import cython from cython import Py_ssize_t -from libc.stdlib cimport malloc, free -from libc.string cimport memmove from libc.math cimport fabs, sqrt +from libc.stdlib cimport free, malloc +from libc.string cimport memmove import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_FLOAT32, @@ -31,12 +32,11 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.util cimport numeric, get_nat - from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -46,7 +46,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) - +from pandas._libs.util cimport get_nat, numeric import pandas._libs.missing as missing @@ -268,7 +268,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): ndarray[float64_t, ndim=2] result ndarray[uint8_t, ndim=2] mask int64_t nobs = 0 - float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor + float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx + float64_t ssqdmy, covxy N, K = (mat).shape @@ -283,37 +284,29 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): with nogil: for xi in range(K): for yi in range(xi + 1): - nobs = sumxx = sumyy = sumx = sumy = 0 + # Welford's method for the variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 for i in range(N): if mask[i, xi] and mask[i, yi]: vx = mat[i, xi] vy = mat[i, yi] nobs += 1 - sumx += vx - sumy += vy + prev_meanx = meanx + prev_meany = meany + meanx = meanx + 1 / nobs * (vx - meanx) + meany = meany + 1 / nobs * (vy - meany) + 
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx) + ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany) + covxy = covxy + (vx - meanx) * (vy - prev_meany) if nobs < minpv: result[xi, yi] = result[yi, xi] = NaN else: - meanx = sumx / nobs - meany = sumy / nobs - - # now the cov numerator - sumx = 0 - - for i in range(N): - if mask[i, xi] and mask[i, yi]: - vx = mat[i, xi] - meanx - vy = mat[i, yi] - meany - - sumx += vx * vy - sumxx += vx * vx - sumyy += vy * vy - - divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy) + divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) if divisor != 0: - result[xi, yi] = result[yi, xi] = sumx / divisor + result[xi, yi] = result[yi, xi] = covxy / divisor else: result[xi, yi] = result[yi, xi] = NaN @@ -325,7 +318,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): @cython.boundscheck(False) @cython.wraparound(False) -def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1) -> ndarray: +def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result @@ -412,7 +405,7 @@ ctypedef fused algos_t: uint8_t -def _validate_limit(nobs: int, limit=None) -> int: +def validate_limit(nobs: int, limit=None) -> int: """ Check that the `limit` argument is a positive integer. @@ -452,7 +445,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - lim = _validate_limit(nright, limit) + lim = validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: return indexer @@ -509,7 +502,7 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) val = values[0] for i in range(N): @@ -537,7 +530,7 @@ def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) for j in range(K): fill_count = 0 @@ -593,7 +586,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - lim = _validate_limit(nright, limit) + lim = validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: return indexer @@ -651,7 +644,7 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) val = values[N - 1] for i in range(N - 1, -1, -1): @@ -681,7 +674,7 @@ def backfill_2d_inplace(algos_t[:, :] values, if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) for j in range(K): fill_count = 0 @@ -799,7 +792,7 @@ ctypedef fused rank_t: @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - rank_t[:] in_arr, + ndarray[rank_t, ndim=1] in_arr, ties_method="average", bint ascending=True, na_option="keep", @@ -1018,7 +1011,7 @@ def rank_1d( def rank_2d( - rank_t[:, :] in_arr, + ndarray[rank_t, ndim=2] in_arr, int axis=0, ties_method="average", bint ascending=True, @@ -1195,19 +1188,23 @@ ctypedef fused diff_t: ctypedef fused out_t: float32_t float64_t + int64_t @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - diff_t[:, :] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, + bint datetimelike=False, ): 
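+    # Note: datetimelike=True means the int64 input represents datetime64 or
+    # timedelta64 data, where NPY_NAT marks a missing value; NaT on either
+    # side of the subtraction must then propagate as NaT in the output.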
cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.is_f_contig() + bint f_contig = arr.flags.f_contiguous + # bint f_contig = arr.is_f_contig() # TODO(cython 3) + diff_t left, right # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 @@ -1217,6 +1214,9 @@ def diff_2d( elif (out_t is float64_t and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): raise NotImplementedError + elif out_t is int64_t and diff_t is not int64_t: + # We only have out_t of int64_t if we have datetimelike + raise NotImplementedError else: # We put this inside an indented else block to avoid cython build # warnings about unreachable code @@ -1230,7 +1230,15 @@ def diff_2d( start, stop = 0, sx + periods for j in range(sy): for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] + left = arr[i, j] + right = arr[i - periods, j] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if periods >= 0: start, stop = periods, sy @@ -1238,7 +1246,15 @@ def diff_2d( start, stop = 0, sy + periods for j in range(start, stop): for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + left = arr[i, j] + right = arr[i, j - periods] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if axis == 0: if periods >= 0: @@ -1247,7 +1263,15 @@ def diff_2d( start, stop = 0, sx + periods for i in range(start, stop): for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] + left = arr[i, j] + right = arr[i - periods, j] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if periods >= 0: start, stop = periods, sy @@ -1255,7 +1279,15 @@ def diff_2d( start, stop = 0, sy + periods for i in range(sx): for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] + left = arr[i, j] + right = arr[i, j - periods] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right # generated from template diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7c57e6ee9dbfd..5c4ba3b2729e3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,27 +1,51 @@ import cython from cython import Py_ssize_t -from cython cimport floating -from libc.stdlib cimport malloc, free +from cython cimport floating +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) +from numpy cimport ( + complex64_t, + complex128_t, + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from numpy.math cimport NAN -cnp.import_array() -from pandas._libs.util cimport numeric, get_nat +cnp.import_array() -from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, - TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, - TIEBREAK_DENSE) -from pandas._libs.algos import (take_2d_axis1_float64_float64, - groupsort_indexer, tiebreakers) +from pandas._libs.algos cimport ( + TIEBREAK_AVERAGE, + 
TIEBREAK_DENSE, + TIEBREAK_FIRST, + TIEBREAK_MAX, + TIEBREAK_MIN, + TiebreakEnumType, + swap, +) +from pandas._libs.util cimport get_nat, numeric + +from pandas._libs.algos import ( + groupsort_indexer, + take_2d_axis1_float64_float64, + tiebreakers, +) from pandas._libs.missing cimport checknull + cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max @@ -205,7 +229,7 @@ def group_cumprod_float64(float64_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cumsum(numeric[:, :] out, - numeric[:, :] values, + ndarray[numeric, ndim=2] values, const int64_t[:] labels, int ngroups, is_datetimelike, @@ -320,7 +344,7 @@ def group_shift_indexer(int64_t[:] out, const int64_t[:] labels, @cython.boundscheck(False) def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, ndarray[uint8_t] mask, object direction, - int64_t limit): + int64_t limit, bint dropna): """ Indexes how to fill values forwards or backwards within a group. @@ -334,6 +358,7 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, direction : {'ffill', 'bfill'} Direction for fill to be applied (forwards or backwards, respectively) limit : Consecutive values to fill before stopping, or -1 for no limit + dropna : Flag to indicate if NaN groups should return all NaN values Notes ----- @@ -357,7 +382,9 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, with nogil: for i in range(N): idx = sorted_labels[i] - if mask[idx] == 1: # is missing + if dropna and labels[idx] == -1: # nan-group gets nan-values + curr_fill_idx = -1 + elif mask[idx] == 1: # is missing # Stop filling once we've hit the limit if filled_vals >= limit and limit != -1: curr_fill_idx = -1 @@ -448,7 +475,7 @@ ctypedef fused complexfloating_t: @cython.boundscheck(False) def _group_add(complexfloating_t[:, :] out, int64_t[:] counts, - complexfloating_t[:, :] values, + ndarray[complexfloating_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -459,8 +486,9 @@ def _group_add(complexfloating_t[:, :] out, complexfloating_t val, count complexfloating_t[:, :] sumx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) - if len(values) != len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -506,7 +534,7 @@ group_add_complex128 = _group_add['double complex'] @cython.boundscheck(False) def _group_prod(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -517,8 +545,9 @@ def _group_prod(floating[:, :] out, floating val, count floating[:, :] prodx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -558,7 +587,7 @@ group_prod_float64 = _group_prod['double'] @cython.cdivision(True) def _group_var(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1, int64_t ddof=1): @@ -567,10 +596,11 @@ def _group_var(floating[:, :] out, floating val, ct, oldmean floating[:, :] mean int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + if len_values != 
len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -615,7 +645,7 @@ group_var_float64 = _group_var['double'] @cython.boundscheck(False) def _group_mean(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): cdef: @@ -623,10 +653,11 @@ def _group_mean(floating[:, :] out, floating val, count floating[:, :] sumx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -665,7 +696,7 @@ group_mean_float64 = _group_mean['double'] @cython.boundscheck(False) def _group_ohlc(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): """ @@ -716,7 +747,7 @@ group_ohlc_float64 = _group_ohlc['double'] @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t] out, - numeric[:] values, + ndarray[numeric, ndim=1] values, ndarray[int64_t] labels, ndarray[uint8_t] mask, float64_t q, @@ -872,13 +903,12 @@ def group_last(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -898,9 +928,7 @@ def group_last(rank_t[:, :] out, for j in range(K): val = values[i, j] - # None should not be treated like other NA-like - # so that it won't be converted to nan - if not checknull(val) or val is None: + if not checknull(val): # NB: use _treat_as_na here once # conditional-nogil is available. 
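+                        # non-missing value: count it toward the group and
+                        # record it as the most recent (i.e. "last") value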
nobs[lab, j] += 1 @@ -908,8 +936,8 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN + if nobs[i, j] < min_count: + out[i, j] = None else: out[i, j] = resx[i, j] else: @@ -930,7 +958,7 @@ def group_last(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -955,8 +983,9 @@ def group_last(rank_t[:, :] out, def group_nth(rank_t[:, :] out, int64_t[:] counts, ndarray[rank_t, ndim=2] values, - const int64_t[:] labels, int64_t rank=1, - Py_ssize_t min_count=-1): + const int64_t[:] labels, + int64_t min_count=-1, int64_t rank=1 + ): """ Only aggregates on axis=0 """ @@ -967,13 +996,12 @@ def group_nth(rank_t[:, :] out, ndarray[int64_t, ndim=2] nobs bint runtime_error = False - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) if rank_t is object: resx = np.empty((out).shape, dtype=object) @@ -993,9 +1021,7 @@ def group_nth(rank_t[:, :] out, for j in range(K): val = values[i, j] - # None should not be treated like other NA-like - # so that it won't be converted to nan - if not checknull(val) or val is None: + if not checknull(val): # NB: use _treat_as_na here once # conditional-nogil is available. nobs[lab, j] += 1 @@ -1004,8 +1030,8 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN + if nobs[i, j] < min_count: + out[i, j] = None else: out[i, j] = resx[i, j] @@ -1028,7 +1054,7 @@ def group_nth(rank_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if rank_t is int64_t: out[i, j] = NPY_NAT elif rank_t is uint64_t: @@ -1048,7 +1074,7 @@ def group_nth(rank_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, :] out, - rank_t[:, :] values, + ndarray[rank_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike, object ties_method="average", @@ -1265,13 +1291,12 @@ def group_max(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) maxx = np.empty_like(out) @@ -1308,11 +1333,12 @@ def group_max(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break else: + out[i, j] = nan_val else: out[i, j] = maxx[i, j] @@ -1340,13 +1366,12 @@ def group_min(groupby_t[:, :] out, bint runtime_error = False int64_t[:, :] nobs - assert min_count == -1, "'min_count' only used in add and prod" - # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") + min_count = max(min_count, 1) nobs = np.zeros((out).shape, dtype=np.int64) minx = np.empty_like(out) @@ -1382,7 +1407,7 @@ def group_min(groupby_t[:, :] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + 
if nobs[i, j] < min_count: if groupby_t is uint64_t: runtime_error = True break @@ -1400,7 +1425,7 @@ def group_min(groupby_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike): @@ -1460,7 +1485,7 @@ def group_cummin(groupby_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike): diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a98820ca57895..f2af04d91a3e3 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -2,10 +2,13 @@ # at https://github.com/veorq/SipHash import cython -from libc.stdlib cimport malloc, free + +from libc.stdlib cimport free, malloc import numpy as np -from numpy cimport ndarray, uint8_t, uint32_t, uint64_t, import_array + +from numpy cimport import_array, ndarray, uint8_t, uint32_t, uint64_t + import_array() from pandas._libs.util cimport is_nan diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 0499eabf708af..7b630c264753f 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,7 +1,29 @@ +from numpy cimport intp_t, ndarray + from pandas._libs.khash cimport ( - kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t, - int64_t, float64_t) -from numpy cimport ndarray + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + kh_float32_t, + kh_float64_t, + kh_int8_t, + kh_int16_t, + kh_int32_t, + kh_int64_t, + kh_pymap_t, + kh_str_t, + kh_uint8_t, + kh_uint16_t, + kh_uint32_t, + kh_uint64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) # prototypes for sharing @@ -20,12 +42,54 @@ cdef class Int64HashTable(HashTable): cpdef get_item(self, int64_t val) cpdef set_item(self, int64_t key, Py_ssize_t val) +cdef class UInt32HashTable(HashTable): + cdef kh_uint32_t *table + + cpdef get_item(self, uint32_t val) + cpdef set_item(self, uint32_t key, Py_ssize_t val) + +cdef class Int32HashTable(HashTable): + cdef kh_int32_t *table + + cpdef get_item(self, int32_t val) + cpdef set_item(self, int32_t key, Py_ssize_t val) + +cdef class UInt16HashTable(HashTable): + cdef kh_uint16_t *table + + cpdef get_item(self, uint16_t val) + cpdef set_item(self, uint16_t key, Py_ssize_t val) + +cdef class Int16HashTable(HashTable): + cdef kh_int16_t *table + + cpdef get_item(self, int16_t val) + cpdef set_item(self, int16_t key, Py_ssize_t val) + +cdef class UInt8HashTable(HashTable): + cdef kh_uint8_t *table + + cpdef get_item(self, uint8_t val) + cpdef set_item(self, uint8_t key, Py_ssize_t val) + +cdef class Int8HashTable(HashTable): + cdef kh_int8_t *table + + cpdef get_item(self, int8_t val) + cpdef set_item(self, int8_t key, Py_ssize_t val) + cdef class Float64HashTable(HashTable): cdef kh_float64_t *table cpdef get_item(self, float64_t val) cpdef set_item(self, float64_t key, Py_ssize_t val) +cdef class Float32HashTable(HashTable): + cdef kh_float32_t *table + + cpdef get_item(self, float32_t val) + cpdef set_item(self, float32_t key, Py_ssize_t val) + cdef class PyObjectHashTable(HashTable): cdef kh_pymap_t *table diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c3dcbb942d7fe..963fddd4d5af9 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,65 +1,28 @@ cimport cython - -from 
cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Free - -from libc.stdlib cimport malloc, free +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.ref cimport Py_INCREF, PyObject +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint32_t, float64_t +from numpy cimport float64_t, ndarray, uint8_t, uint32_t from numpy.math cimport NAN -cnp.import_array() - -from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, - kh_init_str, - kh_put_str, - kh_exist_str, - kh_get_str, - kh_destroy_str, - kh_resize_str, - kh_put_strbox, - kh_get_strbox, - kh_init_strbox, - kh_int64_t, - kh_init_int64, - kh_resize_int64, - kh_destroy_int64, - kh_get_int64, - kh_exist_int64, - kh_put_int64, - kh_float64_t, - kh_exist_float64, - kh_put_float64, - kh_init_float64, - kh_get_float64, - kh_destroy_float64, - kh_resize_float64, - kh_resize_uint64, - kh_exist_uint64, - kh_destroy_uint64, - kh_put_uint64, - kh_get_uint64, - kh_init_uint64, - kh_destroy_pymap, - kh_exist_pymap, - kh_init_pymap, - kh_get_pymap, - kh_put_pymap, - kh_resize_pymap, -) +cnp.import_array() from pandas._libs cimport util - +from pandas._libs.khash cimport KHASH_TRACE_DOMAIN, kh_str_t, khiter_t from pandas._libs.missing cimport checknull +def get_hashtable_trace_domain(): + return KHASH_TRACE_DOMAIN + + cdef int64_t NPY_NAT = util.get_nat() -_SIZE_HINT_LIMIT = (1 << 20) + 7 +SIZE_HINT_LIMIT = (1 << 20) + 7 cdef Py_ssize_t _INIT_VEC_CAP = 128 @@ -179,7 +142,7 @@ def unique_label_indices(const int64_t[:] labels): ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_int64(table, min(n, SIZE_HINT_LIMIT)) with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e0e026fe7cb5e..b582ed1533a8e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -5,6 +5,35 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +{{py: + +# name +cimported_types = ['float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'pymap', + 'str', + 'strbox', + 'uint8', + 'uint16', + 'uint32', + 'uint64'] +}} + +{{for name in cimported_types}} +from pandas._libs.khash cimport ( + kh_destroy_{{name}}, + kh_exist_{{name}}, + kh_get_{{name}}, + kh_init_{{name}}, + kh_put_{{name}}, + kh_resize_{{name}}, +) +{{endfor}} + # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- @@ -20,9 +49,16 @@ from pandas._libs.missing cimport C_NA # for uniques in hashtables) dtypes = [('Float64', 'float64', 'float64_t'), + ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), + ('Int32', 'int32', 'int32_t'), + ('Int16', 'int16', 'int16_t'), + ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), - ('UInt64', 'uint64', 'uint64_t')] + ('UInt64', 'uint64', 'uint64_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('UInt8', 'uint8', 'uint8_t')] }} {{for name, dtype, c_type in dtypes}} @@ -49,8 +85,15 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData + Int32VectorData + Int16VectorData + Int8VectorData UInt64VectorData + UInt32VectorData + UInt16VectorData + UInt8VectorData Float64VectorData + Float32VectorData StringVectorData cdef 
inline bint needs_resize(vector_data *data) nogil: @@ -65,7 +108,14 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, c_type dtypes = [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t'), - ('Int64', 'int64', 'int64_t')] + ('Int64', 'int64', 'int64_t'), + ('Float32', 'float32', 'float32_t'), + ('UInt32', 'uint32', 'uint32_t'), + ('Int32', 'int32', 'int32_t'), + ('UInt16', 'uint16', 'uint16_t'), + ('Int16', 'int16', 'int16_t'), + ('UInt8', 'uint8', 'uint8_t'), + ('Int8', 'int8', 'int8_t')] }} @@ -253,22 +303,29 @@ cdef class HashTable: {{py: -# name, dtype, float_group, default_na_value -dtypes = [('Float64', 'float64', True, 'np.nan'), - ('UInt64', 'uint64', False, 0), - ('Int64', 'int64', False, 'NPY_NAT')] +# name, dtype, float_group +dtypes = [('Float64', 'float64', True), + ('UInt64', 'uint64', False), + ('Int64', 'int64', False), + ('Float32', 'float32', True), + ('UInt32', 'uint32', False), + ('Int32', 'int32', False), + ('UInt16', 'uint16', False), + ('Int16', 'int16', False), + ('UInt8', 'uint8', False), + ('Int8', 'int8', False)] }} -{{for name, dtype, float_group, default_na_value in dtypes}} +{{for name, dtype, float_group in dtypes}} cdef class {{name}}HashTable(HashTable): def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) def __len__(self) -> int: @@ -287,9 +344,11 @@ cdef class {{name}}HashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof({{dtype}}_t) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs cpdef get_item(self, {{dtype}}_t val): cdef: @@ -347,7 +406,7 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{dtype}}_t val khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) with nogil: for i in range(n): @@ -430,7 +489,7 @@ cdef class {{name}}HashTable(HashTable): # which is only used if it's *specified*. 
na_value2 = <{{dtype}}_t>na_value else: - na_value2 = {{default_na_value}} + na_value2 = 0 with nogil: for i in range(n): @@ -551,7 +610,7 @@ cdef class {{name}}HashTable(HashTable): def get_labels_groupby(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) - int64_t[:] labels + intp_t[:] labels Py_ssize_t idx, count = 0 int ret = 0 {{dtype}}_t val @@ -559,7 +618,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) ud = uniques.data with nogil: @@ -603,7 +662,7 @@ cdef class StringHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_str() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_str(self.table, size_hint) def __dealloc__(self): @@ -612,10 +671,11 @@ cdef class StringHashTable(HashTable): self.table = NULL def sizeof(self, deep=False): - """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(char *) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof(char *) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs cpdef get_item(self, str val): cdef: @@ -648,8 +708,8 @@ cdef class StringHashTable(HashTable): def get_indexer(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - int64_t *resbuf = labels.data + ndarray[intp_t] labels = np.empty(n, dtype=np.intp) + intp_t *resbuf = labels.data khiter_t k kh_str_t *table = self.table const char *v @@ -680,7 +740,7 @@ cdef class StringHashTable(HashTable): object val const char *v khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) # these by-definition *must* be strings vecs = malloc(n * sizeof(char *)) @@ -916,7 +976,7 @@ cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_pymap(self.table, size_hint) def __dealloc__(self): @@ -937,9 +997,11 @@ cdef class PyObjectHashTable(HashTable): def sizeof(self, deep=False): """ return the size of my table in bytes """ - return self.table.n_buckets * (sizeof(PyObject *) + # keys - sizeof(Py_ssize_t) + # vals - sizeof(uint32_t)) # flags + overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) + for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) + for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys + sizeof(Py_ssize_t)) # vals + return overhead + for_flags + for_pairs cpdef get_item(self, object val): cdef: @@ -986,7 +1048,7 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) for i in range(n): val = values[i] diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 0cc0a6b192df5..7c5afa4ff6b27 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -8,9 +8,16 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # dtype, ttype, c_type 
dtypes = [('float64', 'float64', 'float64_t'), + ('float32', 'float32', 'float32_t'), ('uint64', 'uint64', 'uint64_t'), + ('uint32', 'uint32', 'uint32_t'), + ('uint16', 'uint16', 'uint16_t'), + ('uint8', 'uint8', 'uint8_t'), ('object', 'pymap', 'object'), - ('int64', 'int64', 'int64_t')] + ('int64', 'int64', 'int64_t'), + ('int32', 'int32', 'int32_t'), + ('int16', 'int16', 'int16_t'), + ('int8', 'int8', 'int8_t')] }} @@ -54,7 +61,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, for i in range(n): val = values[i] - {{if dtype == 'float64'}} + {{if dtype == 'float64' or dtype == 'float32'}} if val == val or not dropna: {{else}} if True: @@ -138,7 +145,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') @@ -208,7 +215,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): {{if dtype == 'object'}} def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} -def ismember_{{dtype}}(const {{c_type}}[:] arr, {{c_type}}[:] values): +def ismember_{{dtype}}(const {{c_type}}[:] arr, const {{c_type}}[:] values): {{endif}} """ Return boolean of values in arr on an @@ -275,8 +282,15 @@ def ismember_{{dtype}}(const {{c_type}}[:] arr, {{c_type}}[:] values): # dtype, ctype, table_type, npy_dtype dtypes = [('float64', 'float64_t', 'float64', 'float64'), + ('float32', 'float32_t', 'float32', 'float32'), ('int64', 'int64_t', 'int64', 'int64'), + ('int32', 'int32_t', 'int32', 'int32'), + ('int16', 'int16_t', 'int16', 'int16'), + ('int8', 'int8_t', 'int8', 'int8'), ('uint64', 'uint64_t', 'uint64', 'uint64'), + ('uint32', 'uint32_t', 'uint32', 'uint32'), + ('uint16', 'uint16_t', 'uint16', 'uint16'), + ('uint8', 'uint8_t', 'uint8', 'uint8'), ('object', 'object', 'pymap', 'object_')] }} diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 35c4b73b47695..e31c3739f456d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,6 +1,7 @@ import warnings import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,17 +17,16 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - +from pandas._libs.hashtable cimport HashTable from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timedeltas cimport _Timedelta - -from pandas._libs.hashtable cimport HashTable +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs import algos, hashtable as _hash from pandas._libs.missing import checknull @@ -80,7 +80,11 @@ cdef class IndexEngine: values = self._get_index_values() self._check_type(val) - loc = _bin_search(values, val) # .searchsorted(val, side='left') + try: + loc = _bin_search(values, val) # .searchsorted(val, side='left') + except TypeError: + # GH#35788 e.g. 
val=None with float64 values + raise KeyError(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: @@ -256,13 +260,13 @@ cdef class IndexEngine: def get_indexer_non_unique(self, targets): """ Return an indexer suitable for taking from a non unique index - return the labels in the same order ast the target + return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results """ cdef: ndarray values, x - ndarray[int64_t] result, missing + ndarray[intp_t] result, missing set stargets, remaining_stargets dict d = {} object val @@ -279,8 +283,8 @@ cdef class IndexEngine: else: n_alloc = n - result = np.empty(n_alloc, dtype=np.int64) - missing = np.empty(n_t, dtype=np.int64) + result = np.empty(n_alloc, dtype=np.intp) + missing = np.empty(n_t, dtype=np.intp) # map each starget to its position in the index if stargets and len(stargets) < 5 and self.is_monotonic_increasing: diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index c7b67667bda17..69680e472bbc2 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -10,21 +10,21 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dtype, hashtable_name -dtypes = [('Float64', 'float64', 'Float64'), - ('Float32', 'float32', 'Float64'), - ('Int64', 'int64', 'Int64'), - ('Int32', 'int32', 'Int64'), - ('Int16', 'int16', 'Int64'), - ('Int8', 'int8', 'Int64'), - ('UInt64', 'uint64', 'UInt64'), - ('UInt32', 'uint32', 'UInt64'), - ('UInt16', 'uint16', 'UInt64'), - ('UInt8', 'uint8', 'UInt64'), +# name, dtype +dtypes = [('Float64', 'float64'), + ('Float32', 'float32'), + ('Int64', 'int64'), + ('Int32', 'int32'), + ('Int16', 'int16'), + ('Int8', 'int8'), + ('UInt64', 'uint64'), + ('UInt32', 'uint32'), + ('UInt16', 'uint16'), + ('UInt8', 'uint8'), ] }} -{{for name, dtype, hashtable_name in dtypes}} +{{for name, dtype in dtypes}} cdef class {{name}}Engine(IndexEngine): @@ -32,7 +32,7 @@ cdef class {{name}}Engine(IndexEngine): # returns an ndarray with dtype {{dtype}}_t cdef _make_hash_table(self, Py_ssize_t n): - return _hash.{{hashtable_name}}HashTable(n) + return _hash.{{name}}HashTable(n) {{if name not in {'Float64', 'Float32'} }} cdef _check_type(self, object val): @@ -41,9 +41,7 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} cdef void _call_map_locations(self, values): - # self.mapping is of type {{hashtable_name}}HashTable, - # so convert dtype of values - self.mapping.map_locations(algos.ensure_{{hashtable_name.lower()}}(values)) + self.mapping.map_locations(algos.ensure_{{name.lower()}}(values)) cdef _maybe_get_bool_indexer(self, object val): cdef: diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index f9aedeb8ad93e..7966fe8d4f045 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -1,4 +1,4 @@ -cdef class _NDFrameIndexerBase: +cdef class NDFrameIndexerBase: """ A base class for _NDFrameIndexer for fast instantiation and attribute access. 
""" diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b4b490f49b12..006fd34632d5a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -5,12 +5,15 @@ from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx + cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np + cimport numpy as cnp from numpy cimport NPY_INT64, int64_t + cnp.import_array() from pandas._libs.algos import ensure_int64 @@ -204,7 +207,7 @@ cdef slice slice_canonize(slice s): Convert slice to canonical bounded form. """ cdef: - Py_ssize_t start = 0, stop = 0, step = 1, length + Py_ssize_t start = 0, stop = 0, step = 1 if s.step is None: step = 1 @@ -236,7 +239,7 @@ cdef slice slice_canonize(slice s): if stop > start: stop = start - if start < 0 or (stop < 0 and s.stop is not None): + if start < 0 or (stop < 0 and s.stop is not None and step > 0): raise ValueError("unbounded slice") if stop < 0: diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 95881ebf1385c..10becdce5d6dd 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,7 +1,8 @@ import numbers from operator import le, lt -from cpython.datetime cimport PyDelta_Check, PyDateTime_IMPORT +from cpython.datetime cimport PyDateTime_IMPORT, PyDelta_Check + PyDateTime_IMPORT from cpython.object cimport ( @@ -16,8 +17,8 @@ from cpython.object cimport ( import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_QUICKSORT, @@ -30,23 +31,22 @@ from numpy cimport ( ndarray, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - from pandas._libs.hashtable cimport Int64Vector +from pandas._libs.tslibs.timedeltas cimport _Timedelta +from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.util cimport ( - is_integer_object, is_float_object, + is_integer_object, is_timedelta64_object, ) -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timedeltas cimport _Timedelta - -_VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) +VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) cdef class IntervalMixin: @@ -179,7 +179,8 @@ cdef class IntervalMixin: return (self.right == self.left) & (self.closed != 'both') def _check_closed_matches(self, other, name='other'): - """Check if the closed attribute of `other` matches. + """ + Check if the closed attribute of `other` matches. Note that 'left' and 'right' are considered different from 'both'. 
@@ -291,12 +292,6 @@ cdef class Interval(IntervalMixin): True >>> year_2017.length Timedelta('365 days 00:00:00') - - And also you can create string intervals - - >>> volume_1 = pd.Interval('Ant', 'Dog', closed='both') - >>> 'Bee' in volume_1 - True """ _typ = "interval" __array_priority__ = 1000 @@ -324,7 +319,7 @@ cdef class Interval(IntervalMixin): self._validate_endpoint(left) self._validate_endpoint(right) - if closed not in _VALID_CLOSED: + if closed not in VALID_CLOSED: raise ValueError(f"invalid option for 'closed': {closed}") if not left <= right: raise ValueError("left side of interval must be <= right side") @@ -358,6 +353,11 @@ cdef class Interval(IntervalMixin): self_tuple = (self.left, self.right, self.closed) other_tuple = (other.left, other.right, other.closed) return PyObject_RichCompare(self_tuple, other_tuple, op) + elif util.is_array(other): + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) return NotImplemented diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 54892a7e4bc77..1b79d68c13570 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,6 +16,7 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs.algos import ( @@ -267,7 +268,7 @@ ctypedef fused join_t: @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_unique(join_t[:] left, join_t[:] right): +def left_join_indexer_unique(ndarray[join_t] left, ndarray[join_t] right): cdef: Py_ssize_t i, j, nleft, nright ndarray[int64_t] indexer @@ -640,7 +641,11 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # ---------------------------------------------------------------------- from pandas._libs.hashtable cimport ( - HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + HashTable, + Int64HashTable, + PyObjectHashTable, + UInt64HashTable, +) ctypedef fused asof_t: uint8_t diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index b5fe73df5d9be..0d0c5ae058b21 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -1,7 +1,21 @@ from cpython.object cimport PyObject -from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) + cdef extern from "khash_python.h": + const int KHASH_TRACE_DOMAIN + ctypedef uint32_t khint_t ctypedef khint_t khiter_t @@ -66,72 +80,6 @@ cdef extern from "khash_python.h": void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil - ctypedef struct kh_int64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int64_t *keys - size_t *vals - - kh_int64_t* kh_init_int64() nogil - void kh_destroy_int64(kh_int64_t*) nogil - void kh_clear_int64(kh_int64_t*) nogil - khint_t kh_get_int64(kh_int64_t*, int64_t) nogil - void kh_resize_int64(kh_int64_t*, khint_t) nogil - khint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil - void kh_del_int64(kh_int64_t*, khint_t) nogil - - bint kh_exist_int64(kh_int64_t*, khiter_t) nogil - - ctypedef uint64_t khuint64_t - - ctypedef struct kh_uint64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - khuint64_t *keys - size_t *vals - - kh_uint64_t* kh_init_uint64() nogil - void kh_destroy_uint64(kh_uint64_t*) nogil - void 
kh_clear_uint64(kh_uint64_t*) nogil - khint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil - void kh_resize_uint64(kh_uint64_t*, khint_t) nogil - khint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil - void kh_del_uint64(kh_uint64_t*, khint_t) nogil - - bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil - - ctypedef struct kh_float64_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - float64_t *keys - size_t *vals - - kh_float64_t* kh_init_float64() nogil - void kh_destroy_float64(kh_float64_t*) nogil - void kh_clear_float64(kh_float64_t*) nogil - khint_t kh_get_float64(kh_float64_t*, float64_t) nogil - void kh_resize_float64(kh_float64_t*, khint_t) nogil - khint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil - void kh_del_float64(kh_float64_t*, khint_t) nogil - - bint kh_exist_float64(kh_float64_t*, khiter_t) nogil - - ctypedef struct kh_int32_t: - khint_t n_buckets, size, n_occupied, upper_bound - uint32_t *flags - int32_t *keys - size_t *vals - - kh_int32_t* kh_init_int32() nogil - void kh_destroy_int32(kh_int32_t*) nogil - void kh_clear_int32(kh_int32_t*) nogil - khint_t kh_get_int32(kh_int32_t*, int32_t) nogil - void kh_resize_int32(kh_int32_t*, khint_t) nogil - khint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil - void kh_del_int32(kh_int32_t*, khint_t) nogil - - bint kh_exist_int32(kh_int32_t*, khiter_t) nogil - # sweep factorize ctypedef struct kh_strbox_t: @@ -149,3 +97,5 @@ cdef extern from "khash_python.h": void kh_del_strbox(kh_strbox_t*, khint_t) nogil bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil + +include "khash_for_primitive_helper.pxi" diff --git a/pandas/_libs/khash_for_primitive_helper.pxi.in b/pandas/_libs/khash_for_primitive_helper.pxi.in new file mode 100644 index 0000000000000..db8d3e0b19417 --- /dev/null +++ b/pandas/_libs/khash_for_primitive_helper.pxi.in @@ -0,0 +1,42 @@ +""" +Template for wrapping khash-tables for each primitive `dtype` + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +{{py: + +# name, c_type +primitive_types = [('int64', 'int64_t'), + ('uint64', 'uint64_t'), + ('float64', 'float64_t'), + ('int32', 'int32_t'), + ('uint32', 'uint32_t'), + ('float32', 'float32_t'), + ('int16', 'int16_t'), + ('uint16', 'uint16_t'), + ('int8', 'int8_t'), + ('uint8', 'uint8_t'), + ] +}} + +{{for name, c_type in primitive_types}} + +cdef extern from "khash_python.h": + ctypedef struct kh_{{name}}_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + {{c_type}} *keys + size_t *vals + + kh_{{name}}_t* kh_init_{{name}}() nogil + void kh_destroy_{{name}}(kh_{{name}}_t*) nogil + void kh_clear_{{name}}(kh_{{name}}_t*) nogil + khint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil + void kh_resize_{{name}}(kh_{{name}}_t*, khint_t) nogil + khint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil + void kh_del_{{name}}(kh_{{name}}_t*, khint_t) nogil + + bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil + +{{endfor}} diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 37d83a73c6597..c5fb20596d7b6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -5,23 +5,24 @@ import warnings import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_RichCompareBool, Py_EQ -from cpython.ref cimport Py_INCREF -from cpython.tuple cimport PyTuple_SET_ITEM, PyTuple_New -from cpython.iterator cimport PyIter_Check -from cpython.sequence cimport PySequence_Check -from cpython.number cimport PyNumber_Check - from cpython.datetime cimport ( - 
PyDateTime_Check, PyDate_Check, - PyTime_Check, - PyDelta_Check, + PyDateTime_Check, PyDateTime_IMPORT, + PyDelta_Check, + PyTime_Check, ) +from cpython.iterator cimport PyIter_Check +from cpython.number cimport PyNumber_Check +from cpython.object cimport Py_EQ, PyObject_RichCompareBool +from cpython.ref cimport Py_INCREF +from cpython.sequence cimport PySequence_Check +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM + PyDateTime_IMPORT import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_OBJECT, @@ -35,10 +36,12 @@ from numpy cimport ( float32_t, float64_t, int64_t, + intp_t, ndarray, uint8_t, uint64_t, ) + cnp.import_array() cdef extern from "numpy/arrayobject.h": @@ -63,28 +66,23 @@ cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 from pandas._libs cimport util -from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan from pandas._libs.tslib import array_to_datetime -from pandas._libs.tslibs.nattype cimport ( - NPY_NAT, - c_NaT as NaT, - checknull_with_nat, -) -from pandas._libs.tslibs.conversion cimport convert_to_tsobject -from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.missing cimport ( + C_NA, checknull, - isnaobj, is_null_datetime64, is_null_timedelta64, - C_NA, + isnaobj, ) - +from pandas._libs.tslibs.conversion cimport convert_to_tsobject +from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT, checknull_with_nat +from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.period cimport is_period_object +from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 +from pandas._libs.tslibs.timezones cimport tz_compare # constants that will be compared to potentially arbitrarily large # python int @@ -120,6 +118,8 @@ def memory_usage_of_objects(arr: object[:]) -> int64_t: def is_scalar(val: object) -> bool: """ + Return True if given object is scalar. 
+ Parameters ---------- val : object @@ -493,7 +493,7 @@ def has_infs_f8(const float64_t[:] arr) -> bool: return False -def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len): +def maybe_indices_to_slice(ndarray[intp_t] indices, int max_len): cdef: Py_ssize_t i, n = len(indices) int k, vstart, vlast, v @@ -587,14 +587,16 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: elif not (PyObject_RichCompareBool(x, y, Py_EQ) or (x is None or is_nan(x)) and (y is None or is_nan(y))): return False - except TypeError as err: - # Avoid raising TypeError on tzawareness mismatch - # TODO: This try/except can be removed if/when Timestamp - # comparisons are changed to match datetime, see GH#28507 - if "tz-naive and tz-aware" in str(err): + except ValueError: + # Avoid raising ValueError when comparing Numpy arrays to other types + if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y): + # Only compare scalars to scalars and non-scalars to non-scalars + return False + elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y)) + and not (isinstance(x, type(y)) or isinstance(y, type(x)))): + # Check if non-scalars have the same type return False raise - return True @@ -621,35 +623,62 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) -def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: - """ - Convert all elements in an array to string. +cpdef ndarray[object] ensure_string_array( + arr, + object na_value=np.nan, + bint convert_na_value=True, + bint copy=True, + bint skipna=True, +): + """Returns a new numpy array with object dtype and only strings and na values. Parameters ---------- - arr : ndarray - The array whose elements we are casting. - skipna : bool, default False + arr : array-like + The values to be converted to str, if needed. + na_value : Any, default np.nan + The value to use for na. For example, np.nan or pd.NA. + convert_na_value : bool, default True + If False, existing na values will be used unchanged in the new array. + copy : bool, default True + Whether to ensure that a new array is returned. + skipna : bool, default True Whether or not to coerce nulls to their stringified form - (e.g. NaN becomes 'nan'). + (e.g. if False, NaN becomes 'nan'). Returns ------- ndarray - A new array with the input array's elements casted. + An array with the input array's elements casted to str or nan-like. 
""" cdef: - object arr_i - Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) + Py_ssize_t i = 0, n = len(arr) + + if hasattr(arr, "to_numpy"): + arr = arr.to_numpy() + elif not isinstance(arr, np.ndarray): + arr = np.array(arr, dtype="object") + + result = np.asarray(arr, dtype="object") + + if copy and result is arr: + result = result.copy() for i in range(n): - arr_i = arr[i] + val = arr[i] - if not (skipna and checknull(arr_i)): - arr_i = str(arr_i) + if isinstance(val, str): + continue - result[i] = arr_i + if not checknull(val): + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = str(val) return result @@ -869,21 +898,28 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, if lab != cur: if lab != -1: - tup = PyTuple_New(k) - for j in range(k): - val = keys[j][sorted_labels[j][i - 1]] - PyTuple_SET_ITEM(tup, j, val) - Py_INCREF(val) - + if k == 1: + # When k = 1 we do not want to return a tuple as key + tup = keys[0][sorted_labels[0][i - 1]] + else: + tup = PyTuple_New(k) + for j in range(k): + val = keys[j][sorted_labels[j][i - 1]] + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) result[tup] = index[start:i] start = i cur = lab - tup = PyTuple_New(k) - for j in range(k): - val = keys[j][sorted_labels[j][n - 1]] - PyTuple_SET_ITEM(tup, j, val) - Py_INCREF(val) + if k == 1: + # When k = 1 we do not want to return a tuple as key + tup = keys[0][sorted_labels[0][n - 1]] + else: + tup = PyTuple_New(k) + for j in range(k): + val = keys[j][sorted_labels[j][n - 1]] + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) result[tup] = index[start:] return result @@ -893,6 +929,8 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, def is_float(obj: object) -> bool: """ + Return True if given object is float. + Returns ------- bool @@ -902,6 +940,8 @@ def is_float(obj: object) -> bool: def is_integer(obj: object) -> bool: """ + Return True if given object is integer. + Returns ------- bool @@ -911,6 +951,8 @@ def is_integer(obj: object) -> bool: def is_bool(obj: object) -> bool: """ + Return True if given object is boolean. + Returns ------- bool @@ -920,6 +962,8 @@ def is_bool(obj: object) -> bool: def is_complex(obj: object) -> bool: """ + Return True if given object is complex. + Returns ------- bool @@ -937,7 +981,7 @@ cpdef bint is_interval(object obj): def is_period(val: object) -> bool: """ - Return a boolean if this is a Period object. + Return True if given object is Period. 
Returns ------- @@ -983,7 +1027,7 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: False >>> is_list_like(np.array([2])) True - >>> is_list_like(np.array(2))) + >>> is_list_like(np.array(2)) False """ return c_is_list_like(obj, allow_sets) @@ -1317,8 +1361,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if not isinstance(value, list): value = list(value) - from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike) + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike values = construct_1d_object_array_from_listlike(value) # make contiguous @@ -1394,10 +1437,12 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "time" elif is_decimal(val): - return "decimal" + if is_decimal_array(values): + return "decimal" elif is_complex(val): - return "complex" + if is_complex_array(values): + return "complex" elif util.is_float_object(val): if is_float_array(values): @@ -1438,7 +1483,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "mixed" -def infer_datetimelike_array(arr: object) -> object: +def infer_datetimelike_array(arr: ndarray[object]) -> str: """ Infer if we have a datetime or timedelta array. - date: we have *only* date and maybe strings, nulls @@ -1451,7 +1496,7 @@ def infer_datetimelike_array(arr: object) -> object: Parameters ---------- - arr : object array + arr : ndarray[object] Returns ------- @@ -1682,6 +1727,34 @@ cpdef bint is_float_array(ndarray values): return validator.validate(values) +cdef class ComplexValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return ( + util.is_complex_object(value) + or (util.is_float_object(value) and is_nan(value)) + ) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.complexfloating) + + +cdef bint is_complex_array(ndarray values): + cdef: + ComplexValidator validator = ComplexValidator(len(values), values.dtype) + return validator.validate(values) + + +cdef class DecimalValidator(Validator): + cdef inline bint is_value_typed(self, object value) except -1: + return is_decimal(value) + + +cdef bint is_decimal_array(ndarray values): + cdef: + DecimalValidator validator = DecimalValidator(len(values), values.dtype) + return validator.validate(values) + + cdef class StringValidator(Validator): cdef inline bint is_value_typed(self, object value) except -1: return isinstance(value, str) @@ -1969,7 +2042,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, elif util.is_bool_object(val): floats[i] = uints[i] = ints[i] = bools[i] = val seen.bool_ = True - elif val is None: + elif val is None or val is C_NA: seen.saw_null() floats[i] = complexes[i] = NaN elif hasattr(val, '__len__') and len(val) == 0: @@ -2364,7 +2437,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr @cython.boundscheck(False) @cython.wraparound(False) -def map_infer(ndarray arr, object f, bint convert=True): +def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False): """ Substitute for np.vectorize with pandas-friendly dtype inference. 
@@ -2372,6 +2445,9 @@ def map_infer(ndarray arr, object f, bint convert=True): ---------- arr : ndarray f : function + convert : bint + ignore_na : bint + If True, NA values will not have f applied Returns ------- @@ -2385,6 +2461,9 @@ def map_infer(ndarray arr, object f, bint convert=True): n = len(arr) result = np.empty(n, dtype=object) for i in range(n): + if ignore_na and checknull(arr[i]): + result[i] = arr[i] + continue val = f(arr[i]) if cnp.PyArray_IsZeroDim(val): @@ -2520,8 +2599,6 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): # kludge, for Series return np.empty(0, dtype='f8') - keys = getattr(keys, 'values', keys) - for i in range(n): val = keys[i] if val in mapping: diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 090c5c5173280..e02b84381b62c 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,5 +1,6 @@ from numpy cimport ndarray, uint8_t + cpdef bint checknull(object val) cpdef bint checknull_old(object val) cpdef ndarray[uint8_t] isnaobj(ndarray arr) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index fdd06fe631b97..abf38265ddc6d 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,26 +1,24 @@ -import cython -from cython import Py_ssize_t - import numbers +import cython +from cython import Py_ssize_t import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t + cnp.import_array() from pandas._libs cimport util - - -from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, checknull_with_nat, is_null_datetimelike, ) -from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op - -from pandas.compat import is_platform_32bit +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +from pandas.compat import IS64 cdef: float64_t INF = np.inf @@ -28,7 +26,7 @@ cdef: int64_t NPY_NAT = util.get_nat() - bint is_32bit = is_platform_32bit() + bint is_32bit = not IS64 cpdef bint checknull(object val): @@ -157,7 +155,10 @@ def isnaobj_old(arr: ndarray) -> ndarray: result = np.zeros(n, dtype=np.uint8) for i in range(n): val = arr[i] - result[i] = checknull(val) or val == INF or val == NEGINF + result[i] = ( + checknull(val) + or util.is_float_object(val) and (val == INF or val == NEGINF) + ) return result.view(np.bool_) diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 658600cdfbe6c..d1f897d237c1b 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -10,18 +10,17 @@ from cpython.object cimport ( PyObject_RichCompareBool, ) - import cython from cython import Py_ssize_t - import numpy as np -from numpy cimport ndarray, uint8_t, import_array -import_array() +from numpy cimport import_array, ndarray, uint8_t + +import_array() -from pandas._libs.util cimport UINT8_MAX, is_nan from pandas._libs.missing cimport checknull +from pandas._libs.util cimport UINT8_MAX, is_nan @cython.wraparound(False) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6ffb036e01595..eae72d700190d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,25 +1,19 @@ # Copyright (c) 2012, Lambda Foundry, Inc. 
 # See LICENSE for the license
-import bz2
-import gzip
-import io
-import os
+from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC
+from errno import ENOENT
 import sys
 import time
 import warnings
-import zipfile
-
-from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE
-from errno import ENOENT

 from libc.stdlib cimport free
-from libc.string cimport strncpy, strlen, strcasecmp
+from libc.string cimport strcasecmp, strlen, strncpy

 import cython
 from cython import Py_ssize_t

-from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString
-from cpython.exc cimport PyErr_Occurred, PyErr_Fetch
+from cpython.bytes cimport PyBytes_AsString
+from cpython.exc cimport PyErr_Fetch, PyErr_Occurred
 from cpython.object cimport PyObject
 from cpython.ref cimport Py_XDECREF
 from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode

@@ -30,42 +24,62 @@ cdef extern from "Python.h":

 import numpy as np
+
 cimport numpy as cnp
-from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t
+from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t
+
 cnp.import_array()

 from pandas._libs cimport util
-from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN
+from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX
+
 import pandas._libs.lib as lib

 from pandas._libs.khash cimport (
-    khiter_t,
-    kh_str_t, kh_init_str, kh_put_str, kh_exist_str,
-    kh_get_str, kh_destroy_str,
-    kh_float64_t, kh_get_float64, kh_destroy_float64,
-    kh_put_float64, kh_init_float64, kh_resize_float64,
-    kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox,
+    kh_destroy_float64,
+    kh_destroy_str,
+    kh_destroy_str_starts,
     kh_destroy_strbox,
-    kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts,
-    kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts)
+    kh_exist_str,
+    kh_float64_t,
+    kh_get_float64,
+    kh_get_str,
+    kh_get_str_starts_item,
+    kh_get_strbox,
+    kh_init_float64,
+    kh_init_str,
+    kh_init_str_starts,
+    kh_init_strbox,
+    kh_put_float64,
+    kh_put_str,
+    kh_put_str_starts_item,
+    kh_put_strbox,
+    kh_resize_float64,
+    kh_resize_str_starts,
+    kh_str_starts_t,
+    kh_str_t,
+    kh_strbox_t,
+    khiter_t,
+)
+
+from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning

 from pandas.core.dtypes.common import (
+    is_bool_dtype,
     is_categorical_dtype,
-    is_integer_dtype, is_float_dtype,
-    is_bool_dtype, is_object_dtype, is_datetime64_dtype,
-    pandas_dtype, is_extension_array_dtype)
+    is_extension_array_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+    pandas_dtype,
+)
 from pandas.core.dtypes.concat import union_categoricals

-from pandas.compat import _import_lzma, _get_lzma_file
-from pandas.errors import (ParserError, DtypeWarning,
-                           EmptyDataError, ParserWarning)
-
-lzma = _import_lzma()
-
 cdef:
     float64_t INF = np.inf
     float64_t NEGINF = -INF

+    int64_t DEFAULT_CHUNKSIZE = 256 * 1024
+

 cdef extern from "headers/portable.h":
@@ -147,7 +161,6 @@ cdef extern from "parser/tokenizer.h":
         char commentchar
         int allow_embedded_newline
-        int strict                 # raise exception on bad CSV */

         int usecols
@@ -254,14 +267,15 @@ cdef extern from "parser/io.h":
                    size_t *bytes_read, int *status)

-DEFAULT_CHUNKSIZE = 256 * 1024
-
-
 cdef class TextReader:
     """

    # source: StringIO or file object

+    .. versionchanged:: 1.2.0
+        removed the 'compression', 'memory_map', and 'encoding' arguments;
+        these are now handled by CParserWrapper.
+        'source' has to be a file handle.
""" cdef: @@ -278,7 +292,7 @@ cdef class TextReader: cdef public: int64_t leading_cols, table_width, skipfooter, buffer_lines - bint allow_leading_cols, mangle_dupe_cols, memory_map, low_memory + bint allow_leading_cols, mangle_dupe_cols, low_memory bint delim_whitespace object delimiter, converters object na_values @@ -286,8 +300,6 @@ cdef class TextReader: object index_col object skiprows object dtype - object encoding - object compression object usecols list dtype_cast_order set unnamed_cols @@ -300,10 +312,8 @@ cdef class TextReader: header_end=0, index_col=None, names=None, - bint memory_map=False, tokenize_chunksize=DEFAULT_CHUNKSIZE, bint delim_whitespace=False, - compression=None, converters=None, bint skipinitialspace=False, escapechar=None, @@ -311,7 +321,6 @@ cdef class TextReader: quotechar=b'"', quoting=0, lineterminator=None, - encoding=None, comment=None, decimal=b'.', thousands=None, @@ -335,15 +344,7 @@ cdef class TextReader: bint skip_blank_lines=True): # set encoding for native Python and C library - if encoding is not None: - if not isinstance(encoding, bytes): - encoding = encoding.encode('utf-8') - encoding = encoding.lower() - self.c_encoding = encoding - else: - self.c_encoding = NULL - - self.encoding = encoding + self.c_encoding = NULL self.parser = parser_new() self.parser.chunksize = tokenize_chunksize @@ -353,9 +354,6 @@ cdef class TextReader: # For timekeeping self.clocks = [] - self.compression = compression - self.memory_map = memory_map - self.parser.usecols = (usecols is not None) self._setup_parser_source(source) @@ -455,10 +453,13 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 self.parser.double_converter = round_trip - elif float_precision == "high": + elif float_precision == "legacy": + self.parser.double_converter = xstrtod + elif float_precision == "high" or float_precision is None: self.parser.double_converter = precise_xstrtod else: - self.parser.double_converter = xstrtod + raise ValueError(f'Unrecognized float_precision option: ' + f'{float_precision}') if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) @@ -538,11 +539,6 @@ cdef class TextReader: parser_del(self.parser) def close(self): - # we need to properly close an open derived - # filehandle here, e.g. 
and UTFRecoder - if self.handle is not None: - self.handle.close() - # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: @@ -590,82 +586,15 @@ cdef class TextReader: cdef: void *ptr - self.parser.cb_io = NULL - self.parser.cb_cleanup = NULL - - if self.compression: - if self.compression == 'gzip': - if isinstance(source, str): - source = gzip.GzipFile(source, 'rb') - else: - source = gzip.GzipFile(fileobj=source) - elif self.compression == 'bz2': - source = bz2.BZ2File(source, 'rb') - elif self.compression == 'zip': - zip_file = zipfile.ZipFile(source) - zip_names = zip_file.namelist() - - if len(zip_names) == 1: - file_name = zip_names.pop() - source = zip_file.open(file_name) - - elif len(zip_names) == 0: - raise ValueError(f'Zero files found in compressed ' - f'zip file {source}') - else: - raise ValueError(f'Multiple files found in compressed ' - f'zip file {zip_names}') - elif self.compression == 'xz': - if isinstance(source, str): - source = _get_lzma_file(lzma)(source, 'rb') - else: - source = _get_lzma_file(lzma)(filename=source) - else: - raise ValueError(f'Unrecognized compression type: ' - f'{self.compression}') - - if (self.encoding and hasattr(source, "read") and - not hasattr(source, "encoding")): - source = io.TextIOWrapper( - source, self.encoding.decode('utf-8'), newline='') - - self.encoding = b'utf-8' - self.c_encoding = self.encoding - - self.handle = source - - if isinstance(source, str): - encoding = sys.getfilesystemencoding() or "utf-8" - usource = source - source = source.encode(encoding) - - if self.memory_map: - ptr = new_mmap(source) - if ptr == NULL: - # fall back - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - else: - self.parser.cb_io = &buffer_mmap_bytes - self.parser.cb_cleanup = &del_mmap - else: - ptr = new_file_source(source, self.parser.chunksize) - self.parser.cb_io = &buffer_file_bytes - self.parser.cb_cleanup = &del_file_source - self.parser.source = ptr - - elif hasattr(source, 'read'): - # e.g., StringIO - - ptr = new_rd_source(source) - self.parser.source = ptr - self.parser.cb_io = &buffer_rd_bytes - self.parser.cb_cleanup = &del_rd_source - else: + if not hasattr(source, "read"): raise IOError(f'Expected file path name or file-like object, ' f'got {type(source)} type') + ptr = new_rd_source(source) + self.parser.source = ptr + self.parser.cb_io = &buffer_rd_bytes + self.parser.cb_cleanup = &del_rd_source + cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 97c491776f831..4b6b71088cb7c 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,24 +1,25 @@ from copy import copy -from cython import Py_ssize_t - -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport int64_t, ndarray + cnp.import_array() -from pandas._libs cimport util -from pandas._libs.lib import maybe_convert_objects, is_scalar +from pandas._libs.util cimport is_array, set_array_not_contiguous +from pandas._libs.lib import is_scalar, maybe_convert_objects -cdef _check_result_array(object obj, Py_ssize_t cnt): - if (util.is_array(obj) or +cpdef check_result_array(object obj, Py_ssize_t cnt): + + if (is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == 
(cnt,)): - raise ValueError('Function does not reduce') + raise ValueError('Must produce aggregated value') cdef class _BaseGrouper: @@ -30,7 +31,7 @@ cdef class _BaseGrouper: if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') - if util.is_array(values) and not values.flags.contiguous: + if is_array(values) and not values.flags.contiguous: # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values @@ -43,13 +44,16 @@ cdef class _BaseGrouper: Slider islider, Slider vslider): if cached_typ is None: cached_ityp = self.ityp(islider.buf) - cached_typ = self.typ(vslider.buf, index=cached_ityp, name=self.name) + cached_typ = self.typ( + vslider.buf, dtype=vslider.buf.dtype, index=cached_ityp, name=self.name + ) else: # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference # to a 1-d ndarray like datetime / timedelta / period. object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() + cached_ityp._cache.clear() # e.g. inferred_freq must go object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) object.__setattr__(cached_typ._mgr._block, 'mgr_locs', slice(len(vslider.buf))) @@ -68,13 +72,16 @@ cdef class _BaseGrouper: object res cached_ityp._engine.clear_mapping() + cached_ityp._cache.clear() # e.g. inferred_freq must go res = self.f(cached_typ) - res = _extract_result(res) + res = extract_result(res) if not initialized: # On the first pass, we check the output shape to see # if this looks like a reduction. initialized = True - _check_result_array(res, len(self.dummy_arr)) + # In all tests other than test_series_grouper and + # test_series_bin_grouper, we have len(self.dummy_arr) == 0 + check_result_array(res, len(self.dummy_arr)) return res, initialized @@ -99,7 +106,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.f = f values = series.values - if util.is_array(values) and not values.flags.c_contiguous: + if is_array(values) and not values.flags.c_contiguous: # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -197,7 +204,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.f = f values = series.values - if util.is_array(values) and not values.flags.c_contiguous: + if is_array(values) and not values.flags.c_contiguous: # e.g. 
Categorical has no `flags` attribute values = values.copy('C') self.arr = values @@ -273,12 +280,17 @@ cdef class SeriesGrouper(_BaseGrouper): return result, counts -cdef inline _extract_result(object res, bint squeeze=True): +cpdef inline extract_result(object res, bint squeeze=True): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ - if hasattr(res, 'values') and util.is_array(res.values): + if hasattr(res, "_values"): + # Preserve EA + res = res._values + if squeeze and res.ndim == 1 and len(res) == 1: + res = res[0] + if hasattr(res, 'values') and is_array(res.values): res = res.values - if util.is_array(res): + if is_array(res): if res.ndim == 0: res = res.item() elif squeeze and res.ndim == 1 and len(res) == 1: @@ -292,7 +304,7 @@ cdef class Slider: """ cdef: ndarray values, buf - Py_ssize_t stride, orig_len, orig_stride + Py_ssize_t stride char *orig_data def __init__(self, ndarray values, ndarray buf): @@ -304,11 +316,9 @@ cdef class Slider: self.values = values self.buf = buf - self.stride = values.strides[0] + self.stride = values.strides[0] self.orig_data = self.buf.data - self.orig_len = self.buf.shape[0] - self.orig_stride = self.buf.strides[0] self.buf.data = self.values.data self.buf.strides[0] = self.stride @@ -321,10 +331,8 @@ cdef class Slider: self.buf.shape[0] = end - start cdef reset(self): - - self.buf.shape[0] = self.orig_len self.buf.data = self.orig_data - self.buf.strides[0] = self.orig_stride + self.buf.shape[0] = 0 class InvalidApply(Exception): @@ -359,14 +367,14 @@ def apply_frame_axis0(object frame, object f, object names, try: piece = f(chunk) - except Exception: + except Exception as err: # We can't be more specific without knowing something about `f` - raise InvalidApply('Let this error raise above us') + raise InvalidApply("Let this error raise above us") from err # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if not piece.index.equals(chunk.index): + if not piece.index is chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int @@ -396,39 +404,34 @@ cdef class BlockSlider: """ Only capable of sliding on axis=0 """ - - cdef public: - object frame, dummy, index - int nblocks - Slider idx_slider - list blocks - cdef: + object frame, dummy, index, block + list blk_values + ndarray values + Slider idx_slider char **base_ptrs + int nblocks + Py_ssize_t i def __init__(self, object frame): - cdef: - Py_ssize_t i - object b - self.frame = frame self.dummy = frame[:0] self.index = self.dummy.index - self.blocks = [b.values for b in self.dummy._mgr.blocks] + self.blk_values = [block.values for block in self.dummy._mgr.blocks] - for x in self.blocks: - util.set_array_not_contiguous(x) + for values in self.blk_values: + set_array_not_contiguous(values) - self.nblocks = len(self.blocks) + self.nblocks = len(self.blk_values) # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference to a 1-d # ndarray like datetime / timedelta / period. 
        self.idx_slider = Slider(
            self.frame.index._index_data, self.dummy.index._index_data)

-        self.base_ptrs = <char**>malloc(sizeof(char*) * len(self.blocks))
-        for i, block in enumerate(self.blocks):
+        self.base_ptrs = <char**>malloc(sizeof(char*) * self.nblocks)
+        for i, block in enumerate(self.blk_values):
             self.base_ptrs[i] = (<ndarray>block).data

     def __dealloc__(self):
@@ -438,10 +441,9 @@ cdef class BlockSlider:
         cdef:
             ndarray arr
             Py_ssize_t i

-        # move blocks
         for i in range(self.nblocks):
-            arr = self.blocks[i]
+            arr = self.blk_values[i]
             # axis=1 is the frame's axis=0
             arr.data = self.base_ptrs[i] + arr.strides[1] * start
@@ -452,15 +454,14 @@ cdef class BlockSlider:
         object.__setattr__(self.index, '_index_data', self.idx_slider.buf)
         self.index._engine.clear_mapping()
+        self.index._cache.clear()  # e.g. inferred_freq must go

     cdef reset(self):
         cdef:
             ndarray arr
             Py_ssize_t i
-
-        # reset blocks
         for i in range(self.nblocks):
-            arr = self.blocks[i]
+            arr = self.blk_values[i]
             # axis=1 is the frame's axis=0
             arr.data = self.base_ptrs[i]
diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx
index da4dd00027395..75dbb4b74aabd 100644
--- a/pandas/_libs/reshape.pyx
+++ b/pandas/_libs/reshape.pyx
@@ -16,7 +16,9 @@ from numpy cimport (
 )

 import numpy as np
+
 cimport numpy as cnp
+
 cnp.import_array()

 from pandas._libs.lib cimport c_is_list_like
@@ -122,7 +124,8 @@ def explode(ndarray[object] values):
     counts = np.zeros(n, dtype='int64')
     for i in range(n):
         v = values[i]
-        if c_is_list_like(v, False):
+
+        if c_is_list_like(v, True):
             if len(v):
                 counts[i] += len(v)
             else:
@@ -136,8 +139,9 @@ def explode(ndarray[object] values):

     for i in range(n):
         v = values[i]
-        if c_is_list_like(v, False):
+        if c_is_list_like(v, True):
             if len(v):
+                v = list(v)
                 for j in range(len(v)):
                     result[count] = v[j]
                     count += 1
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
index 7c9575d921dc9..0c3d8915b749b 100644
--- a/pandas/_libs/sparse.pyx
+++ b/pandas/_libs/sparse.pyx
@@ -1,9 +1,18 @@
 import cython
-
 import numpy as np
+
 cimport numpy as cnp
-from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t,
-                    float64_t, float32_t)
+from numpy cimport (
+    float32_t,
+    float64_t,
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    ndarray,
+    uint8_t,
+)
+
 cnp.import_array()


@@ -94,7 +103,7 @@ cdef class IntIndex(SparseIndex):
         if not monotonic:
             raise ValueError("Indices must be strictly increasing")

-    def equals(self, other) -> bool:
+    def equals(self, other: object) -> bool:
         if not isinstance(other, IntIndex):
             return False

@@ -390,7 +399,7 @@ cdef class BlockIndex(SparseIndex):
             if blengths[i] == 0:
                 raise ValueError(f'Zero-length block {i}')

-    def equals(self, other) -> bool:
+    def equals(self, other: object) -> bool:
         if not isinstance(other, BlockIndex):
             return False
diff --git a/pandas/_libs/src/klib/khash.h b/pandas/_libs/src/klib/khash.h
index 916838d1e9584..bb56b2fe2d145 100644
--- a/pandas/_libs/src/klib/khash.h
+++ b/pandas/_libs/src/klib/khash.h
@@ -115,6 +115,24 @@ int main() {
 #include "../inline_helper.h"

+// hooks for the memory allocator; the C-runtime allocator is used by default
+#ifndef KHASH_MALLOC
+#define KHASH_MALLOC malloc
+#endif
+
+#ifndef KHASH_REALLOC
+#define KHASH_REALLOC realloc
+#endif
+
+#ifndef KHASH_CALLOC
+#define KHASH_CALLOC calloc
+#endif
+
+#ifndef KHASH_FREE
+#define KHASH_FREE free
+#endif
+
+
 #if UINT_MAX == 0xffffffffu
 typedef unsigned int khint32_t;
 #elif ULONG_MAX == 0xffffffffu
@@ -122,14 +140,23 @@ typedef unsigned long khint32_t;
 #endif

 #if ULONG_MAX == ULLONG_MAX
-typedef unsigned long khuint64_t;
-typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; +#endif + +#if UINT_MAX == 0xffffu +typedef unsigned int khint16_t; +#elif USHRT_MAX == 0xffffu +typedef unsigned short khint16_t; +#endif + +#if UCHAR_MAX == 0xffu +typedef unsigned char khint8_t; #endif typedef double khfloat64_t; +typedef float khfloat32_t; typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -143,10 +170,86 @@ typedef khint_t khiter_t; #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) ((void)0) + +// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp +khint32_t PANDAS_INLINE murmur2_32to32(khint32_t k){ + const khint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khint32_t h = SEED ^ 4; + + //handle 4 bytes: + k *= M_32; + k ^= k >> R_32; + k *= M_32; + + h *= M_32; + h ^= k; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. (Really needed here?) + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; +} + +// it is possible to have a special x64-version, which would need less operations, but +// using 32bit version always has also some benifits: +// - one code for 32bit and 64bit builds +// - the same case for 32bit and 64bit builds +// - no performance difference could be measured compared to a possible x64-version + +khint32_t PANDAS_INLINE murmur2_32_32to32(khint32_t k1, khint32_t k2){ + const khint32_t SEED = 0xc70f6907UL; + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + const khint32_t M_32 = 0x5bd1e995; + const int R_32 = 24; + + // Initialize the hash to a 'random' value + khint32_t h = SEED ^ 4; + + //handle first 4 bytes: + k1 *= M_32; + k1 ^= k1 >> R_32; + k1 *= M_32; + + h *= M_32; + h ^= k1; + + //handle second 4 bytes: + k2 *= M_32; + k2 ^= k2 >> R_32; + k2 *= M_32; + + h *= M_32; + h ^= k2; + + // Do a few final mixes of the hash to ensure the "last few + // bytes" are well-incorporated. + h ^= h >> 13; + h *= M_32; + h ^= h >> 15; + return h; +} + +khint32_t PANDAS_INLINE murmur2_64to32(khint64_t k){ + khint32_t k1 = (khint32_t)k; + khint32_t k2 = (khint32_t)(k >> 32); + + return murmur2_32_32to32(k1, k2); +} + + #ifdef KHASH_LINEAR #define __ac_inc(k, m) 1 #else -#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m) #endif #define __ac_fsize(m) ((m) < 32? 
@@ -180,14 +283,14 @@ static const double __ac_HASH_UPPER = 0.77;
        khval_t *vals; \
    } kh_##name##_t; \
    SCOPE kh_##name##_t *kh_init_##name(void) { \
-       return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
+       return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
    } \
    SCOPE void kh_destroy_##name(kh_##name##_t *h) \
    { \
        if (h) { \
-           free(h->keys); free(h->flags); \
-           free(h->vals); \
-           free(h); \
+           KHASH_FREE(h->keys); KHASH_FREE(h->flags); \
+           KHASH_FREE(h->vals); \
+           KHASH_FREE(h); \
        } \
    } \
    SCOPE void kh_clear_##name(kh_##name##_t *h) \
@@ -220,11 +323,11 @@ static const double __ac_HASH_UPPER = 0.77;
            if (new_n_buckets < 4) new_n_buckets = 4; \
            if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
            else { /* hash table size to be changed (shrink or expand); rehash */ \
-               new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+               new_flags = (khint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
                memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
                if (h->n_buckets < new_n_buckets) { /* expand */ \
-                   h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
-                   if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                   h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
+                   if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
                } /* otherwise shrink */ \
            } \
        } \
@@ -257,10 +360,10 @@ static const double __ac_HASH_UPPER = 0.77;
                } \
            } \
            if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
-               h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
-               if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+               h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
+               if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
            } \
-           free(h->flags); /* free the working space */ \
+           KHASH_FREE(h->flags); /* free the working space */ \
            h->flags = new_flags; \
            h->n_buckets = new_n_buckets; \
            h->n_occupied = h->size; \
@@ -512,15 +615,25 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key)
  @param  name  Name of the hash table [symbol]
  @param  khval_t  Type of values [type]
 */
+
+// we implicitly convert signed int to unsigned int, thus potential overflows
+// for operations (<<,*,+) don't trigger undefined behavior, also the >>-operator
+// is implementation defined for signed ints if the sign bit is set.
+// because we never really "get" the keys, there will be no conversion from
+// unsigned int to (signed) int (which would be implementation defined behavior)
+// this also holds for 64-, 16- and 8-bit integers
 #define KHASH_MAP_INIT_INT(name, khval_t) \
    KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)

+#define KHASH_MAP_INIT_UINT(name, khval_t) \
+   KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
 /*!
@function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -531,11 +644,34 @@ PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) +/*! @function + @abstract Instantiate a hash map containing 16bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT16(name, khval_t) \ + KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 8bit-integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +#define KHASH_MAP_INIT_UINT8(name, khval_t) \ + KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + + typedef const char *kh_cstr_t; /*! @function @@ -558,12 +694,23 @@ typedef const char *kh_cstr_t; #define kh_exist_float64(h, k) (kh_exist(h, k)) #define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_float32(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) +#define kh_exist_uint32(h, k) (kh_exist(h, k)) +#define kh_exist_int16(h, k) (kh_exist(h, k)) +#define kh_exist_uint16(h, k) (kh_exist(h, k)) +#define kh_exist_int8(h, k) (kh_exist(h, k)) +#define kh_exist_uint8(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_UINT(uint32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) KHASH_MAP_INIT_UINT64(uint64, size_t) +KHASH_MAP_INIT_INT16(int16, size_t) +KHASH_MAP_INIT_UINT16(uint16, size_t) +KHASH_MAP_INIT_INT8(int8, size_t) +KHASH_MAP_INIT_UINT8(uint8, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 82251744915a5..8e4e61b4f3077 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -1,6 +1,59 @@ #include #include +// khash should report usage to tracemalloc +#if PY_VERSION_HEX >= 0x03060000 +#include +#if PY_VERSION_HEX < 0x03070000 +#define PyTraceMalloc_Track _PyTraceMalloc_Track +#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack +#endif +#else +#define PyTraceMalloc_Track(...) +#define PyTraceMalloc_Untrack(...) 
+#endif
+
+
+static const int KHASH_TRACE_DOMAIN = 424242;
+void *traced_malloc(size_t size){
+    void * ptr = malloc(size);
+    if(ptr!=NULL){
+        PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
+    }
+    return ptr;
+}
+
+void *traced_calloc(size_t num, size_t size){
+    void * ptr = calloc(num, size);
+    if(ptr!=NULL){
+        PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size);
+    }
+    return ptr;
+}
+
+void *traced_realloc(void* old_ptr, size_t size){
+    void * ptr = realloc(old_ptr, size);
+    if(ptr!=NULL){
+        if(old_ptr != ptr){
+            PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
+        }
+        PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
+    }
+    return ptr;
+}
+
+void traced_free(void* ptr){
+    if(ptr!=NULL){
+        PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
+    }
+    free(ptr);
+}
+
+
+#define KHASH_MALLOC traced_malloc
+#define KHASH_REALLOC traced_realloc
+#define KHASH_CALLOC traced_calloc
+#define KHASH_FREE traced_free
 #include "khash.h"

 // Previously we were using the built in cpython hash function for doubles
@@ -13,33 +66,68 @@
 // is 64 bits the truncation causes collision issues. Given all that, we use our own
 // simple hash, viewing the double bytes as an int64 and using khash's default
 // hash for 64 bit integers.
-// GH 13436
+// GH 13436 showed that _Py_HashDouble doesn't work well with khash
+// GH 28303 showed that the simple xoring-version isn't good enough
+// See GH 36729 for evaluation of the currently used murmur2-hash version
+// An interesting alternative to the expensive murmur2 hash would be to change
+// the probing strategy and use e.g. the probing strategy from CPython's
+// implementation of dicts, which shines for smaller sizes but is more
+// predisposed to superlinear running times (see GH 36729 for comparison)
+
+
 khint64_t PANDAS_INLINE asint64(double key) {
-  khint64_t val;
-  memcpy(&val, &key, sizeof(double));
-  return val;
+    khint64_t val;
+    memcpy(&val, &key, sizeof(double));
+    return val;
 }

-// correct for all inputs but not -0.0 and NaNs
-#define kh_float64_hash_func_0_NAN(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11)
+khint32_t PANDAS_INLINE asint32(float key) {
+    khint32_t val;
+    memcpy(&val, &key, sizeof(float));
+    return val;
+}

-// correct for all inputs but not NaNs
-#define kh_float64_hash_func_NAN(key) ((key) == 0.0 ?                        \
-                                       kh_float64_hash_func_0_NAN(0.0) :     \
-                                       kh_float64_hash_func_0_NAN(key))
+#define ZERO_HASH 0
+#define NAN_HASH 0

-// correct for all
-#define kh_float64_hash_func(key) ((key) != (key) ?
\ - kh_float64_hash_func_NAN(Py_NAN) : \ - kh_float64_hash_func_NAN(key)) +khint32_t PANDAS_INLINE kh_float64_hash_func(double val){ + // 0.0 and -0.0 should have the same hash: + if (val == 0.0){ + return ZERO_HASH; + } + // all nans should have the same hash: + if ( val!=val ){ + return NAN_HASH; + } + khint64_t as_int = asint64(val); + return murmur2_64to32(as_int); +} -#define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) +khint32_t PANDAS_INLINE kh_float32_hash_func(float val){ + // 0.0 and -0.0 should have the same hash: + if (val == 0.0f){ + return ZERO_HASH; + } + // all nans should have the same hash: + if ( val!=val ){ + return NAN_HASH; + } + khint32_t as_int = asint32(val); + return murmur2_32to32(as_int); +} + +#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ - KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal) KHASH_MAP_INIT_FLOAT64(float64, size_t) +#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \ + KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal) + +KHASH_MAP_INIT_FLOAT32(float32, size_t) + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); @@ -93,7 +181,7 @@ typedef struct { typedef kh_str_starts_t* p_kh_str_starts_t; p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) { - kh_str_starts_t *result = (kh_str_starts_t*)calloc(1, sizeof(kh_str_starts_t)); + kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t)); result->table = kh_init_str(); return result; } @@ -116,9 +204,9 @@ khint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { kh_destroy_str(table->table); - free(table); + KHASH_FREE(table); } void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { kh_resize_str(table->table, val); -} \ No newline at end of file +} diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 2ada0a4bd173d..d161c4e29fe15 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -18,7 +18,9 @@ int to_double(char *item, double *p_value, char sci, char decimal, char *p_end = NULL; int error = 0; - *p_value = xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int); + /* Switch to precise xstrtod GH 31364 */ + *p_value = precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, + &error, maybe_int); return (error == 0) && (!*p_end); } diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a195c0daf5271..965fece370721 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -91,7 +91,6 @@ void parser_set_default_options(parser_t *self) { self->skipinitialspace = 0; self->quoting = QUOTE_MINIMAL; self->allow_embedded_newline = 1; - self->strict = 0; self->expected_fields = -1; self->error_bad_lines = 0; @@ -159,7 +158,7 @@ int parser_init(parser_t *self) { self->warn_msg = NULL; // token stream - self->stream = (char *)malloc(STREAM_INIT_SIZE * sizeof(char)); + self->stream = malloc(STREAM_INIT_SIZE * sizeof(char)); if (self->stream == NULL) { parser_cleanup(self); return PARSER_OUT_OF_MEMORY; @@ -170,16 +169,16 @@ int parser_init(parser_t *self) { // word pointers and metadata sz = STREAM_INIT_SIZE / 
10; sz = sz ? sz : 1; - self->words = (char **)malloc(sz * sizeof(char *)); - self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t)); + self->words = malloc(sz * sizeof(char *)); + self->word_starts = malloc(sz * sizeof(int64_t)); self->max_words_cap = sz; self->words_cap = sz; self->words_len = 0; // line pointers and metadata - self->line_start = (int64_t *)malloc(sz * sizeof(int64_t)); + self->line_start = malloc(sz * sizeof(int64_t)); - self->line_fields = (int64_t *)malloc(sz * sizeof(int64_t)); + self->line_fields = malloc(sz * sizeof(int64_t)); self->lines_cap = sz; self->lines = 0; @@ -345,7 +344,7 @@ static int push_char(parser_t *self, char c) { "self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) int64_t bufsize = 100; - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; @@ -362,7 +361,7 @@ int PANDAS_INLINE end_field(parser_t *self) { "self->words_cap(%zu)\n", self->words_len, self->words_cap)) int64_t bufsize = 100; - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - possible malformed input file.\n"); return PARSER_OUT_OF_MEMORY; @@ -398,7 +397,7 @@ static void append_warning(parser_t *self, const char *msg) { void *newptr; if (self->warn_msg == NULL) { - self->warn_msg = (char *)malloc(length + 1); + self->warn_msg = malloc(length + 1); snprintf(self->warn_msg, length + 1, "%s", msg); } else { ex_length = strlen(self->warn_msg); @@ -459,10 +458,10 @@ static int end_line(parser_t *self) { // file_lines is now the actual file line number (starting at 1) if (self->error_bad_lines) { - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, - "Expected %d fields in line %lld, saw %lld\n", - ex_fields, (long long)self->file_lines, (long long)fields); + "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", + ex_fields, self->file_lines, fields); TRACE(("Error at line %d, %d fields\n", self->file_lines, fields)); @@ -471,11 +470,10 @@ static int end_line(parser_t *self) { // simply skip bad lines if (self->warn_bad_lines) { // pass up error message - msg = (char *)malloc(bufsize); + msg = malloc(bufsize); snprintf(msg, bufsize, - "Skipping line %lld: expected %d fields, saw %lld\n", - (long long)self->file_lines, ex_fields, - (long long)fields); + "Skipping line %" PRIu64 ": expected %d fields, saw %" + PRId64 "\n", self->file_lines, ex_fields, fields); append_warning(self, msg); free(msg); } @@ -487,7 +485,7 @@ static int end_line(parser_t *self) { // might overrun the buffer when closing fields if (make_stream_space(self, ex_fields - fields) < 0) { int64_t bufsize = 100; - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -508,7 +506,7 @@ static int end_line(parser_t *self) { "end_line: ERROR!!! 
self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) int64_t bufsize = 100; - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Buffer overflow caught - " "possible malformed input file.\n"); @@ -569,7 +567,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { if (status != REACHED_EOF && self->data == NULL) { int64_t bufsize = 200; - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); if (status == CALLING_READ_FAILED) { snprintf(self->error_msg, bufsize, @@ -600,7 +598,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \ self->stream_cap)) \ int64_t bufsize = 100; \ - self->error_msg = (char *)malloc(bufsize); \ + self->error_msg = malloc(bufsize); \ snprintf(self->error_msg, bufsize, \ "Buffer overflow caught - possible malformed input file.\n");\ return PARSER_OUT_OF_MEMORY; \ @@ -730,7 +728,7 @@ int tokenize_bytes(parser_t *self, if (make_stream_space(self, self->datalen - self->datapos) < 0) { int64_t bufsize = 100; - self->error_msg = (char *)malloc(bufsize); + self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "out of memory"); return -1; } @@ -1032,15 +1030,9 @@ int tokenize_bytes(parser_t *self, } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } else if (!self->strict) { + } else { PUSH_CHAR(c); self->state = IN_FIELD; - } else { - int64_t bufsize = 100; - self->error_msg = (char *)malloc(bufsize); - snprintf(self->error_msg, bufsize, - "delimiter expected after quote in quote"); - goto parsingerror; } break; @@ -1150,8 +1142,8 @@ static int parser_handle_eof(parser_t *self) { case IN_QUOTED_FIELD: self->error_msg = (char *)malloc(bufsize); snprintf(self->error_msg, bufsize, - "EOF inside string starting at row %lld", - (long long)self->file_lines); + "EOF inside string starting at row %" PRIu64, + self->file_lines); return -1; case ESCAPED_CHAR: @@ -1203,7 +1195,7 @@ int parser_consume_rows(parser_t *self, size_t nrows) { /* move stream, only if something to move */ if (char_count < self->stream_len) { - memmove((void *)self->stream, (void *)(self->stream + char_count), + memmove(self->stream, (self->stream + char_count), self->stream_len - char_count); } /* buffer counts */ @@ -1269,20 +1261,16 @@ int parser_trim_buffers(parser_t *self) { new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); - newptr = realloc((void *)self->words, new_cap * sizeof(char *)); - if (newptr == NULL) { + self->words = realloc(self->words, new_cap * sizeof(char *)); + if (self->words == NULL) { return PARSER_OUT_OF_MEMORY; - } else { - self->words = (char **)newptr; } - newptr = realloc((void *)self->word_starts, - new_cap * sizeof(int64_t)); - if (newptr == NULL) { + self->word_starts = realloc(self->word_starts, + new_cap * sizeof(int64_t)); + if (self->word_starts == NULL) { return PARSER_OUT_OF_MEMORY; - } else { - self->word_starts = (int64_t *)newptr; - self->words_cap = new_cap; } + self->words_cap = new_cap; } /* trim stream */ @@ -1295,7 +1283,7 @@ int parser_trim_buffers(parser_t *self) { TRACE( ("parser_trim_buffers: new_cap < self->stream_cap, calling " "realloc\n")); - newptr = realloc((void *)self->stream, new_cap); + newptr = realloc(self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { @@ -1321,19 +1309,19 @@ int 
parser_trim_buffers(parser_t *self) {
     new_cap = _next_pow2(self->lines) + 1;
     if (new_cap < self->lines_cap) {
         TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
-        newptr = realloc((void *)self->line_start,
+        newptr = realloc(self->line_start,
                          new_cap * sizeof(int64_t));
         if (newptr == NULL) {
             return PARSER_OUT_OF_MEMORY;
         } else {
-            self->line_start = (int64_t *)newptr;
+            self->line_start = newptr;
         }
-        newptr = realloc((void *)self->line_fields,
+        newptr = realloc(self->line_fields,
                          new_cap * sizeof(int64_t));
         if (newptr == NULL) {
             return PARSER_OUT_OF_MEMORY;
         } else {
-            self->line_fields = (int64_t *)newptr;
+            self->line_fields = newptr;
             self->lines_cap = new_cap;
         }
     }
@@ -1778,20 +1766,73 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
     return number;
 }

+/* copy a decimal number string with `decimal`, `tsep` as decimal point
+   and thousands separator to an equivalent c-locale decimal string (stripping
+   `tsep`, replacing `decimal` with '.'). The returned memory should be freed
+   with a call to `free`.
+*/
+
+char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
+                              char tsep) {
+    const char *p = s;
+    size_t length = strlen(s);
+    char *s_copy = malloc(length + 1);
+    char *dst = s_copy;
+    // Copy leading sign
+    if (*p == '+' || *p == '-') {
+        *dst++ = *p++;
+    }
+    // Copy integer part dropping `tsep`
+    while (isdigit_ascii(*p)) {
+        *dst++ = *p++;
+        p += (tsep != '\0' && *p == tsep);
+    }
+    // Replace `decimal` with '.'
+    if (*p == decimal) {
+        *dst++ = '.';
+        p++;
+    }
+    // Copy the remainder of the string as is.
+    strncpy(dst, p, length + 1 - (p - s));
+    if (endpos != NULL)
+        *endpos = (char *)(s + length);
+    return s_copy;
+}
+
+
 double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
                   int skip_trailing, int *error, int *maybe_int) {
+    // 'normalize' representation to C-locale; replace decimal with '.' and
+    // remove t(housand)sep.
+    char *endptr;
+    char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
     // This is called from a nogil block in parsers.pyx
     // so need to explicitly get GIL before Python calls
     PyGILState_STATE gstate;
     gstate = PyGILState_Ensure();
-
-    double r = PyOS_string_to_double(p, q, 0);
+    char *endpc;
+    double r = PyOS_string_to_double(pc, &endpc, 0);
+    // PyOS_string_to_double needs to consume the whole string
+    if (endpc == pc + strlen(pc)) {
+        if (q != NULL) {
+            // report endptr from source string (p)
+            *q = endptr;
+        }
+    } else {
+        *error = -1;
+        if (q != NULL) {
+            // p and pc are of different lengths due to tsep removal. Can't report
+            // how much it has consumed of p. Just rewind to beginning.
+ *q = (char *)p; // TODO(willayd): this could be undefined behavior + } + } if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); + free(pc); return r; } @@ -1810,7 +1851,7 @@ int uint64_conflict(uint_state *self) { int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { - const char *p = (const char *)p_item; + const char *p = p_item; int isneg = 0; int64_t number = 0; int d; @@ -1930,7 +1971,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { - const char *p = (const char *)p_item; + const char *p = p_item; uint64_t pre_max = uint_max / 10; int dig_pre_max = uint_max % 10; uint64_t number = 0; diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 7dfae737718a5..876e2267906ee 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -132,7 +132,6 @@ typedef struct parser_t { char commentchar; int allow_embedded_newline; - int strict; /* raise exception on bad CSV */ int usecols; // Boolean: 1: usecols provided, 0: none provided diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 5343999c369f7..2af10a5b72d33 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1134,7 +1134,7 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, } break; - + } } diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 785a4d1f8b923..7a2fa471b9ba8 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,35 +1,17 @@ +import cmath import math import numpy as np + from numpy cimport import_array + import_array() -from pandas._libs.util cimport is_array +from pandas._libs.lib import is_complex +from pandas._libs.util cimport is_array, is_real_number_object -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.common import is_dtype_equal - -cdef NUMERIC_TYPES = ( - bool, - int, - float, - np.bool_, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - np.float16, - np.float32, - np.float64, -) - - -cdef bint is_comparable_as_number(obj): - return isinstance(obj, NUMERIC_TYPES) +from pandas.core.dtypes.missing import array_equivalent, isna cdef bint isiterable(obj): @@ -129,6 +111,7 @@ cpdef assert_almost_equal(a, b, if not isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -181,6 +164,7 @@ cpdef assert_almost_equal(a, b, elif isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -193,7 +177,7 @@ cpdef assert_almost_equal(a, b, # object comparison return True - if is_comparable_as_number(a) and is_comparable_as_number(b): + if is_real_number_object(a) and is_real_number_object(b): if array_equivalent(a, b, strict_nan=True): # inf comparison return True @@ -205,4 +189,14 @@ cpdef assert_almost_equal(a, b, f"with rtol={rtol}, atol={atol}") return True + if is_complex(a) and is_complex(b): + if array_equivalent(a, b, strict_nan=True): + # inf comparison + return True + + if not cmath.isclose(a, b, rel_tol=rtol, 
abs_tol=atol): + assert False, (f"expected {b:.5f} but got {a:.5f}, " + f"with rtol={rtol}, atol={atol}") + return True + raise AssertionError(f"{a} != {b}") diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 35d5cd8f1e275..b1b38505b9476 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,23 +7,20 @@ from cpython.datetime cimport ( datetime, tzinfo, ) + # import datetime C API PyDateTime_IMPORT cimport numpy as cnp from numpy cimport float64_t, int64_t, ndarray + import numpy as np + cnp.import_array() import pytz -from pandas._libs.util cimport ( - is_datetime64_object, - is_float_object, - is_integer_object, -) - from pandas._libs.tslibs.np_datetime cimport ( _string_to_dts, check_dts_bounds, @@ -34,9 +31,9 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.util cimport is_datetime64_object, is_float_object, is_integer_object from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime - from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.conversion cimport ( @@ -44,23 +41,20 @@ from pandas._libs.tslibs.conversion cimport ( cast_from_unit, convert_datetime_to_tsobject, get_datetime64_nanos, + precision_from_unit, ) - from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, ) - from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.tzconversion cimport ( - tz_localize_to_utc_single, -) +from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single def _test_parse_iso8601(ts: str): @@ -212,6 +206,7 @@ def array_with_unit_to_datetime( cdef: Py_ssize_t i, j, n=len(values) int64_t m + int prec = 0 ndarray[float64_t] fvalues bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' @@ -224,38 +219,48 @@ def array_with_unit_to_datetime( assert is_ignore or is_coerce or is_raise - if unit == 'ns': - if issubclass(values.dtype.type, np.integer): - result = values.astype('M8[ns]') + if unit == "ns": + if issubclass(values.dtype.type, (np.integer, np.float_)): + result = values.astype("M8[ns]", copy=False) else: result, tz = array_to_datetime(values.astype(object), errors=errors) return result, tz - m = cast_from_unit(None, unit) + m, p = precision_from_unit(unit) if is_raise: - - # try a quick conversion to i8 + # try a quick conversion to i8/f8 # if we have nulls that are not type-compat # then need to iterate - if values.dtype.kind == "i": - # Note: this condition makes the casting="same_kind" redundant - iresult = values.astype('i8', casting='same_kind', copy=False) - # fill by comparing to NPY_NAT constant + + if values.dtype.kind == "i" or values.dtype.kind == "f": + iresult = values.astype("i8", copy=False) + # fill missing values by comparing to NPY_NAT mask = iresult == NPY_NAT iresult[mask] = 0 - fvalues = iresult.astype('f8') * m + fvalues = iresult.astype("f8") * m need_to_iterate = False - # check the bounds if not need_to_iterate: - - if ((fvalues < Timestamp.min.value).any() - or (fvalues > Timestamp.max.value).any()): + # check the bounds + if (fvalues < Timestamp.min.value).any() or ( + (fvalues > Timestamp.max.value).any() + ): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - result = (iresult * m).astype('M8[ns]') - 
iresult = result.view('i8') + + if values.dtype.kind == "i": + result = (iresult * m).astype("M8[ns]") + + elif values.dtype.kind == "f": + fresult = (values * m).astype("f8") + fresult[mask] = 0 + if prec: + fresult = round(fresult, prec) + result = fresult.astype("M8[ns]", copy=False) + + iresult = result.view("i8") iresult[mask] = NPY_NAT + return result, tz result = np.empty(n, dtype='M8[ns]') diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 4eb5188b8a04b..388fd0c62b937 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -1,6 +1,5 @@ from cython cimport Py_ssize_t - -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t ctypedef (int32_t, int32_t, int32_t) iso_calendar_t diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 00cecd25e5225..d8c83daa661a3 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -5,7 +5,7 @@ Cython implementations of functions resembling the stdlib calendar module import cython -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t # ---------------------------------------------------------------------- # Constants @@ -201,10 +201,10 @@ cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil: iso_week = 1 iso_year = year - if iso_week == 1 and doy > 7: + if iso_week == 1 and month == 12: iso_year += 1 - elif iso_week >= 52 and doy < 7: + elif iso_week >= 52 and month == 1: iso_year -= 1 return iso_year, iso_week, dow + 1 diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 73772e5ab4577..c80be79a12d90 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,6 +1,5 @@ from cpython.datetime cimport datetime, tzinfo - -from numpy cimport int64_t, int32_t, ndarray +from numpy cimport int32_t, int64_t, ndarray from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct @@ -24,5 +23,6 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef datetime localize_pydatetime(datetime dt, object tz) cdef int64_t cast_from_unit(object ts, str unit) except? 
-1 +cpdef (int64_t, int) precision_from_unit(str unit) cdef int64_t normalize_i8_stamp(int64_t local_val) nogil diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 85da7a60a029a..3b52b4d499694 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,44 +1,68 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int32_t, intp_t, ndarray +from numpy cimport int32_t, int64_t, intp_t, ndarray + cnp.import_array() import pytz # stdlib datetime imports -from cpython.datetime cimport (datetime, time, tzinfo, - PyDateTime_Check, PyDate_Check, - PyDateTime_IMPORT) + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + datetime, + time, + tzinfo, +) + PyDateTime_IMPORT from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, - _string_to_dts, npy_datetime, dt64_to_dtstruct, dtstruct_to_dt64, - get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64, - NPY_DATETIMEUNIT, NPY_FR_ns) -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + NPY_DATETIMEUNIT, + NPY_FR_ns, + _string_to_dts, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + get_datetime64_unit, + get_datetime64_value, + npy_datetime, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, + pydatetime_to_dt64, +) -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_integer_object, is_float_object) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - maybe_get_tz, tz_compare, + get_dst_info, + get_utcoffset, + is_fixed_offset, + is_tzlocal, + is_utc, + maybe_get_tz, + tz_compare, utc_pytz as UTC, ) +from pandas._libs.tslibs.util cimport ( + is_datetime64_object, + is_float_object, + is_integer_object, +) + from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - checknull_with_nat, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, ) - from pandas._libs.tslibs.tzconversion cimport ( tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, @@ -639,11 +663,20 @@ cdef inline check_overflows(_TSObject obj): # GH#12677 if obj.dts.year == 1677: if not (obj.value < 0): - raise OutOfBoundsDatetime + from pandas._libs.tslibs.timestamps import Timestamp + fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " + f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + raise OutOfBoundsDatetime( + f"Converting {fmt} underflows past {Timestamp.min}" + ) elif obj.dts.year == 2262: if not (obj.value > 0): - raise OutOfBoundsDatetime - + from pandas._libs.tslibs.timestamps import Timestamp + fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " + f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + raise OutOfBoundsDatetime( + f"Converting {fmt} overflows past {Timestamp.max}" + ) # ---------------------------------------------------------------------- # Localization @@ -797,7 +830,7 @@ cpdef inline datetime localize_pydatetime(datetime dt, object tz): # ---------------------------------------------------------------------- # Normalization -@cython.cdivision +@cython.cdivision(False) cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: """ Round the localized nanosecond timestamp down to the previous midnight. 
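
The check_overflows() hunk above replaces a bare OutOfBoundsDatetime with a message that names the offending datetime and the bound it crossed. A minimal sketch of the user-visible effect, assuming a pandas build that includes this patch; the trigger here is a wall time in 2262 whose UTC value lands past Timestamp.max (2262-04-11 23:47:16.854775807):

    import pandas as pd
    from pandas.errors import OutOfBoundsDatetime

    try:
        # localizing this wall time to US/Pacific pushes the UTC value
        # past Timestamp.max, which is the case check_overflows() catches
        pd.Timestamp("2262-04-11 23:47:16.854775807", tz="US/Pacific")
    except OutOfBoundsDatetime as exc:
        # message now reads like
        # "Converting 2262-04-11 23:47:16 overflows past 2262-04-11 23:47:16.854775807"
        print(exc)
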
diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 1d1f900bc18b3..16fa05c3801c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -6,26 +6,37 @@ from locale import LC_TIME import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t +from numpy cimport int8_t, int32_t, int64_t, ndarray, uint32_t + cnp.import_array() from pandas._config.localization import set_locale -from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL +from pandas._libs.tslibs.ccalendar import DAYS_FULL, MONTHS_FULL + from pandas._libs.tslibs.ccalendar cimport ( - get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year, get_iso_calendar, iso_calendar_t, - month_offset, + dayofweek, + get_day_of_year, + get_days_in_month, get_firstbday, + get_iso_calendar, get_lastbday, + get_week_of_year, + is_leapyear, + iso_calendar_t, + month_offset, ) -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, - td64_to_tdstruct) from pandas._libs.tslibs.nattype cimport NPY_NAT +from pandas._libs.tslibs.np_datetime cimport ( + dt64_to_dtstruct, + npy_datetimestruct, + pandas_timedeltastruct, + td64_to_tdstruct, +) + from pandas._libs.tslibs.strptime import LocaleTime diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index 3f7240654d7e8..d38f4518f9bf0 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -1,6 +1,7 @@ from cpython.datetime cimport datetime - from numpy cimport int64_t + + cdef int64_t NPY_NAT cdef bint _nat_scalar_rules[6] diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 264013f928d22..561143f48e0ec 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,3 +1,10 @@ +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.object cimport ( Py_EQ, Py_GE, @@ -8,28 +15,19 @@ from cpython.object cimport ( PyObject_RichCompare, ) -from cpython.datetime cimport ( - PyDateTime_Check, - PyDateTime_IMPORT, - PyDelta_Check, - datetime, - timedelta, -) PyDateTime_IMPORT from cpython.version cimport PY_MINOR_VERSION import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() -from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, - get_timedelta64_value, -) cimport pandas._libs.tslibs.util as util - +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # ---------------------------------------------------------------------- # Constants @@ -109,30 +107,25 @@ cdef class _NaT(datetime): __array_priority__ = 100 def __richcmp__(_NaT self, object other, int op): - cdef: - int ndim = getattr(other, "ndim", -1) + if util.is_datetime64_object(other) or PyDateTime_Check(other): + # We treat NaT as datetime-like for this comparison + return _nat_scalar_rules[op] - if ndim == -1: + elif util.is_timedelta64_object(other) or PyDelta_Check(other): + # We treat NaT as timedelta-like for this comparison return _nat_scalar_rules[op] elif util.is_array(other): - result = np.empty(other.shape, dtype=np.bool_) - result.fill(_nat_scalar_rules[op]) - return result - - elif ndim == 0: - if util.is_datetime64_object(other): - return _nat_scalar_rules[op] + if other.dtype.kind in "mM": + result = np.empty(other.shape, dtype=np.bool_) + 
result.fill(_nat_scalar_rules[op]) + elif other.dtype.kind == "O": + result = np.array([PyObject_RichCompare(self, x, op) for x in other]) else: - raise TypeError( - f"Cannot compare type {type(self).__name__} " - f"with type {type(other).__name__}" - ) + return NotImplemented + return result - # Note: instead of passing "other, self, _reverse_ops[op]", we observe - # that `_nat_scalar_rules` is invariant under `_reverse_ops`, - # rendering it unnecessary. - return PyObject_RichCompare(other, self, op) + return NotImplemented def __add__(self, other): if self is not c_NaT: @@ -364,10 +357,12 @@ class NaTType(_NaT): week = property(fget=lambda self: np.nan) dayofyear = property(fget=lambda self: np.nan) + day_of_year = property(fget=lambda self: np.nan) weekofyear = property(fget=lambda self: np.nan) days_in_month = property(fget=lambda self: np.nan) daysinmonth = property(fget=lambda self: np.nan) dayofweek = property(fget=lambda self: np.nan) + day_of_week = property(fget=lambda self: np.nan) # inject Timedelta properties days = property(fget=lambda self: np.nan) @@ -399,9 +394,7 @@ class NaTType(_NaT): Returns ------- - month_name : string - - .. versionadded:: 0.23.0 + str """, ) day_name = _make_nan_func( @@ -416,9 +409,7 @@ class NaTType(_NaT): Returns ------- - day_name : string - - .. versionadded:: 0.23.0 + str """, ) # _nat_methods @@ -427,7 +418,6 @@ class NaTType(_NaT): utctimetuple = _make_error_func("utctimetuple", datetime) timetz = _make_error_func("timetz", datetime) timetuple = _make_error_func("timetuple", datetime) - strftime = _make_error_func("strftime", datetime) isocalendar = _make_error_func("isocalendar", datetime) dst = _make_error_func("dst", datetime) ctime = _make_error_func("ctime", datetime) @@ -444,6 +434,23 @@ class NaTType(_NaT): # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. + strftime = _make_error_func( + "strftime", + """ + Timestamp.strftime(format) + + Return a string representing the given POSIX timestamp + controlled by an explicit format string. + + Parameters + ---------- + format : str + Format string to convert Timestamp to string. + See strftime documentation for more information on the format string: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + """, + ) + strptime = _make_error_func( "strptime", """ @@ -466,7 +473,7 @@ class NaTType(_NaT): """ Timestamp.fromtimestamp(ts) - timestamp[, tz] -> tz's local time from POSIX timestamp. + Transform timestamp[, tz] to tz's local time from POSIX timestamp. """, ) combine = _make_error_func( @@ -474,7 +481,7 @@ class NaTType(_NaT): """ Timestamp.combine(date, time) - date, time -> datetime with same date and time fields. + Combine date, time into datetime with same date and time fields. """, ) utcnow = _make_error_func( @@ -615,7 +622,7 @@ timedelta}, default 'raise' floor = _make_nat_func( "floor", """ - return a new Timestamp floored to this resolution. + Return a new Timestamp floored to this resolution. Parameters ---------- @@ -654,7 +661,7 @@ timedelta}, default 'raise' ceil = _make_nat_func( "ceil", """ - return a new Timestamp ceiled to this resolution. + Return a new Timestamp ceiled to this resolution. Parameters ---------- @@ -770,7 +777,7 @@ default 'raise' replace = _make_nat_func( "replace", """ - implements datetime.replace, handles nanoseconds. + Implements datetime.replace, handles nanoseconds. 
Parameters ---------- @@ -783,7 +790,7 @@ default 'raise' microsecond : int, optional nanosecond : int, optional tzinfo : tz-convertible, optional - fold : int, optional, default is 0 + fold : int, optional Returns ------- diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index eebdcb3ace507..b2524c6bc6c0d 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -1,6 +1,6 @@ from cpython.datetime cimport date, datetime +from numpy cimport int32_t, int64_t -from numpy cimport int64_t, int32_t cdef extern from "numpy/ndarrayobject.h": ctypedef int64_t npy_timedelta diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 31cc55ad981bb..12aaaf4ce3977 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,5 +1,3 @@ -from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE - from cpython.datetime cimport ( PyDateTime_DATE_GET_HOUR, PyDateTime_DATE_GET_MICROSECOND, @@ -10,11 +8,15 @@ from cpython.datetime cimport ( PyDateTime_GET_YEAR, PyDateTime_IMPORT, ) +from cpython.object cimport Py_EQ, Py_GE, Py_GT, Py_LE, Py_LT, Py_NE + PyDateTime_IMPORT from numpy cimport int64_t + from pandas._libs.tslibs.util cimport get_c_string_buf_and_size + cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, npy_datetimestruct *b) diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 9a9244db4a565..215c3f849281f 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -1,5 +1,6 @@ from numpy cimport int64_t + cpdef to_offset(object obj) cdef bint is_offset_object(object obj) cdef bint is_tick_object(object obj) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b0c6648514e99..1339dee954603 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,39 +1,51 @@ -import cython - import operator import re import time from typing import Any import warnings -from cpython.datetime cimport (PyDateTime_IMPORT, - PyDateTime_Check, - PyDate_Check, - PyDelta_Check, - datetime, timedelta, date, - time as dt_time) + +import cython + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + date, + datetime, + time as dt_time, + timedelta, +) + PyDateTime_IMPORT -from dateutil.relativedelta import relativedelta from dateutil.easter import easter - +from dateutil.relativedelta import relativedelta import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util from pandas._libs.tslibs.util cimport ( - is_integer_object, is_datetime64_object, is_float_object, + is_integer_object, ) from pandas._libs.tslibs.ccalendar import ( - MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, + MONTH_ALIASES, + MONTH_TO_CAL_NUM, + int_to_weekday, + weekday_to_int, ) + from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, dayofweek, @@ -47,17 +59,20 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, pydate_to_dtstruct, ) from 
pandas._libs.tslibs.tzconversion cimport tz_convert_from_utc_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds + from .timedeltas import Timedelta + from .timestamps cimport _Timestamp + from .timestamps import Timestamp # --------------------------------------------------------------------- @@ -86,19 +101,38 @@ cdef bint _is_normalized(datetime dt): return True +def apply_wrapper_core(func, self, other) -> ndarray: + result = func(self, other) + result = np.asarray(result) + + if self.normalize: + # TODO: Avoid circular/runtime import + from .vectorized import normalize_i8_timestamps + result = normalize_i8_timestamps(result.view("i8"), None) + + return result + + def apply_index_wraps(func): # Note: normally we would use `@functools.wraps(func)`, but this does # not play nicely with cython class methods - def wrapper(self, other) -> np.ndarray: + def wrapper(self, other): # other is a DatetimeArray + result = apply_wrapper_core(func, self, other) + result = type(other)(result) + warnings.warn("'Offset.apply_index(other)' is deprecated. " + "Use 'offset + other' instead.", FutureWarning) + return result - result = func(self, other) - result = np.asarray(result) + return wrapper - if self.normalize: - # TODO: Avoid circular/runtime import - from .vectorized import normalize_i8_timestamps - result = normalize_i8_timestamps(result.view("i8"), None) + +def apply_array_wraps(func): + # Note: normally we would use `@functools.wraps(func)`, but this does + # not play nicely with cython class methods + def wrapper(self, other) -> np.ndarray: + # other is a DatetimeArray + result = apply_wrapper_core(func, self, other) return result # do @functools.wraps(func) manually since it doesn't work on cdef funcs @@ -185,7 +219,9 @@ cdef _wrap_timedelta_result(result): cdef _get_calendar(weekmask, holidays, calendar): - """Generate busdaycalendar""" + """ + Generate busdaycalendar + """ if isinstance(calendar, np.busdaycalendar): if not holidays: holidays = tuple(calendar.holidays) @@ -515,6 +551,10 @@ cdef class BaseOffset: raises NotImplementedError for offsets without a vectorized implementation. + .. deprecated:: 1.1.0 + + Use ``offset + dtindex`` instead. + Parameters ---------- index : DatetimeIndex @@ -522,12 +562,25 @@ cdef class BaseOffset: Returns ------- DatetimeIndex + + Raises + ------ + NotImplementedError + When the specific offset subclass does not have a vectorized + implementation. """ raise NotImplementedError( f"DateOffset subclass {type(self).__name__} " "does not have a vectorized implementation" ) + @apply_array_wraps + def _apply_array(self, dtarr): + raise NotImplementedError( + f"DateOffset subclass {type(self).__name__} " + "does not have a vectorized implementation" + ) + def rollback(self, dt) -> datetime: """ Roll provided date backward to next offset only if not on offset. 
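
The three wrappers above split the old vectorized path in two: apply_index_wraps now routes through apply_wrapper_core and emits the FutureWarning quoted in the code, while the new apply_array_wraps serves internal callers without a warning. A short sketch of what offset users see after this change, assuming a pandas build that includes this patch (MonthEnd is just a convenient concrete offset):

    import warnings
    import pandas as pd

    idx = pd.date_range("2020-01-15", periods=3, freq="D")
    off = pd.offsets.MonthEnd()

    shifted = off + idx  # preferred spelling, no warning

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        off.apply_index(idx)  # deprecated spelling
    # the wrapper above says: "Use 'offset + other' instead."
    assert any(issubclass(w.category, FutureWarning) for w in caught)
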
@@ -608,14 +661,18 @@ cdef class BaseOffset:
         return nint

     def __setstate__(self, state):
-        """Reconstruct an instance from a pickled state"""
+        """
+        Reconstruct an instance from a pickled state
+        """
         self.n = state.pop("n")
         self.normalize = state.pop("normalize")
         self._cache = state.pop("_cache", {})
         # At this point we expect state to be empty

     def __getstate__(self):
-        """Return a pickleable state"""
+        """
+        Return a pickleable state
+        """
         state = {}
         state["n"] = self.n
         state["normalize"] = self.normalize
@@ -734,6 +791,11 @@ cdef class Tick(SingleConstructorOffset):
     def is_anchored(self) -> bool:
         return False

+    # This is identical to BaseOffset.__hash__, but has to be redefined here
+    # for Python 3, because we've redefined __eq__.
+    def __hash__(self) -> int:
+        return hash(self._params)
+
     # --------------------------------------------------------------------
     # Comparison and Arithmetic Methods

@@ -920,7 +982,9 @@ cdef class RelativeDeltaOffset(BaseOffset):
                 object.__setattr__(self, key, val)

     def __getstate__(self):
-        """Return a pickleable state"""
+        """
+        Return a pickleable state
+        """
         # RelativeDeltaOffset (technically DateOffset) is the only non-cdef
         # class, so the only one with __dict__
         state = self.__dict__.copy()
@@ -929,7 +993,9 @@ cdef class RelativeDeltaOffset(BaseOffset):
         return state

     def __setstate__(self, state):
-        """Reconstruct an instance from a pickled state"""
+        """
+        Reconstruct an instance from a pickled state
+        """

         if "offset" in state:
             # Older (<0.22.0) versions have offset attribute instead of _offset
@@ -938,13 +1004,6 @@ cdef class RelativeDeltaOffset(BaseOffset):
             state["_offset"] = state.pop("offset")
             state["kwds"]["offset"] = state["_offset"]

-        if "_offset" in state and not isinstance(state["_offset"], timedelta):
-            # relativedelta, we need to populate using its kwds
-            offset = state["_offset"]
-            odict = offset.__dict__
-            kwds = {key: odict[key] for key in odict if odict[key]}
-            state.update(kwds)
-
         self.n = state.pop("n")
         self.normalize = state.pop("normalize")
         self._cache = state.pop("_cache", {})
@@ -992,7 +1051,11 @@ cdef class RelativeDeltaOffset(BaseOffset):
         -------
         ndarray[datetime64[ns]]
         """
-        dt64other = np.asarray(dtindex)
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    def _apply_array(self, dtarr):
+        dt64other = np.asarray(dtarr)
         kwds = self.kwds
         relativedelta_fast = {
             "years",
@@ -1154,9 +1217,8 @@ class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta):
     >>> ts + DateOffset(months=2)
     Timestamp('2017-03-01 09:10:11')
     """
-
-    pass
-
+    def __setattr__(self, name, value):
+        raise AttributeError("DateOffset objects are immutable.")

 # --------------------------------------------------------------------

@@ -1321,8 +1383,16 @@ cdef class BusinessDay(BusinessMixin):

     @apply_index_wraps
     def apply_index(self, dtindex):
-        i8other = dtindex.view("i8")
-        return shift_bdays(i8other, self.n)
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    def _apply_array(self, dtarr):
+        i8other = dtarr.view("i8")
+        res = _shift_bdays(i8other, self.n)
+        if self.offset:
+            res = res.view("M8[ns]") + Timedelta(self.offset)
+            res = res.view("i8")
+        return res

     def is_on_offset(self, dt: datetime) -> bool:
         if self.normalize and not _is_normalized(dt):
@@ -1333,6 +1403,19 @@ cdef class BusinessDay(BusinessMixin):
 cdef class BusinessHour(BusinessMixin):
     """
     DateOffset subclass representing possibly n business hours.
+
+    Parameters
+    ----------
+    n : int, default 1
+        The number of hours represented.
+    normalize : bool, default False
+        Normalize start/end dates to midnight before generating date range.
+    weekmask : str, default 'Mon Tue Wed Thu Fri'
+        Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
+    start : str, default "09:00"
+        Start time of your custom business hour in 24h format.
+    end : str, default "17:00"
+        End time of your custom business hour in 24h format.
     """

     _prefix = "BH"
@@ -1804,8 +1887,12 @@ cdef class YearOffset(SingleConstructorOffset):

     @apply_index_wraps
     def apply_index(self, dtindex):
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    def _apply_array(self, dtarr):
         shifted = shift_quarters(
-            dtindex.view("i8"), self.n, self.month, self._day_opt, modby=12
+            dtarr.view("i8"), self.n, self.month, self._day_opt, modby=12
         )
         return shifted
@@ -1957,8 +2044,12 @@ cdef class QuarterOffset(SingleConstructorOffset):

     @apply_index_wraps
     def apply_index(self, dtindex):
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    def _apply_array(self, dtarr):
         shifted = shift_quarters(
-            dtindex.view("i8"), self.n, self.startingMonth, self._day_opt
+            dtarr.view("i8"), self.n, self.startingMonth, self._day_opt
         )
         return shifted
@@ -2072,7 +2163,11 @@ cdef class MonthOffset(SingleConstructorOffset):

     @apply_index_wraps
     def apply_index(self, dtindex):
-        shifted = shift_months(dtindex.view("i8"), self.n, self._day_opt)
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    def _apply_array(self, dtarr):
+        shifted = shift_months(dtarr.view("i8"), self.n, self._day_opt)
         return shifted

     cpdef __setstate__(self, state):
@@ -2209,8 +2304,14 @@ cdef class SemiMonthOffset(SingleConstructorOffset):
     @cython.wraparound(False)
     @cython.boundscheck(False)
     def apply_index(self, dtindex):
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    @cython.wraparound(False)
+    @cython.boundscheck(False)
+    def _apply_array(self, dtarr):
         cdef:
-            int64_t[:] i8other = dtindex.view("i8")
+            int64_t[:] i8other = dtarr.view("i8")
             Py_ssize_t i, count = len(i8other)
             int64_t val
             int64_t[:] out = np.empty(count, dtype="i8")
@@ -2368,12 +2469,16 @@ cdef class Week(SingleConstructorOffset):

     @apply_index_wraps
     def apply_index(self, dtindex):
+        return self._apply_array(dtindex)
+
+    @apply_array_wraps
+    def _apply_array(self, dtarr):
         if self.weekday is None:
             td = timedelta(days=7 * self.n)
             td64 = np.timedelta64(td, "ns")
-            return dtindex + td64
+            return dtarr + td64
         else:
-            i8other = dtindex.view("i8")
+            i8other = dtarr.view("i8")
             return self._end_apply_index(i8other)

     @cython.wraparound(False)
@@ -3146,6 +3251,9 @@ cdef class CustomBusinessDay(BusinessDay):
     def apply_index(self, dtindex):
         raise NotImplementedError

+    def _apply_array(self, dtarr):
+        raise NotImplementedError
+
     def is_on_offset(self, dt: datetime) -> bool:
         if self.normalize and not _is_normalized(dt):
             return False
@@ -3156,6 +3264,19 @@ cdef class CustomBusinessHour(BusinessHour):
     """
     DateOffset subclass representing possibly n custom business hours.
+
+    Parameters
+    ----------
+    n : int, default 1
+        The number of hours represented.
+    normalize : bool, default False
+        Normalize start/end dates to midnight before generating date range.
+    weekmask : str, default 'Mon Tue Wed Thu Fri'
+        Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
+    start : str, default "09:00"
+        Start time of your custom business hour in 24h format.
+    end : str, default "17:00"
+        End time of your custom business hour in 24h format.
""" _prefix = "CBH" @@ -3527,7 +3648,9 @@ def shift_day(other: datetime, days: int) -> datetime: cdef inline int year_add_months(npy_datetimestruct dts, int months) nogil: - """new year number after shifting npy_datetimestruct number of months""" + """ + New year number after shifting npy_datetimestruct number of months. + """ return dts.year + (dts.month + months - 1) // 12 @@ -3625,10 +3748,12 @@ cdef inline void _shift_months(const int64_t[:] dtindex, Py_ssize_t count, int months, str day_opt) nogil: - """See shift_months.__doc__""" + """ + See shift_months.__doc__ + """ cdef: Py_ssize_t i - int months_to_roll, compare_day + int months_to_roll npy_datetimestruct dts for i in range(count): @@ -3638,10 +3763,8 @@ cdef inline void _shift_months(const int64_t[:] dtindex, dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) - months_to_roll = roll_convention(dts.day, months_to_roll, - compare_day) + months_to_roll = _roll_qtrday(&dts, months_to_roll, 0, day_opt) dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) @@ -3659,7 +3782,9 @@ cdef inline void _shift_quarters(const int64_t[:] dtindex, int q1start_month, str day_opt, int modby) nogil: - """See shift_quarters.__doc__""" + """ + See shift_quarters.__doc__ + """ cdef: Py_ssize_t i int months_since, n @@ -3683,7 +3808,7 @@ cdef inline void _shift_quarters(const int64_t[:] dtindex, out[i] = dtstruct_to_dt64(&dts) -cdef ndarray[int64_t] shift_bdays(const int64_t[:] i8other, int periods): +cdef ndarray[int64_t] _shift_bdays(const int64_t[:] i8other, int periods): """ Implementation of BusinessDay.apply_offset. @@ -3915,7 +4040,9 @@ cdef inline int _roll_qtrday(npy_datetimestruct* dts, int n, int months_since, str day_opt) nogil except? 
-1: - """See roll_qtrday.__doc__""" + """ + See roll_qtrday.__doc__ + """ if n > 0: if months_since < 0 or (months_since == 0 and diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c4f369d0d3b3f..aeb1be121bc9e 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -9,39 +9,44 @@ from libc.string cimport strchr import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_Str - from cpython.datetime cimport datetime, datetime_new, import_datetime, tzinfo +from cpython.object cimport PyObject_Str from cpython.version cimport PY_VERSION_HEX + import_datetime() import numpy as np + cimport numpy as cnp -from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, - PyArray_IterNew, flatiter, float64_t) +from numpy cimport ( + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, + flatiter, + float64_t, +) + cnp.import_array() # dateutil compat -from dateutil.tz import (tzoffset, - tzlocal as _dateutil_tzlocal, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) + +from dateutil.parser import DEFAULTPARSER, parse as du_parse from dateutil.relativedelta import relativedelta -from dateutil.parser import DEFAULTPARSER -from dateutil.parser import parse as du_parse +from dateutil.tz import ( + tzlocal as _dateutil_tzlocal, + tzoffset, + tzstr as _dateutil_tzstr, + tzutc as _dateutil_tzutc, +) from pandas._config import get_option from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.nattype cimport ( - c_nat_strings as nat_strings, - c_NaT as NaT, -) -from pandas._libs.tslibs.util cimport ( - is_array, - get_c_string_buf_and_size, -) +from pandas._libs.tslibs.nattype cimport c_NaT as NaT, c_nat_strings as nat_strings from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size, is_array + cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil @@ -376,7 +381,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default, object freq): cdef: object ret - int year, quarter = -1, month, mnum, date_len + # year initialized to prevent compiler warnings + int year = -1, quarter = -1, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 assert isinstance(date_string, str) @@ -765,7 +771,7 @@ class _timelex: _DATEUTIL_LEXER_SPLIT = _timelex.split -def _format_is_iso(f) -> bint: +def format_is_iso(f: str) -> bint: """ Does format match the iso8601 set that can be handled by the C parser? 
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different @@ -783,7 +789,7 @@ def _format_is_iso(f) -> bint: return False -def _guess_datetime_format( +def guess_datetime_format( dt_str, bint dayfirst=False, dt_str_parse=du_parse, diff --git a/pandas/_libs/tslibs/period.pxd b/pandas/_libs/tslibs/period.pxd index 9c0342e239a89..46c6e52cb9156 100644 --- a/pandas/_libs/tslibs/period.pxd +++ b/pandas/_libs/tslibs/period.pxd @@ -2,5 +2,6 @@ from numpy cimport int64_t from .np_datetime cimport npy_datetimestruct + cdef bint is_period_object(object obj) cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 20961c6da56bd..cbd4e2e6704a9 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,96 +1,98 @@ import warnings -from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompareBool +from numpy cimport import_array, int64_t, ndarray -from numpy cimport int64_t, import_array, ndarray import numpy as np + import_array() from libc.stdlib cimport free, malloc +from libc.string cimport memset, strlen from libc.time cimport strftime, tm -from libc.string cimport strlen, memset import cython from cpython.datetime cimport ( - datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, PyDelta_Check, + datetime, ) + # import datetime C API PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, - dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, - check_dts_bounds, NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, ) + cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.timedeltas import Timedelta -from pandas._libs.tslibs.timedeltas cimport ( - delta_to_nanoseconds, - is_any_td_scalar, -) +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.ccalendar cimport ( + c_MONTH_NUMBERS, dayofweek, get_day_of_year, - is_leapyear, - get_week_of_year, get_days_in_month, + get_week_of_year, + is_leapyear, ) -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.conversion import ensure_datetime64ns from pandas._libs.tslibs.dtypes cimport ( - PeriodDtypeBase, - FR_UND, FR_ANN, - FR_QTR, - FR_MTH, - FR_WK, FR_BUS, FR_DAY, FR_HR, FR_MIN, - FR_SEC, FR_MS, - FR_US, + FR_MTH, FR_NS, + FR_QTR, + FR_SEC, + FR_UND, + FR_US, + FR_WK, + PeriodDtypeBase, attrname_to_abbrevs, ) - from pandas._libs.tslibs.parsing cimport get_rule_month + from pandas._libs.tslibs.parsing import parse_time_string + from pandas._libs.tslibs.nattype cimport ( - _nat_scalar_rules, NPY_NAT, - is_null_datetimelike, + _nat_scalar_rules, c_NaT as NaT, c_nat_strings as nat_strings, + is_null_datetimelike, ) from pandas._libs.tslibs.offsets cimport ( BaseOffset, - to_offset, - is_tick_object, is_offset_object, + is_tick_object, + to_offset, ) -from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG cdef: enum: @@ -859,6 +861,7 @@ cdef int64_t get_time_nanos(int freq, int64_t 
unix_date, int64_t ordinal) nogil: """ cdef: int64_t sub, factor + int64_t nanos_in_day = 24 * 3600 * 10**9 freq = get_freq_group(freq) @@ -884,7 +887,7 @@ cdef int64_t get_time_nanos(int freq, int64_t unix_date, int64_t ordinal) nogil: # We must have freq == FR_HR factor = 10**9 * 3600 - sub = ordinal - unix_date * 24 * 3600 * 10**9 / factor + sub = ordinal - unix_date * (nanos_in_day / factor) return sub * factor @@ -1370,7 +1373,7 @@ cdef accessor _get_accessor_func(str field): return pweek elif field == "day_of_year": return pday_of_year - elif field == "weekday": + elif field == "weekday" or field == "day_of_week": return pweekday elif field == "days_in_month": return pdays_in_month @@ -1472,6 +1475,9 @@ cdef class _Period(PeriodMixin): PeriodDtypeBase _dtype BaseOffset freq + dayofweek = _Period.day_of_week + dayofyear = _Period.day_of_year + def __cinit__(self, int64_t ordinal, BaseOffset freq): self.ordinal = ordinal self.freq = freq @@ -1879,7 +1885,7 @@ cdef class _Period(PeriodMixin): return self.weekofyear @property - def dayofweek(self) -> int: + def day_of_week(self) -> int: """ Day of the week the period lies in, with Monday=0 and Sunday=6. @@ -1897,33 +1903,33 @@ cdef class _Period(PeriodMixin): See Also -------- - Period.dayofweek : Day of the week the period lies in. - Period.weekday : Alias of Period.dayofweek. + Period.day_of_week : Day of the week the period lies in. + Period.weekday : Alias of Period.day_of_week. Period.day : Day of the month. Period.dayofyear : Day of the year. Examples -------- >>> per = pd.Period('2017-12-31 22:00', 'H') - >>> per.dayofweek + >>> per.day_of_week 6 For periods that span over multiple days, the day at the beginning of the period is returned. >>> per = pd.Period('2017-12-31 22:00', '4H') - >>> per.dayofweek + >>> per.day_of_week 6 - >>> per.start_time.dayofweek + >>> per.start_time.day_of_week 6 For periods with a frequency higher than days, the last day of the period is returned. >>> per = pd.Period('2018-01', 'M') - >>> per.dayofweek + >>> per.day_of_week 2 - >>> per.end_time.dayofweek + >>> per.end_time.day_of_week 2 """ base = self._dtype._dtype_code @@ -1983,7 +1989,7 @@ cdef class _Period(PeriodMixin): return self.dayofweek @property - def dayofyear(self) -> int: + def day_of_year(self) -> int: """ Return the day of the year. @@ -1999,19 +2005,19 @@ cdef class _Period(PeriodMixin): See Also -------- Period.day : Return the day of the month. - Period.dayofweek : Return the day of week. - PeriodIndex.dayofyear : Return the day of year of all indexes. + Period.day_of_week : Return the day of week. + PeriodIndex.day_of_year : Return the day of year of all indexes. Examples -------- >>> period = pd.Period("2015-10-23", freq='H') - >>> period.dayofyear + >>> period.day_of_year 296 >>> period = pd.Period("2012-12-31", freq='D') - >>> period.dayofyear + >>> period.day_of_year 366 >>> period = pd.Period("2013-01-01", freq='D') - >>> period.dayofyear + >>> period.day_of_year 1 """ base = self._dtype._dtype_code @@ -2310,7 +2316,7 @@ class Period(_Period): freq : str, default None One of pandas period strings or corresponding objects. ordinal : int, default None - The period offset from the gregorian proleptic epoch. + The period offset from the proleptic Gregorian epoch. year : int, default None Year value of the period. 
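A note on the `day_of_week` / `day_of_year` renames running through this file: the old spellings survive as class-level aliases (`dayofweek = _Period.day_of_week`, added near the top of the cdef class above), so existing code keeps working while the docs steer users to the new names. The same backward-compatible pattern in plain Python (a toy stand-in, not the cdef class itself):

```python
import datetime as dt

class PeriodLike:
    """Toy stand-in showing the alias pattern used by _Period."""

    def __init__(self, when: dt.date):
        self._when = when

    @property
    def day_of_week(self) -> int:
        """Preferred name: Monday=0 ... Sunday=6."""
        return self._when.weekday()

    # Old spelling kept as an alias so existing callers keep working.
    dayofweek = day_of_week

p = PeriodLike(dt.date(2017, 12, 31))   # a Sunday
assert p.day_of_week == p.dayofweek == 6
```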
month : int, default 1 @@ -2339,6 +2345,7 @@ class Period(_Period): if freq is not None: freq = cls._maybe_convert_freq(freq) + nanosecond = 0 if ordinal is not None and value is not None: raise ValueError("Only value or ordinal but not both should be " @@ -2388,6 +2395,14 @@ class Period(_Period): value = str(value) value = value.upper() dt, reso = parse_time_string(value, freq) + try: + ts = Timestamp(value) + except ValueError: + nanosecond = 0 + else: + nanosecond = ts.nanosecond + if nanosecond != 0: + reso = 'nanosecond' if dt is NaT: ordinal = NPY_NAT @@ -2419,7 +2434,7 @@ class Period(_Period): base = freq_to_dtype_code(freq) ordinal = period_ordinal(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, - dt.microsecond, 0, base) + dt.microsecond, 1000*nanosecond, base) return cls._from_ordinal(ordinal, freq) @@ -2432,7 +2447,7 @@ cpdef int freq_to_dtype_code(BaseOffset freq) except? -1: try: return freq._period_dtype_code except AttributeError as err: - raise ValueError(INVALID_FREQ_ERR_MSG) from err + raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) from err cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day, diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime.c b/pandas/_libs/tslibs/src/datetime/np_datetime.c index f647098140528..8eb995dee645b 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime.c @@ -312,7 +312,7 @@ int cmp_npy_datetimestruct(const npy_datetimestruct *a, * object into a NumPy npy_datetimestruct. Uses tzinfo (if present) * to convert to UTC time. * - * The following implementation just asks for attributes, and thus + * The following implementation just asks for attributes, and thus * supports datetime duck typing. The tzinfo time zone conversion * requires this style of access as well. * diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 660b582f73e6e..bc4632ad028ab 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,27 +1,30 @@ """Strptime-related classes and functions. """ -import time -import locale import calendar +import locale import re +import time from cpython.datetime cimport date, tzinfo from _thread import allocate_lock as _thread_allocate_lock +import numpy as np import pytz -import numpy as np -from numpy cimport int64_t - -from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct) +from numpy cimport int64_t, ndarray from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_nat_strings as nat_strings, + checknull_with_nat, ) +from pandas._libs.tslibs.np_datetime cimport ( + check_dts_bounds, + dtstruct_to_dt64, + npy_datetimestruct, +) + cdef dict _parse_code_table = {'y': 0, 'Y': 1, @@ -48,7 +51,7 @@ cdef dict _parse_code_table = {'y': 0, 'u': 22} -def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise'): +def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): """ Calculates the datetime structs represented by the passed array of strings diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 4142861e9ad38..fed1f2d326819 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -1,6 +1,7 @@ from cpython.datetime cimport timedelta from numpy cimport int64_t + # Exposed for tslib, not intended for outside use. cpdef int64_t delta_to_nanoseconds(delta) except? 
-1 cdef convert_to_timedelta64(object ts, str unit) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8f3a599bf107c..e4b19d844dcab 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,40 +1,49 @@ import collections +import warnings import cython -from cpython.object cimport Py_NE, Py_EQ, PyObject_RichCompare +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() -from cpython.datetime cimport (timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + timedelta, +) + PyDateTime_IMPORT cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - is_timedelta64_object, is_datetime64_object, is_integer_object, - is_float_object, is_array -) - from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.conversion cimport cast_from_unit - -from pandas._libs.tslibs.np_datetime cimport ( - cmp_scalar, td64_to_tdstruct, pandas_timedeltastruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, +) +from pandas._libs.tslibs.np_datetime cimport ( + cmp_scalar, + pandas_timedeltastruct, + td64_to_tdstruct, ) from pandas._libs.tslibs.offsets cimport is_tick_object +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) # ---------------------------------------------------------------------- # Constants @@ -218,7 +227,7 @@ cdef convert_to_timedelta64(object ts, str unit): @cython.boundscheck(False) @cython.wraparound(False) -def array_to_timedelta64(object[:] values, str unit=None, str errors="raise"): +def array_to_timedelta64(ndarray[object] values, str unit=None, str errors="raise"): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. Otherwise, raise. @@ -396,9 +405,11 @@ cdef inline int64_t parse_timedelta_string(str ts) except? -1: m = 10**(3 -len(frac)) * 1000 * 1000 elif len(frac) > 3 and len(frac) <= 6: m = 10**(6 -len(frac)) * 1000 - else: + elif len(frac) > 6 and len(frac) <= 9: m = 10**(9 -len(frac)) - + else: + m = 1 + frac = frac[:9] r = int(''.join(frac)) * m result += timedelta_as_neg(r, neg) @@ -458,6 +469,15 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): try: unit = ''.join(unit) + + if unit in ["M", "Y", "y"]: + warnings.warn( + "Units 'M', 'Y' and 'y' do not represent unambiguous " + "timedelta values and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) + if unit == 'M': # To parse ISO 8601 string, 'M' should be treated as minute, # not month @@ -596,7 +616,7 @@ cdef inline int64_t parse_iso_format_string(str ts) except? -1: for c in ts: # number (ascii codes) - if ord(c) >= 48 and ord(c) <= 57: + if 48 <= ord(c) <= 57: have_value = 1 if have_dot: @@ -612,27 +632,30 @@ cdef inline int64_t parse_iso_format_string(str ts) except? 
-1: if not len(unit): number.append(c) else: - # if in days, pop trailing T - if unit[-1] == 'T': - unit.pop() - elif 'H' in unit or 'M' in unit: - if len(number) > 2: - raise ValueError(err_msg) r = timedelta_from_spec(number, '0', unit) result += timedelta_as_neg(r, neg) neg = 0 unit, number = [], [c] else: - if c == 'P': - pass # ignore leading character + if c == 'P' or c == 'T': + pass # ignore marking characters P and T elif c == '-': if neg or have_value: raise ValueError(err_msg) else: neg = 1 - elif c in ['D', 'T', 'H', 'M']: + elif c in ['W', 'D', 'H', 'M']: + if c in ['H', 'M'] and len(number) > 2: + raise ValueError(err_msg) + if c == 'M': + c = 'min' unit.append(c) + r = timedelta_from_spec(number, '0', unit) + result += timedelta_as_neg(r, neg) + + neg = 0 + unit, number = [], [] elif c == '.': # append any seconds if len(number): @@ -653,11 +676,8 @@ cdef inline int64_t parse_iso_format_string(str ts) except? -1: r = timedelta_from_spec(number, '0', dec_unit) result += timedelta_as_neg(r, neg) else: # seconds - if len(number) <= 2: - r = timedelta_from_spec(number, '0', 'S') - result += timedelta_as_neg(r, neg) - else: - raise ValueError(err_msg) + r = timedelta_from_spec(number, '0', 'S') + result += timedelta_as_neg(r, neg) else: raise ValueError(err_msg) @@ -1050,7 +1070,8 @@ cdef class _Timedelta(timedelta): See Also -------- - Timestamp.isoformat + Timestamp.isoformat : Function is used to convert the given + Timestamp object into the ISO format. Notes ----- @@ -1124,6 +1145,9 @@ class Timedelta(_Timedelta): Notes ----- The ``.value`` attribute is always in ns. + + If the precision is higher than nanoseconds, the precision of the duration is + truncated to nanoseconds. """ def __new__(cls, object value=_no_input, unit=None, **kwargs): diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 307b6dfc90715..45aae3581fe79 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -1,5 +1,4 @@ from cpython.datetime cimport datetime, tzinfo - from numpy cimport int64_t from pandas._libs.tslibs.base cimport ABCTimestamp @@ -19,8 +18,8 @@ cdef class _Timestamp(ABCTimestamp): cdef bint _get_start_end_field(self, str field) cdef _get_date_name_field(self, str field, object locale) cdef int64_t _maybe_convert_value_to_local(self) + cdef bint _can_compare(self, datetime other) cpdef to_datetime64(self) - cdef _assert_tzawareness_compat(_Timestamp self, datetime other) cpdef datetime to_pydatetime(_Timestamp self, bint warn=*) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1 diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8cef685933863..242eb89d1e723 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -9,54 +9,66 @@ shadows the python class, where we do any heavy lifting. 
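The reworked `parse_iso_format_string` above flushes each accumulated number as soon as its designator is seen ('W', 'D', 'H', 'M'), treats 'P' and 'T' purely as markers, maps 'M' to minutes (since in this parser it only appears in the time part), and, per the related `parse_timedelta_string` change, truncates fractional seconds beyond nanosecond precision. A simplified pure-Python version of that flush-on-designator loop (illustrative only; the real parser also tracks sign, dots, digit limits, and error states):

```python
# Seconds per ISO-8601 designator; 'M' is treated as minutes here, matching
# the parser's convention for strings like 'P1DT2H3M4S'.
_SECONDS = {"W": 7 * 86400, "D": 86400, "H": 3600, "M": 60, "S": 1}

def parse_iso_duration_sketch(ts: str) -> float:
    """Very small subset of ISO-8601 durations, e.g. 'P1DT2H3M4S'."""
    total = 0.0
    number = ""
    for c in ts:
        if c.isdigit() or c == ".":
            number += c
        elif c in ("P", "T"):
            continue  # pure marker characters, carry no value
        elif c in _SECONDS:
            # flush the accumulated number as soon as the unit is known
            total += float(number) * _SECONDS[c]
            number = ""
        else:
            raise ValueError(f"invalid character in duration: {ts!r}")
    return total

assert parse_iso_duration_sketch("P1DT2H3M4S") == 86400 + 7200 + 180 + 4
```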
import warnings import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int8_t, uint8_t, ndarray -cnp.import_array() +from numpy cimport int8_t, int64_t, ndarray, uint8_t -from cpython.object cimport (PyObject_RichCompareBool, PyObject_RichCompare, - Py_EQ, Py_NE) +cnp.import_array() -from cpython.datetime cimport ( - datetime, - time, - tzinfo, - tzinfo as tzinfo_type, # alias bc `tzinfo` is a kwarg below +from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below PyDateTime_Check, + PyDateTime_IMPORT, PyDelta_Check, PyTZInfo_Check, - PyDateTime_IMPORT, -) -PyDateTime_IMPORT - -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, - is_timedelta64_object, is_array, + datetime, + time, + tzinfo as tzinfo_type, ) +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare, PyObject_RichCompareBool -from pandas._libs.tslibs.base cimport ABCTimestamp +PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar - +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, - convert_to_tsobject, convert_datetime_to_tsobject, + convert_to_tsobject, normalize_i8_stamp, ) -from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) + +from pandas._libs.tslibs.fields import get_date_name_field, get_start_end_field + from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct, + check_dts_bounds, cmp_scalar, + dt64_to_dtstruct, + npy_datetimestruct, pydatetime_to_dt64, ) + from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas._libs.tslibs.offsets cimport to_offset, is_offset_object -from pandas._libs.tslibs.timedeltas cimport is_any_td_scalar, delta_to_nanoseconds + +from pandas._libs.tslibs.offsets cimport is_offset_object, to_offset +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.timezones cimport ( - is_utc, maybe_get_tz, treat_tz_as_pytz, utc_pytz as UTC, - get_timezone, tz_compare, + get_timezone, + is_utc, + maybe_get_tz, + treat_tz_as_pytz, + tz_compare, + utc_pytz as UTC, ) from pandas._libs.tslibs.tzconversion cimport ( tz_convert_from_utc_single, @@ -218,6 +230,8 @@ cdef class _Timestamp(ABCTimestamp): # higher than np.ndarray and np.matrix __array_priority__ = 100 + dayofweek = _Timestamp.day_of_week + dayofyear = _Timestamp.day_of_year def __hash__(_Timestamp self): if self.nanosecond: @@ -248,6 +262,10 @@ cdef class _Timestamp(ABCTimestamp): if other.dtype.kind == "M": if self.tz is None: return PyObject_RichCompare(self.asm8, other, op) + elif op == Py_NE: + return np.ones(other.shape, dtype=np.bool_) + elif op == Py_EQ: + return np.zeros(other.shape, dtype=np.bool_) raise TypeError( "Cannot compare tz-naive and tz-aware timestamps" ) @@ -266,7 +284,12 @@ cdef class _Timestamp(ABCTimestamp): else: return NotImplemented - self._assert_tzawareness_compat(ots) + if not self._can_compare(ots): + if op == Py_NE or op == Py_EQ: + return NotImplemented + raise TypeError( + "Cannot compare tz-naive and tz-aware timestamps" + ) return cmp_scalar(self.value, ots.value, op) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, @@ -274,16 +297,15 @@ cdef 
class _Timestamp(ABCTimestamp): cdef: datetime dtval = self.to_pydatetime() - self._assert_tzawareness_compat(other) + if not self._can_compare(other): + return NotImplemented + return PyObject_RichCompareBool(dtval, other, op) - cdef _assert_tzawareness_compat(_Timestamp self, datetime other): - if self.tzinfo is None: - if other.tzinfo is not None: - raise TypeError('Cannot compare tz-naive and tz-aware ' - 'timestamps') - elif other.tzinfo is None: - raise TypeError('Cannot compare tz-naive and tz-aware timestamps') + cdef bint _can_compare(self, datetime other): + if self.tzinfo is not None: + return other.tzinfo is not None + return other.tzinfo is None def __add__(self, other): cdef: @@ -491,9 +513,7 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- - day_name : string - - .. versionadded:: 0.23.0 + str """ return self._get_date_name_field("day_name", locale) @@ -508,9 +528,7 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- - month_name : string - - .. versionadded:: 0.23.0 + str """ return self._get_date_name_field("month_name", locale) @@ -522,14 +540,14 @@ cdef class _Timestamp(ABCTimestamp): return bool(ccalendar.is_leapyear(self.year)) @property - def dayofweek(self) -> int: + def day_of_week(self) -> int: """ Return day of the week. """ return self.weekday() @property - def dayofyear(self) -> int: + def day_of_year(self) -> int: """ Return the day of the year. """ @@ -771,7 +789,6 @@ class Timestamp(_Timestamp): year, month, day : int hour, minute, second, microsecond : int, optional, default 0 nanosecond : int, optional, default 0 - .. versionadded:: 0.23.0 tzinfo : datetime.tzinfo, optional, default None fold : {0, 1}, default None, keyword-only Due to daylight saving time, one wall clock time can occur twice @@ -895,10 +912,26 @@ class Timestamp(_Timestamp): """ Timestamp.fromtimestamp(ts) - timestamp[, tz] -> tz's local time from POSIX timestamp. + Transform timestamp[, tz] to tz's local time from POSIX timestamp. """ return cls(datetime.fromtimestamp(ts)) + def strftime(self, format): + """ + Timestamp.strftime(format) + + Return a string representing the given POSIX timestamp + controlled by an explicit format string. + + Parameters + ---------- + format : str + Format string to convert Timestamp to string. + See strftime documentation for more information on the format string: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. + """ + return datetime.strftime(self, format) + # Issue 25016. @classmethod def strptime(cls, date_string, format): @@ -917,7 +950,7 @@ class Timestamp(_Timestamp): """ Timestamp.combine(date, time) - date, time -> datetime with same date and time fields. + Combine date, time into datetime with same date and time fields. """ return cls(datetime.combine(date, time)) @@ -1136,7 +1169,7 @@ timedelta}, default 'raise' def floor(self, freq, ambiguous='raise', nonexistent='raise'): """ - return a new Timestamp floored to this resolution. + Return a new Timestamp floored to this resolution. Parameters ---------- @@ -1175,7 +1208,7 @@ timedelta}, default 'raise' def ceil(self, freq, ambiguous='raise', nonexistent='raise'): """ - return a new Timestamp ceiled to this resolution. + Return a new Timestamp ceiled to this resolution. Parameters ---------- @@ -1357,10 +1390,10 @@ default 'raise' microsecond=None, nanosecond=None, tzinfo=object, - fold=0, + fold=None, ): """ - implements datetime.replace, handles nanoseconds. + Implements datetime.replace, handles nanoseconds. 
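The `_can_compare` change above alters the comparison contract: when tz-awareness is mismatched, `==` and `!=` now return `NotImplemented` (so Python falls back to its default and the pair simply compares unequal), while ordering comparisons still raise `TypeError`. The mechanism in plain Python, as a toy model of the dispatch rather than the Timestamp code itself:

```python
class Stamp:
    """Toy model: comparable only when both sides share tz-awareness."""

    def __init__(self, value: int, aware: bool):
        self.value, self.aware = value, aware

    def _can_compare(self, other: "Stamp") -> bool:
        return self.aware == other.aware

    def __eq__(self, other):
        if not isinstance(other, Stamp) or not self._can_compare(other):
            return NotImplemented  # Python then falls back to identity
        return self.value == other.value

    def __lt__(self, other):
        if not self._can_compare(other):
            raise TypeError("Cannot compare tz-naive and tz-aware timestamps")
        return self.value < other.value

naive, aware = Stamp(1, aware=False), Stamp(1, aware=True)
assert naive != aware            # == falls back, so the pair is just unequal
try:
    naive < aware
except TypeError:
    pass                         # ordering comparisons still raise
```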
Parameters ---------- @@ -1373,7 +1406,7 @@ default 'raise' microsecond : int, optional nanosecond : int, optional tzinfo : tz-convertible, optional - fold : int, optional, default is 0 + fold : int, optional Returns ------- @@ -1390,6 +1423,11 @@ default 'raise' # set to naive if needed tzobj = self.tzinfo value = self.value + + # GH 37610. Preserve fold when replacing. + if fold is None: + fold = self.fold + if tzobj is not None: value = tz_convert_from_utc_single(value, tzobj) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 136710003d32a..753c881ed505c 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,5 +1,6 @@ from cpython.datetime cimport datetime, timedelta, tzinfo + cdef tzinfo utc_pytz cpdef bint is_utc(tzinfo tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index a8c785704d8e8..3deabc57ec522 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,27 +1,31 @@ -from datetime import timezone +from datetime import timedelta, timezone + from cpython.datetime cimport datetime, timedelta, tzinfo # dateutil compat + from dateutil.tz import ( gettz as dateutil_gettz, tzfile as _dateutil_tzfile, tzlocal as _dateutil_tzlocal, tzutc as _dateutil_tzutc, ) - - -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo import pytz +from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + UTC = pytz.utc import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport is_integer_object, get_nat +from pandas._libs.tslibs.util cimport get_nat, is_integer_object + cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc @@ -98,6 +102,14 @@ cpdef inline tzinfo maybe_get_tz(object tz): # On Python 3 on Windows, the filename is not always set correctly. if isinstance(tz, _dateutil_tzfile) and '.tar.gz' in tz._filename: tz._filename = zone + elif tz[0] in {'-', '+'}: + hours = int(tz[0:3]) + minutes = int(tz[0] + tz[4:6]) + tz = timezone(timedelta(hours=hours, minutes=minutes)) + elif tz[0:4] in {'UTC-', 'UTC+'}: + hours = int(tz[3:6]) + minutes = int(tz[3] + tz[7:9]) + tz = timezone(timedelta(hours=hours, minutes=minutes)) else: tz = pytz.timezone(tz) elif is_integer_object(tz): diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 1990afd77a8fb..3666d00707ac8 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -2,7 +2,9 @@ from cpython.datetime cimport tzinfo from numpy cimport int64_t -cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz, bint* fold=*) +cdef int64_t tz_convert_utc_to_tzlocal( + int64_t utc_val, tzinfo tz, bint* fold=* +) except? 
-1 cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz) cdef int64_t tz_localize_to_utc_single( int64_t val, tzinfo tz, object ambiguous=*, object nonexistent=* diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 606639af16a18..1049682af08e8 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -5,21 +5,27 @@ import cython from cython import Py_ssize_t from cpython.datetime cimport ( - PyDateTime_IMPORT, PyDelta_Check, datetime, timedelta, tzinfo) + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, + tzinfo, +) + PyDateTime_IMPORT -import pytz from dateutil.tz import tzutc - import numpy as np +import pytz + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, intp_t +from numpy cimport int64_t, intp_t, ndarray, uint8_t + cnp.import_array() from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) +from pandas._libs.tslibs.np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from pandas._libs.tslibs.timezones cimport ( get_dst_info, get_utcoffset, @@ -349,7 +355,9 @@ cdef inline str _render_tstamp(int64_t val): # ---------------------------------------------------------------------- # Timezone Conversion -cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz, bint* fold=NULL): +cdef int64_t tz_convert_utc_to_tzlocal( + int64_t utc_val, tzinfo tz, bint* fold=NULL +) except? -1: """ Parameters ---------- @@ -404,7 +412,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): return val + deltas[pos] -def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): +def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the values (in i8) from UTC to tz @@ -418,7 +426,7 @@ def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): int64 ndarray of converted """ cdef: - int64_t[:] converted + const int64_t[:] converted if len(vals) == 0: return np.array([], dtype=np.int64) @@ -429,7 +437,7 @@ def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): +cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the given values (in i8) either to UTC or from UTC. @@ -451,7 +459,7 @@ cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): str typ if is_utc(tz): - converted = vals + return vals elif is_tzlocal(tz): converted = np.empty(n, dtype=np.int64) for i in range(n): @@ -494,9 +502,11 @@ cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): return converted +# OSError may be thrown by tzlocal on windows at or close to 1970-01-01 +# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241 cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz, bint to_utc, - bint *fold=NULL): + bint *fold=NULL) except? -1: """ Calculate offset in nanoseconds needed to convert the i8 representation of a datetime from a tzlocal timezone to UTC, or vice-versa. 
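Earlier in this stretch, `maybe_get_tz` gains support for fixed-offset strings such as `'+01:00'` and `'UTC-05:00'`, built on `datetime.timezone` instead of a pytz zone lookup. The sign-propagation trick from the diff, restated in pure Python (the slicing assumes the exact `±HH:MM` layout, just as the original does):

```python
from datetime import timedelta, timezone

def offset_string_to_tz(tz: str) -> timezone:
    """Parse '+01:00', '-05:30', 'UTC+01:00', 'UTC-05:30' into a timezone."""
    if tz.startswith(("UTC-", "UTC+")):
        tz = tz[3:]                   # strip the 'UTC' prefix, keep the sign
    hours = int(tz[0:3])              # e.g. '+01' -> 1, '-05' -> -5
    minutes = int(tz[0] + tz[4:6])    # prepend the sign so minutes match it
    return timezone(timedelta(hours=hours, minutes=minutes))

expected = -timedelta(hours=5, minutes=30)
assert offset_string_to_tz("UTC-05:30").utcoffset(None) == expected
assert offset_string_to_tz("+01:00").utcoffset(None) == timedelta(hours=1)
```

Prepending the sign character to the minutes field is what keeps `-05:30` from decoding as minus five hours plus thirty minutes.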
@@ -541,8 +551,10 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz, return int(td.total_seconds() * 1_000_000_000) +# OSError may be thrown by tzlocal on windows at or close to 1970-01-01 +# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241 cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, - bint* fold=NULL): + bint* fold=NULL) except? -1: """ Convert the i8 representation of a datetime from a tzlocal timezone to UTC, or vice-versa. diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index e280609bb17a7..16d801f69df05 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -1,6 +1,7 @@ from cpython.object cimport PyTypeObject + cdef extern from *: """ PyObject* char_to_string(const char* data) { @@ -26,7 +27,8 @@ cdef extern from "Python.h": const char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL -from numpy cimport int64_t, float64_t +from numpy cimport float64_t, int64_t + cdef extern from "numpy/arrayobject.h": PyTypeObject PyFloatingArrType_Type @@ -121,6 +123,10 @@ cdef inline bint is_bool_object(object obj) nogil: PyObject_TypeCheck(obj, &PyBoolArrType_Type)) +cdef inline bint is_real_number_object(object obj) nogil: + return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj) + + cdef inline bint is_timedelta64_object(object obj) nogil: """ Cython equivalent of `isinstance(val, np.timedelta64)` diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c8f8daf6724c2..c3c78ca54885a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,18 +1,21 @@ import cython -from cpython.datetime cimport datetime, date, time, tzinfo +from cpython.datetime cimport date, datetime, time, tzinfo import numpy as np + from numpy cimport int64_t, intp_t, ndarray from .conversion cimport normalize_i8_stamp + from .dtypes import Resolution + from .nattype cimport NPY_NAT, c_NaT as NaT -from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from .offsets cimport to_offset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from .timezones cimport is_utc, is_tzlocal, get_dst_info +from .timezones cimport get_dst_info, is_tzlocal, is_utc from .tzconversion cimport tz_convert_utc_to_tzlocal # ------------------------------------------------------------------------- @@ -208,49 +211,40 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None): int reso = RESO_DAY, curr_reso ndarray[int64_t] trans int64_t[:] deltas - Py_ssize_t[:] pos - int64_t local_val, delta + intp_t[:] pos + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False if is_utc(tz) or tz is None: - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True 
delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + delta, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + + for i in range(n): + if stamps[i] == NPY_NAT: + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + dt64_to_dtstruct(local_val, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso return Resolution(reso) @@ -281,44 +275,38 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t int64_t[:] deltas str typ Py_ssize_t[:] pos - int64_t delta, local_val - - if tz is None or is_utc(tz): - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] - result[i] = normalize_i8_stamp(local_val) + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False + + if is_utc(tz) or tz is None: + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - result[i] = normalize_i8_stamp(local_val) + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + delta - result[i] = normalize_i8_stamp(local_val) else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + deltas[pos[i]] - result[i] = normalize_i8_stamp(local_val) + + for i in range(n): + # TODO: reinstate nogil for use_utc case? 
+ if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + result[i] = normalize_i8_stamp(local_val) return result.base # `.base` to access underlying ndarray @@ -345,40 +333,36 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): ndarray[int64_t] trans int64_t[:] deltas intp_t[:] pos - int64_t local_val, delta + int64_t local_val, delta = NPY_NAT str typ int64_t day_nanos = 24 * 3600 * 1_000_000_000 + bint use_utc = False, use_tzlocal = False, use_fixed = False - if tz is None or is_utc(tz): - for i in range(n): - local_val = stamps[i] - if local_val % day_nanos != 0: - return False - + if is_utc(tz) or tz is None: + use_utc = True elif is_tzlocal(tz): - for i in range(n): - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - if local_val % day_nanos != 0: - return False + use_tzlocal = True else: trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + delta - if local_val % day_nanos != 0: - return False + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta else: - pos = trans.searchsorted(stamps) - 1 - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + deltas[pos[i]] - if local_val % day_nanos != 0: - return False + local_val = stamps[i] + deltas[pos[i]] + + if local_val % day_nanos != 0: + return False return True @@ -396,45 +380,38 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): int64_t[:] deltas Py_ssize_t[:] pos npy_datetimestruct dts - int64_t local_val + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False if is_utc(tz) or tz is None: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = get_period_ordinal(&dts, freq) - + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - result[i] = get_period_ordinal(&dts, freq) + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[0], &dts) - result[i] = get_period_ordinal(&dts, freq) + use_fixed = True + delta = deltas[0] else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = get_period_ordinal(&dts, freq) + for i in range(n): + # TODO: reinstate nogil for use_utc case? 
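These rewritten loops (here and in `get_resolution` above) all apply the same refactor: the timezone handling is classified once, before the loop, into `use_utc` / `use_tzlocal` / `use_fixed` flags, and the per-element body reduces to picking a `local_val`. The shape of that hoisting in miniature (a toy sketch; the tzlocal and DST-transition paths are omitted):

```python
import numpy as np

def localize_sketch(stamps: np.ndarray, mode: str, delta: int = 0) -> np.ndarray:
    """Classify the tz once, then run one uniform per-element loop."""
    # hoisted classification: mirrors use_utc / use_fixed in the diff
    use_utc = mode == "utc"
    use_fixed = mode == "fixed"

    out = np.empty_like(stamps)
    for i in range(len(stamps)):
        if use_utc:
            local_val = stamps[i]
        elif use_fixed:
            local_val = stamps[i] + delta
        else:
            raise NotImplementedError("tzlocal/DST paths omitted in sketch")
        out[i] = local_val
    return out

assert (localize_sketch(np.array([0, 10]), "fixed", delta=5) == [5, 15]).all()
```

Collapsing four near-identical loops into one also makes the NaT check and the result assignment exist in exactly one place, at the cost of a few cheap flag tests per element.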
+ if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + dt64_to_dtstruct(local_val, &dts) + result[i] = get_period_ordinal(&dts, freq) return result.base # .base to get underlying ndarray diff --git a/pandas/_libs/util.pxd b/pandas/_libs/util.pxd index 828bccf7d5641..bd1e21b0d8665 100644 --- a/pandas/_libs/util.pxd +++ b/pandas/_libs/util.pxd @@ -1,8 +1,9 @@ -from pandas._libs.tslibs.util cimport * - cimport numpy as cnp from numpy cimport ndarray +from pandas._libs.tslibs.util cimport * + + cdef extern from "numpy/ndarraytypes.h": void PyArray_CLEARFLAGS(ndarray arr, int flags) nogil @@ -48,4 +49,3 @@ cdef inline void set_array_not_contiguous(ndarray ao) nogil: # ao->flags &= ~(NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS); PyArray_CLEARFLAGS(ao, (NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS)) - diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 362d0e6263697..54a09a6d2ede7 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1,14 +1,15 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import cython -from cython import Py_ssize_t -from libcpp.deque cimport deque -from libc.stdlib cimport malloc, free +from libc.math cimport round +from libcpp.deque cimport deque import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, float64_t, float32_t, uint8_t +from numpy cimport float32_t, float64_t, int64_t, ndarray + cnp.import_array() @@ -22,6 +23,7 @@ from pandas._libs.algos import is_monotonic from pandas._libs.util cimport numeric + cdef extern from "../src/skiplist.h": ctypedef struct node_t: node_t **next @@ -56,7 +58,7 @@ cdef: cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -cdef bint is_monotonic_start_end_bounds( +cdef bint is_monotonic_increasing_start_end_bounds( ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end ): return is_monotonic(start, False)[0] and is_monotonic(end, False)[0] @@ -86,62 +88,6 @@ cdef bint is_monotonic_start_end_bounds( # Physical description: 366 p. 
# Series: Prentice-Hall Series in Automatic Computation -# ---------------------------------------------------------------------- -# Rolling count -# this is only an impl for index not None, IOW, freq aware - - -def roll_count( - ndarray[float64_t] values, - ndarray[int64_t] start, - ndarray[int64_t] end, - int64_t minp, -): - cdef: - float64_t val, count_x = 0.0 - int64_t s, e, nobs, N = len(values) - Py_ssize_t i, j - ndarray[float64_t] output - - output = np.empty(N, dtype=float) - - with nogil: - - for i in range(0, N): - s = start[i] - e = end[i] - - if i == 0: - - # setup - count_x = 0.0 - for j in range(s, e): - val = values[j] - if notnan(val): - count_x += 1.0 - - else: - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - if notnan(val): - count_x -= 1.0 - - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - if notnan(val): - count_x += 1.0 - - if count_x >= minp: - output[i] = count_x - else: - output[i] = NaN - - return output - - # ---------------------------------------------------------------------- # Rolling sum @@ -158,33 +104,50 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi return result -cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: - """ add a value from the sum calc """ +cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, + float64_t *compensation) nogil: + """ add a value from the sum calc using Kahan summation """ + + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - sum_x[0] = sum_x[0] + val + y = val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t -cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: - """ remove a value from the sum calc """ +cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, + float64_t *compensation) nogil: + """ remove a value from the sum calc using Kahan summation """ + + cdef: + float64_t y, t + # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - sum_x[0] = sum_x[0] - val + y = - val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t -def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): +def roll_sum(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t sum_x = 0 + float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0 int64_t s, e int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -193,62 +156,33 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: # setup for j in range(s, e): - add_sum(values[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x, &compensation_add) else: # calculate deletes for j in range(start[i - 1], s): - remove_sum(values[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x, &compensation_remove) # calculate adds for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x, &compensation_add) output[i] = calc_sum(minp, nobs, sum_x) - if not 
is_monotonic_bounds: - for j in range(s, e): - remove_sum(values[j], &nobs, &sum_x) + if not is_monotonic_increasing_bounds: + nobs = 0 + sum_x = 0.0 + compensation_remove = 0.0 return output -def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win): - cdef: - float64_t val, prev_x, sum_x = 0 - int64_t range_endpoint - int64_t nobs = 0, i, N = len(values) - ndarray[float64_t] output - - output = np.empty(N, dtype=float) - - range_endpoint = int_max(minp, 1) - 1 - - with nogil: - - for i in range(0, range_endpoint): - add_sum(values[i], &nobs, &sum_x) - output[i] = NaN - - for i in range(range_endpoint, N): - val = values[i] - add_sum(val, &nobs, &sum_x) - - if i > win - 1: - prev_x = values[i - win] - remove_sum(prev_x, &nobs, &sum_x) - - output[i] = calc_sum(minp, nobs, sum_x) - - return output - # ---------------------------------------------------------------------- # Rolling mean @@ -274,66 +208,50 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct) nogil: - """ add a value from the mean calc """ + Py_ssize_t *neg_ct, float64_t *compensation) nogil: + """ add a value from the mean calc using Kahan summation """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - sum_x[0] = sum_x[0] + val + y = val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] + 1 cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct) nogil: - """ remove a value from the mean calc """ + Py_ssize_t *neg_ct, float64_t *compensation) nogil: + """ remove a value from the mean calc using Kahan summation """ + cdef: + float64_t y, t if notnan(val): nobs[0] = nobs[0] - 1 - sum_x[0] = sum_x[0] - val + y = - val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] - 1 -def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win): +def roll_mean(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev_x, sum_x = 0 - Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) - ndarray[float64_t] output - - output = np.empty(N, dtype=float) - - with nogil: - for i in range(minp - 1): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) - output[i] = NaN - - for i in range(minp - 1, N): - val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) - - if i > win - 1: - prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct) - - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) - - return output - - -def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): - cdef: - float64_t val, sum_x = 0 + float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0 int64_t s, e Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -342,31 +260,32 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0 or 
not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: # setup for j in range(s, e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) else: # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) - if not is_monotonic_bounds: - for j in range(s, e): - val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + if not is_monotonic_increasing_bounds: + nobs = 0 + neg_ct = 0 + sum_x = 0.0 + compensation_remove = 0.0 return output # ---------------------------------------------------------------------- @@ -386,7 +305,9 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, result = 0 else: result = ssqdm_x / (nobs - ddof) - if result < 0: + # Fix for numerical imprecision. + # Can be result < 0 once Kahan Summation is implemented + if result < 1e-14: result = 0 else: result = NaN @@ -395,112 +316,69 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs, cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x, - float64_t *ssqdm_x) nogil: + float64_t *ssqdm_x, float64_t *compensation) nogil: """ add a value from the var calc """ cdef: - float64_t delta + float64_t delta, prev_mean, y, t # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug if isnan(val): return nobs[0] = nobs[0] + 1 - # a part of Welford's method for the online variance-calculation + # Welford's method for the online variance-calculation + # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] + prev_mean = mean_x[0] - compensation[0] + y = val - compensation[0] + t = y - mean_x[0] + compensation[0] = t + mean_x[0] - y + delta = t mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0]) cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x, - float64_t *ssqdm_x) nogil: + float64_t *ssqdm_x, float64_t *compensation) nogil: """ remove a value from the var calc """ cdef: - float64_t delta - + float64_t delta, prev_mean, y, t if notnan(val): nobs[0] = nobs[0] - 1 if nobs[0]: - # a part of Welford's method for the online variance-calculation + # Welford's method for the online variance-calculation + # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] + prev_mean = mean_x[0] - compensation[0] + y = val - compensation[0] + t = y - mean_x[0] + compensation[0] = t + mean_x[0] - y + delta = t mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] + ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0]) else: mean_x[0] = 0 ssqdm_x[0] = 0 -def roll_var_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win, int ddof=1): +def roll_var(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, int ddof=1): """ Numerically stable implementation using Welford's method. 
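The rewritten `add_var`/`remove_var` above combine Welford's online update with a Kahan-compensated mean, accumulating `ssqdm_x` as `(val - prev_mean) * (val - mean)` rather than the earlier `delta ** 2` form. The core add/remove bookkeeping in plain Python (compensation omitted for brevity; this sketch shows only the Welford updates):

```python
def add_var(val, state):
    """Welford add: state is [nobs, mean, ssqdm]."""
    state[0] += 1
    prev_mean = state[1]
    state[1] += (val - prev_mean) / state[0]
    state[2] += (val - prev_mean) * (val - state[1])

def remove_var(val, state):
    """Welford remove: inverse update when a value leaves the window."""
    state[0] -= 1
    if state[0]:
        prev_mean = state[1]
        state[1] -= (val - prev_mean) / state[0]
        state[2] -= (val - prev_mean) * (val - state[1])
    else:
        state[1] = state[2] = 0.0

state = [0, 0.0, 0.0]
for v in (1.0, 2.0, 4.0):
    add_var(v, state)
remove_var(1.0, state)                 # window slides past the first value
nobs, _, ssqdm = state
assert abs(ssqdm / (nobs - 1) - 2.0) < 1e-12   # var([2, 4], ddof=1) == 2
```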
""" cdef: - float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, + float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, compensation_add = 0, + float64_t compensation_remove = 0, float64_t val, prev, delta, mean_x_old int64_t s, e Py_ssize_t i, j, N = len(values) ndarray[float64_t] output + bint is_monotonic_increasing_bounds - output = np.empty(N, dtype=float) - - # Check for windows larger than array, addresses #7297 - win = min(win, N) - - with nogil: - - # Over the first window, observations can only be added, never - # removed - for i in range(win): - add_var(values[i], &nobs, &mean_x, &ssqdm_x) - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - - # After the first window, observations can both be added and - # removed - for i in range(win, N): - val = values[i] - prev = values[i - win] - - if notnan(val): - if prev == prev: - - # Adding one observation and removing another one - delta = val - prev - mean_x_old = mean_x - - mean_x += delta / nobs - ssqdm_x += ((nobs - 1) * val - + (nobs + 1) * prev - - 2 * nobs * mean_x_old) * delta / nobs - - else: - add_var(val, &nobs, &mean_x, &ssqdm_x) - elif prev == prev: - remove_var(prev, &nobs, &mean_x, &ssqdm_x) - - output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - - return output - - -def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int ddof=1): - """ - Numerically stable implementation using Welford's method. - """ - cdef: - float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, - float64_t val, prev, delta, mean_x_old - int64_t s, e - Py_ssize_t i, j, N = len(values) - ndarray[float64_t] output - bint is_monotonic_bounds - - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + minp = max(minp, 1) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) with nogil: @@ -512,29 +390,32 @@ def roll_var_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) else: # After the first window, observations can both be added # and removed - # calculate adds - for j in range(end[i - 1], e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x) - # calculate deletes for j in range(start[i - 1], s): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) + remove_var(values[j], &nobs, &mean_x, &ssqdm_x, + &compensation_remove) + + # calculate adds + for j in range(end[i - 1], e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) - if not is_monotonic_bounds: - for j in range(s, e): - remove_var(values[j], &nobs, &mean_x, &ssqdm_x) + if not is_monotonic_increasing_bounds: + nobs = 0.0 + mean_x = 0.0 + ssqdm_x = 0.0 + compensation_remove = 0.0 return output @@ -578,78 +459,92 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, cdef inline void add_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx) nogil: + float64_t *xxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx) nogil: """ add a value from the skew calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): 
nobs[0] = nobs[0] + 1 - # seriously don't ask me why this is faster - x[0] = x[0] + val - xx[0] = xx[0] + val * val - xxx[0] = xxx[0] + val * val * val + y = val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t cdef inline void remove_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx) nogil: + float64_t *xxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx) nogil: """ remove a value from the skew calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - # seriously don't ask me why this is faster - x[0] = x[0] - val - xx[0] = xx[0] - val * val - xxx[0] = xxx[0] - val * val * val - - -def roll_skew_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win): + y = - val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = - val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = - val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + + +def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev + float64_t val, prev, min_val, mean_val, sum_val = 0 + float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0 + float64_t compensation_xx_add = 0, compensation_xx_remove = 0 + float64_t compensation_x_add = 0, compensation_x_remove = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values) + int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e - ndarray[float64_t] output + ndarray[float64_t] output, mean_array + bint is_monotonic_increasing_bounds + minp = max(minp, 3) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) + min_val = np.nanmin(values) with nogil: - for i in range(minp - 1): - val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) - output[i] = NaN - - for i in range(minp - 1, N): + for i in range(0, N): val = values[i] - add_skew(val, &nobs, &x, &xx, &xxx) - - if i > win - 1: - prev = values[i - win] - remove_skew(prev, &nobs, &x, &xx, &xxx) - - output[i] = calc_skew(minp, nobs, x, xx, xxx) - - return output - - -def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): - cdef: - float64_t val, prev - float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values) - int64_t s, e - ndarray[float64_t] output - bint is_monotonic_bounds - - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) - output = np.empty(N, dtype=float) - - with nogil: + if notnan(val): + nobs_mean += 1 + sum_val += val + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e5: + mean_val = round(mean_val) + for i in range(0, N): + values[i] = values[i] - mean_val for i in range(0, N): @@ -658,33 +553,36 @@ def roll_skew_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): val 
= values[j] - add_skew(val, &nobs, &x, &xx, &xxx) + add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, + &compensation_xx_add, &compensation_xxx_add) else: # After the first window, observations can both be added # and removed + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, + &compensation_xx_remove, &compensation_xxx_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, + &compensation_xx_add, &compensation_xxx_add) output[i] = calc_skew(minp, nobs, x, xx, xxx) - if not is_monotonic_bounds: - for j in range(s, e): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + if not is_monotonic_increasing_bounds: + nobs = 0 + x = 0.0 + xx = 0.0 + xxx = 0.0 return output @@ -731,78 +629,102 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, cdef inline void add_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx, float64_t *xxxx) nogil: + float64_t *xxx, float64_t *xxxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx, + float64_t *compensation_xxxx) nogil: """ add a value from the kurtosis calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - # seriously don't ask me why this is faster - x[0] = x[0] + val - xx[0] = xx[0] + val * val - xxx[0] = xxx[0] + val * val * val - xxxx[0] = xxxx[0] + val * val * val * val + y = val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + y = val * val * val * val - compensation_xxxx[0] + t = xxxx[0] + y + compensation_xxxx[0] = t - xxxx[0] - y + xxxx[0] = t cdef inline void remove_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx, float64_t *xxxx) nogil: + float64_t *xxx, float64_t *xxxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx, + float64_t *compensation_xxxx) nogil: """ remove a value from the kurtosis calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - # seriously don't ask me why this is faster - x[0] = x[0] - val - xx[0] = xx[0] - val * val - xxx[0] = xxx[0] - val * val * val - xxxx[0] = xxxx[0] - val * val * val * val - - -def roll_kurt_fixed(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win): - cdef: - float64_t val, prev - float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, N = len(values) - int64_t s, e - ndarray[float64_t] output - - output = np.empty(N, dtype=float) - - with nogil: - - for i in range(minp - 1): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - output[i] = NaN - - for i in range(minp - 1, N): - add_kurt(values[i], &nobs, &x, &xx, &xxx, &xxxx) - - if i > win - 1: - prev = values[i - win] - remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) - - output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) - - return output - - -def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): + y = - val - compensation_x[0] + t = x[0] + y +
compensation_x[0] = t - x[0] - y + x[0] = t + y = - val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = - val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + y = - val * val * val * val - compensation_xxxx[0] + t = xxxx[0] + y + compensation_xxxx[0] = t - xxxx[0] - y + xxxx[0] = t + + +def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev + float64_t val, prev, mean_val, min_val, sum_val = 0 + float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0 + float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0 + float64_t compensation_xx_remove = 0, compensation_xx_add = 0 + float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output - bint is_monotonic_bounds + bint is_monotonic_increasing_bounds - is_monotonic_bounds = is_monotonic_start_end_bounds(start, end) + minp = max(minp, 4) + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) output = np.empty(N, dtype=float) + min_val = np.nanmin(values) with nogil: + for i in range(0, N): + val = values[i] + if notnan(val): + nobs_mean += 1 + sum_val += val + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e4: + mean_val = round(mean_val) + for i in range(0, N): + values[i] = values[i] - mean_val for i in range(0, N): @@ -811,29 +733,37 @@ def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_bounds: + if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_add, &compensation_xx_add, + &compensation_xxx_add, &compensation_xxxx_add) else: # After the first window, observations can both be added # and removed + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_remove, &compensation_xx_remove, + &compensation_xxx_remove, &compensation_xxxx_remove) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) - - # calculate deletes - for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_add, &compensation_xx_add, + &compensation_xxx_add, &compensation_xxxx_add) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) - if not is_monotonic_bounds: - for j in range(s, e): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + if not is_monotonic_increasing_bounds: + nobs = 0 + x = 0.0 + xx = 0.0 + xxx = 0.0 + xxxx = 0.0 return output @@ -842,8 +772,8 @@ def roll_kurt_variable(ndarray[float64_t] values, ndarray[int64_t] start, # Rolling median, min, max -def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win=0): +def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): # GH 32865. 
win argument kept for compatibility cdef: float64_t val, res, prev @@ -851,9 +781,14 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, int ret = 0 skiplist_t *sl Py_ssize_t i, j - int64_t nobs = 0, N = len(values), s, e + int64_t nobs = 0, N = len(values), s, e, win int midpoint ndarray[float64_t] output + bint is_monotonic_increasing_bounds + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs @@ -873,7 +808,7 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0: + if i == 0 or not is_monotonic_increasing_bounds: # setup for j in range(s, e): @@ -901,7 +836,6 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, if notnan(val): skiplist_remove(sl, val) nobs -= 1 - if nobs >= minp: midpoint = (nobs / 2) if nobs % 2: @@ -909,11 +843,17 @@ def roll_median_c(ndarray[float64_t] values, ndarray[int64_t] start, else: res = (skiplist_get(sl, midpoint, &ret) + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + if ret == 0: + res = NaN else: res = NaN output[i] = res + if not is_monotonic_increasing_bounds: + nobs = 0 + sl = skiplist_init(win) + skiplist_destroy(sl) if err: raise MemoryError("skiplist_insert failed") @@ -971,28 +911,8 @@ cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, return result -def roll_max_fixed(float64_t[:] values, int64_t[:] start, - int64_t[:] end, int64_t minp, int64_t win): - """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. - - Parameters - ---------- - values : np.ndarray[np.float64] - window : int, size of rolling window - minp : if number of observations in window - is below this, output a NaN - index : ndarray, optional - index for window computation - closed : 'right', 'left', 'both', 'neither' - make the interval closed on the right, left, - both or neither endpoints - """ - return _roll_min_max_fixed(values, minp, win, is_max=1) - - -def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): +def roll_max(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): """ Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. @@ -1008,11 +928,11 @@ def roll_max_variable(ndarray[float64_t] values, ndarray[int64_t] start, make the interval closed on the right, left, both or neither endpoints """ - return _roll_min_max_variable(values, start, end, minp, is_max=1) + return _roll_min_max(values, start, end, minp, is_max=1) -def roll_min_fixed(float64_t[:] values, int64_t[:] start, - int64_t[:] end, int64_t minp, int64_t win): +def roll_min(ndarray[float64_t] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp): """ Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. @@ -1025,31 +945,14 @@ def roll_min_fixed(float64_t[:] values, int64_t[:] start, index : ndarray, optional index for window computation """ - return _roll_min_max_fixed(values, minp, win, is_max=0) + return _roll_min_max(values, start, end, minp, is_max=0) -def roll_min_variable(ndarray[float64_t] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp): - """ - Moving min of 1d array of any numeric type along axis=0 ignoring NaNs. 
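The add_skew/remove_skew and add_kurt/remove_kurt rewrites above replace plain accumulation of the power sums with Kahan compensated summation (each of x, xx, xxx and xxxx carries its own compensation term), and roll_skew/roll_kurt additionally demean the input, rounding the mean when it is large relative to the smallest value, so the accumulated powers stay small. A minimal pure-Python sketch of the same update step; kahan_add and the sample values are illustrative, not part of the pandas API:

```python
def kahan_add(total, comp, val):
    """One compensated-summation step, mirroring the y/t updates above."""
    y = val - comp            # correct the incoming value by the carried error
    t = total + y             # the naive add, which may round
    comp = (t - total) - y    # recover what rounding lost; carry it forward
    return t, comp

# Running sum of squares with adds and a remove: a remove is simply a
# compensated add of the negated term, as in remove_skew/remove_kurt above.
xx = comp = 0.0
for v in (1e8, 1.0, 2.0):
    xx, comp = kahan_add(xx, comp, v * v)
xx, comp = kahan_add(xx, comp, -(1e8 * 1e8))  # the large value leaves the window
```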
- - Parameters - ---------- - values : np.ndarray[np.float64] - window : int, size of rolling window - minp : if number of observations in window - is below this, output a NaN - index : ndarray, optional - index for window computation - """ - return _roll_min_max_variable(values, start, end, minp, is_max=0) - - -cdef _roll_min_max_variable(ndarray[numeric] values, - ndarray[int64_t] starti, - ndarray[int64_t] endi, - int64_t minp, - bint is_max): +cdef _roll_min_max(ndarray[numeric] values, + ndarray[int64_t] starti, + ndarray[int64_t] endi, + int64_t minp, + bint is_max): cdef: numeric ai int64_t i, k, curr_win_size, start @@ -1112,93 +1015,6 @@ cdef _roll_min_max_variable(ndarray[numeric] values, return output -cdef _roll_min_max_fixed(numeric[:] values, - int64_t minp, - int64_t win, - bint is_max): - cdef: - numeric ai - bint should_replace - int64_t i, removed, window_i, - Py_ssize_t nobs = 0, N = len(values) - int64_t* death - numeric* ring - numeric* minvalue - numeric* end - numeric* last - ndarray[float64_t, ndim=1] output - - output = np.empty(N, dtype=float) - # setup the rings of death! - ring = malloc(win * sizeof(numeric)) - death = malloc(win * sizeof(int64_t)) - - end = ring + win - last = ring - minvalue = ring - ai = values[0] - minvalue[0] = init_mm(values[0], &nobs, is_max) - death[0] = win - nobs = 0 - - with nogil: - - for i in range(N): - ai = init_mm(values[i], &nobs, is_max) - - if i >= win: - remove_mm(values[i - win], &nobs) - - if death[minvalue - ring] == i: - minvalue = minvalue + 1 - if minvalue >= end: - minvalue = ring - - if is_max: - should_replace = ai >= minvalue[0] - else: - should_replace = ai <= minvalue[0] - if should_replace: - - minvalue[0] = ai - death[minvalue - ring] = i + win - last = minvalue - - else: - - if is_max: - should_replace = last[0] <= ai - else: - should_replace = last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - if is_max: - should_replace = last[0] <= ai - else: - should_replace = last[0] >= ai - - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i + win - - output[i] = calc_mm(minp, nobs, minvalue[0]) - - for i in range(minp - 1): - if numeric in cython.floating: - output[i] = NaN - else: - output[i] = 0 - - free(ring) - free(death) - - return output - - cdef enum InterpolationType: LINEAR, LOWER, @@ -1216,8 +1032,8 @@ interpolation_types = { } -def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, int64_t win, +def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, float64_t quantile, str interpolation): """ O(N log(window)) implementation using skip list @@ -1225,7 +1041,7 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, cdef: float64_t val, prev, midpoint, idx_with_fraction skiplist_t *skiplist - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), win Py_ssize_t idx ndarray[float64_t] output float64_t vlow, vhigh @@ -1240,11 +1056,14 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, except KeyError: raise ValueError(f"Interpolation '{interpolation}' is not supported") + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) # we use the Fixed/Variable Indexer here as the # actual skiplist ops outweigh any window computation costs output = np.empty(N, dtype=float) - if win == 0 or (end - start).max() == 
0: + if (end - start).max() == 0: output[:] = NaN return output win = (end - start).max() @@ -1257,7 +1076,10 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, s = start[i] e = end[i] - if i == 0: + if i == 0 or not is_monotonic_increasing_bounds: + if not is_monotonic_increasing_bounds: + nobs = 0 + skiplist = skiplist_init(win) # setup for j in range(s, e): @@ -1267,7 +1089,6 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, skiplist_insert(skiplist, val) else: - # calculate adds for j in range(end[i - 1], e): val = values[j] @@ -1281,7 +1102,6 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, if notnan(val): skiplist_remove(skiplist, val) nobs -= 1 - if nobs >= minp: if nobs == 1: # Single value in skip list @@ -1320,94 +1140,27 @@ def roll_quantile(ndarray[float64_t, cast=True] values, ndarray[int64_t] start, vlow = skiplist_get(skiplist, idx, &ret) vhigh = skiplist_get(skiplist, idx + 1, &ret) output[i] = (vlow + vhigh) / 2 - else: - output[i] = NaN - - skiplist_destroy(skiplist) - - return output - -def roll_generic_fixed(object obj, - ndarray[int64_t] start, ndarray[int64_t] end, - int64_t minp, int64_t win, - int offset, object func, bint raw, - object args, object kwargs): - cdef: - ndarray[float64_t] output, counts, bufarr - ndarray[float64_t, cast=True] arr - float64_t *buf - float64_t *oldbuf - int64_t nobs = 0, i, j, s, e, N = len(start) - - n = len(obj) - if n == 0: - return obj - - arr = np.asarray(obj) - - # ndarray input - if raw: - if not arr.flags.c_contiguous: - arr = arr.copy('C') - - counts = roll_sum_fixed(np.concatenate([np.isfinite(arr).astype(float), - np.array([0.] * offset)]), - start, end, minp, win)[offset:] - - output = np.empty(N, dtype=float) - - if not raw: - # series - for i in range(N): - if counts[i] >= minp: - sl = slice(int_max(i + offset - win + 1, 0), - int_min(i + offset + 1, N)) - output[i] = func(obj.iloc[sl], *args, **kwargs) - else: - output[i] = NaN - - else: - - # truncated windows at the beginning, through first full-length window - for i in range((int_min(win, N) - offset)): - if counts[i] >= minp: - output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs) - else: - output[i] = NaN - - # remaining full-length windows - for j, i in enumerate(range((win - offset), (N - offset)), 1): - if counts[i] >= minp: - output[i] = func(arr[j:j + win], *args, **kwargs) + if ret == 0: + output[i] = NaN else: output[i] = NaN - # truncated windows at the end - for i in range(int_max(N - offset, 0), N): - if counts[i] >= minp: - output[i] = func(arr[int_max(i + offset - win + 1, 0): N], - *args, - **kwargs) - else: - output[i] = NaN + skiplist_destroy(skiplist) return output -def roll_generic_variable(object obj, - ndarray[int64_t] start, ndarray[int64_t] end, - int64_t minp, - int offset, object func, bint raw, - object args, object kwargs): +def roll_apply(object obj, + ndarray[int64_t] start, ndarray[int64_t] end, + int64_t minp, + object function, bint raw, + tuple args, dict kwargs): cdef: - ndarray[float64_t] output, counts, bufarr + ndarray[float64_t] output, counts ndarray[float64_t, cast=True] arr - float64_t *buf - float64_t *oldbuf - int64_t nobs = 0, i, j, s, e, N = len(start) + Py_ssize_t i, s, e, N = len(start), n = len(obj) - n = len(obj) if n == 0: return obj @@ -1418,24 +1171,20 @@ def roll_generic_variable(object obj, if not arr.flags.c_contiguous: arr = arr.copy('C') - counts = 
roll_sum_variable(np.concatenate([np.isfinite(arr).astype(float), - np.array([0.] * offset)]), - start, end, minp)[offset:] + counts = roll_sum(np.isfinite(arr).astype(float), start, end, minp) output = np.empty(N, dtype=float) - if offset != 0: - raise ValueError("unable to roll_generic with a non-zero offset") + for i in range(N): - for i in range(0, N): s = start[i] e = end[i] if counts[i] >= minp: if raw: - output[i] = func(arr[s:e], *args, **kwargs) + output[i] = function(arr[s:e], *args, **kwargs) else: - output[i] = func(obj.iloc[s:e], *args, **kwargs) + output[i] = function(obj.iloc[s:e], *args, **kwargs) else: output[i] = NaN @@ -1473,13 +1222,8 @@ cdef ndarray[float64_t] _roll_weighted_sum_mean(float64_t[:] values, if avg: tot_wgt = np.zeros(in_n, dtype=np.float64) - if minp > win_n: - raise ValueError(f"min_periods (minp) must be <= " - f"window (win)") elif minp > in_n: minp = in_n + 1 - elif minp < 0: - raise ValueError('min_periods must be >= 0') minp = max(minp, 1) @@ -1752,8 +1496,8 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, # ---------------------------------------------------------------------- # Exponentially weighted moving average -def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times, - int64_t halflife): +def ewma_time(const float64_t[:] vals, int64_t[:] start, int64_t[:] end, + int minp, ndarray[int64_t] times, int64_t halflife): """ Compute exponentially-weighted moving average using halflife and time distances. @@ -1761,6 +1505,8 @@ def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times, Parameters ---------- vals : ndarray[float_64] + start: ndarray[int_64] + end: ndarray[int_64] minp : int times : ndarray[int64] halflife : int64 @@ -1770,45 +1516,58 @@ def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times, ndarray """ cdef: - Py_ssize_t i, num_not_nan = 0, N = len(vals) + Py_ssize_t i, j, num_not_nan = 0, N = len(vals) bint is_not_nan - float64_t last_result - ndarray[uint8_t] mask = np.zeros(N, dtype=np.uint8) - ndarray[float64_t] weights, observations, output = np.empty(N, dtype=np.float64) + float64_t last_result, weights_dot, weights_sum, weight, halflife_float + float64_t[:] times_float + float64_t[:] observations = np.zeros(N, dtype=float) + float64_t[:] times_masked = np.zeros(N, dtype=float) + ndarray[float64_t] output = np.empty(N, dtype=float) if N == 0: return output + halflife_float = halflife + times_float = times.astype(float) last_result = vals[0] - for i in range(N): - is_not_nan = vals[i] == vals[i] - num_not_nan += is_not_nan - if is_not_nan: - mask[i] = 1 - weights = 0.5 ** ((times[i] - times[mask.view(np.bool_)]) / halflife) - observations = vals[mask.view(np.bool_)] - last_result = np.sum(weights * observations) / np.sum(weights) - - if num_not_nan >= minp: - output[i] = last_result - else: - output[i] = NaN + with nogil: + for i in range(N): + is_not_nan = vals[i] == vals[i] + num_not_nan += is_not_nan + if is_not_nan: + times_masked[num_not_nan-1] = times_float[i] + observations[num_not_nan-1] = vals[i] + + weights_sum = 0 + weights_dot = 0 + for j in range(num_not_nan): + weight = 0.5 ** ( + (times_float[i] - times_masked[j]) / halflife_float) + weights_sum += weight + weights_dot += weight * observations[j] + + last_result = weights_dot / weights_sum + + output[i] = last_result if num_not_nan >= minp else NaN return output -def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp): +def ewma(float64_t[:] vals, int64_t[:] start, 
int64_t[:] end, int minp, + float64_t com, bint adjust, bint ignore_na): """ Compute exponentially-weighted moving average using center-of-mass. Parameters ---------- vals : ndarray (float64 type) + start: ndarray (int64 type) + end: ndarray (int64 type) + minp : int com : float64 adjust : int ignore_na : bool - minp : int Returns ------- @@ -1866,19 +1625,21 @@ def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp # Exponentially weighted moving covariance -def ewmcov(float64_t[:] input_x, float64_t[:] input_y, - float64_t com, bint adjust, bint ignore_na, int minp, bint bias): +def ewmcov(float64_t[:] input_x, int64_t[:] start, int64_t[:] end, int minp, + float64_t[:] input_y, float64_t com, bint adjust, bint ignore_na, bint bias): """ Compute exponentially-weighted moving covariance using center-of-mass. Parameters ---------- input_x : ndarray (float64 type) + start: ndarray (int64 type) + end: ndarray (int64 type) + minp : int input_y : ndarray (float64 type) com : float64 adjust : int ignore_na : bool - minp : int bias : int Returns diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 8a1e7feb57ace..6a49a5bb34855 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,7 +1,8 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import numpy as np -from numpy cimport ndarray, int64_t + +from numpy cimport int64_t, ndarray # Cython routines for window indexers @@ -42,16 +43,14 @@ def calculate_variable_window_bounds( (ndarray[int64], ndarray[int64]) """ cdef: - bint left_closed = False - bint right_closed = False - int index_growth_sign = 1 + bint left_closed = False, right_closed = False ndarray[int64_t, ndim=1] start, end - int64_t start_bound, end_bound + int64_t start_bound, end_bound, index_growth_sign = 1 Py_ssize_t i, j - # if windows is variable, default is 'right', otherwise default is 'both' + # default is 'right' if closed is None: - closed = 'right' if index is not None else 'both' + closed = 'right' if closed in ['right', 'both']: right_closed = True diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 2d5b31d7ccbcf..06f180eef0c65 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -1,12 +1,8 @@ import cython -from cython import Py_ssize_t - -from cpython.bytes cimport PyBytes_GET_SIZE -from cpython.unicode cimport PyUnicode_GET_SIZE - import numpy as np -from numpy cimport ndarray, uint8_t +from cpython cimport PyBytes_GET_SIZE, PyUnicode_GET_LENGTH +from numpy cimport ndarray, uint8_t ctypedef fused pandas_string: str @@ -112,7 +108,7 @@ def convert_json_to_lines(arr: object) -> str: if not in_quotes: num_open_brackets_seen -= 1 - return narr.tobytes().decode('utf-8') + return narr.tobytes().decode('utf-8') + '\n' # GH:36888 # stata, pytables @@ -144,7 +140,7 @@ cpdef inline Py_ssize_t word_len(object val): Py_ssize_t l = 0 if isinstance(val, str): - l = PyUnicode_GET_SIZE(val) + l = PyUnicode_GET_LENGTH(val) elif isinstance(val, bytes): l = PyBytes_GET_SIZE(val) diff --git a/pandas/_testing.py b/pandas/_testing.py index fc6df7a95e348..469f5e1bed6ba 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -6,10 +6,11 @@ import gzip import operator import os +import re from shutil import rmtree import string import tempfile -from typing import Any, Callable, List, Optional, Type, Union, cast +from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast import warnings import zipfile @@ -25,7 +26,7 @@ from
pandas._libs.lib import no_default import pandas._libs.testing as _testing from pandas._typing import Dtype, FilePathOrBuffer, FrameOrSeries -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.core.dtypes.common import ( is_bool, @@ -70,7 +71,7 @@ from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing -lzma = _import_lzma() +lzma = import_lzma() _N = 30 _K = 4 @@ -84,6 +85,7 @@ ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES FLOAT_DTYPES: List[Dtype] = [float, "float32", "float64"] +FLOAT_EA_DTYPES: List[Dtype] = ["Float32", "Float64"] COMPLEX_DTYPES: List[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: List[Dtype] = [str, "str", "U"] @@ -115,14 +117,24 @@ def set_testing_mode(): # set the testing mode filters testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") if "deprecate" in testing_mode: - warnings.simplefilter("always", _testing_mode_warnings) + # pandas\_testing.py:119: error: Argument 2 to "simplefilter" has + # incompatible type "Tuple[Type[DeprecationWarning], + # Type[ResourceWarning]]"; expected "Type[Warning]" + warnings.simplefilter( + "always", _testing_mode_warnings # type: ignore[arg-type] + ) def reset_testing_mode(): # reset the testing mode filters testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") if "deprecate" in testing_mode: - warnings.simplefilter("ignore", _testing_mode_warnings) + # pandas\_testing.py:126: error: Argument 2 to "simplefilter" has + # incompatible type "Tuple[Type[DeprecationWarning], + # Type[ResourceWarning]]"; expected "Type[Warning]" + warnings.simplefilter( + "ignore", _testing_mode_warnings # type: ignore[arg-type] + ) set_testing_mode() @@ -239,16 +251,22 @@ def decompress_file(path, compression): if compression is None: f = open(path, "rb") elif compression == "gzip": - f = gzip.open(path, "rb") + # pandas\_testing.py:243: error: Incompatible types in assignment + # (expression has type "IO[Any]", variable has type "BinaryIO") + f = gzip.open(path, "rb") # type: ignore[assignment] elif compression == "bz2": - f = bz2.BZ2File(path, "rb") + # pandas\_testing.py:245: error: Incompatible types in assignment + # (expression has type "BZ2File", variable has type "BinaryIO") + f = bz2.BZ2File(path, "rb") # type: ignore[assignment] elif compression == "xz": - f = _get_lzma_file(lzma)(path, "rb") + f = get_lzma_file(lzma)(path, "rb") elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) + # pandas\_testing.py:252: error: Incompatible types in assignment + # (expression has type "IO[bytes]", variable has type "BinaryIO") + f = zip_file.open(zip_names.pop()) # type: ignore[assignment] else: raise ValueError(f"ZIP file {path} error. 
Only one file per ZIP.") else: @@ -284,11 +302,17 @@ def write_to_compressed(compression, path, data, dest="test"): if compression == "zip": compress_method = zipfile.ZipFile elif compression == "gzip": - compress_method = gzip.GzipFile + # pandas\_testing.py:288: error: Incompatible types in assignment + # (expression has type "Type[GzipFile]", variable has type + # "Type[ZipFile]") + compress_method = gzip.GzipFile # type: ignore[assignment] elif compression == "bz2": - compress_method = bz2.BZ2File + # pandas\_testing.py:290: error: Incompatible types in assignment + # (expression has type "Type[BZ2File]", variable has type + # "Type[ZipFile]") + compress_method = bz2.BZ2File # type: ignore[assignment] elif compression == "xz": - compress_method = _get_lzma_file(lzma) + compress_method = get_lzma_file(lzma) else: raise ValueError(f"Unrecognized compression type: {compression}") @@ -298,7 +322,10 @@ def write_to_compressed(compression, path, data, dest="test"): method = "writestr" else: mode = "wb" - args = (data,) + # pandas\_testing.py:302: error: Incompatible types in assignment + # (expression has type "Tuple[Any]", variable has type "Tuple[Any, + # Any]") + args = (data,) # type: ignore[assignment] method = "write" with compress_method(path, mode=mode) as f: @@ -535,7 +562,7 @@ def rands(nchars): def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close + from matplotlib.pyplot import close as _close, get_fignums if fignum is None: for fignum in get_fignums(): @@ -665,6 +692,7 @@ def assert_index_equal( check_less_precise: Union[bool, int] = no_default, check_exact: bool = True, check_categorical: bool = True, + check_order: bool = True, rtol: float = 1.0e-5, atol: float = 1.0e-8, obj: str = "Index", @@ -694,6 +722,12 @@ def assert_index_equal( Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. + check_order : bool, default True + Whether to compare the order of index entries as well as their values. + If True, both indexes must contain the same elements, in the same order. + If False, both indexes must contain the same elements, but in any order. + + .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -705,30 +739,36 @@ def assert_index_equal( obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. 
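The new check_order flag, which assert_frame_equal below wires to check_like, makes order-insensitive index comparison explicit. A usage sketch (the index values are illustrative):

```python
import pandas as pd
from pandas.testing import assert_index_equal

left = pd.Index([1, 2, 3])
right = pd.Index([3, 1, 2])

assert_index_equal(left, right, check_order=False)  # passes: same elements
# assert_index_equal(left, right)  # raises: same elements, different order
```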
+ + Examples + -------- + >>> from pandas.testing import assert_index_equal + >>> a = pd.Index([1, 2, 3]) + >>> b = pd.Index([1, 2, 3]) + >>> assert_index_equal(a, b) """ __tracebackhide__ = True - def _check_types(l, r, obj="Index"): + def _check_types(left, right, obj="Index"): if exact: - assert_class_equal(l, r, exact=exact, obj=obj) + assert_class_equal(left, right, exact=exact, obj=obj) # Skip exact dtype checking when `check_categorical` is False if check_categorical: - assert_attr_equal("dtype", l, r, obj=obj) + assert_attr_equal("dtype", left, right, obj=obj) # allow string-like to have different inferred_types - if l.inferred_type in ("string"): - assert r.inferred_type in ("string") + if left.inferred_type in ("string"): + assert right.inferred_type in ("string") else: - assert_attr_equal("inferred_type", l, r, obj=obj) + assert_attr_equal("inferred_type", left, right, obj=obj) def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] level_codes = index.codes[level] filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) - values = unique._shallow_copy(filled, name=index.names[level]) - return values + return unique._shallow_copy(filled, name=index.names[level]) if check_less_precise is not no_default: warnings.warn( @@ -760,6 +800,11 @@ def _get_ilevel_values(index, level): msg3 = f"{len(right)}, {right}" raise_assert_detail(obj, msg1, msg2, msg3) + # If order doesn't matter then sort the index entries + if not check_order: + left = left.sort_values() + right = right.sort_values() + # MultiIndex special comparison for little-friendly error messages if left.nlevels > 1: left = cast(MultiIndex, left) @@ -939,7 +984,7 @@ def assert_categorical_equal( if check_category_order: assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") assert_numpy_array_equal( - left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes" ) else: try: @@ -948,9 +993,7 @@ def assert_categorical_equal( except TypeError: # e.g. 
'<' not supported between instances of 'int' and 'str' lc, rc = left.categories, right.categories - assert_index_equal( - lc, rc, obj=f"{obj}.categories", - ) + assert_index_equal(lc, rc, obj=f"{obj}.categories") assert_index_equal( left.categories.take(left.codes), right.categories.take(right.codes), @@ -978,8 +1021,14 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") """ _check_isinstance(left, right, IntervalArray) - assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") - assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") + kwargs = {} + if left._left.dtype.kind in ["m", "M"]: + # We have a DatetimeArray or TimedeltaArray + kwargs["check_freq"] = False + + assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs) + assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs) + assert_attr_equal("closed", left, right, obj=obj) @@ -990,20 +1039,22 @@ def assert_period_array_equal(left, right, obj="PeriodArray"): assert_attr_equal("freq", left, right, obj=obj) -def assert_datetime_array_equal(left, right, obj="DatetimeArray"): +def assert_datetime_array_equal(left, right, obj="DatetimeArray", check_freq=True): __tracebackhide__ = True _check_isinstance(left, right, DatetimeArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) + if check_freq: + assert_attr_equal("freq", left, right, obj=obj) assert_attr_equal("tz", left, right, obj=obj) -def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray", check_freq=True): __tracebackhide__ = True _check_isinstance(left, right, TimedeltaArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) + if check_freq: + assert_attr_equal("freq", left, right, obj=obj) def raise_assert_detail(obj, message, left, right, diff=None, index_values=None): @@ -1092,13 +1143,13 @@ def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape, + obj, f"{obj} shapes are different", left.shape, right.shape ) diff = 0 - for l, r in zip(left, right): + for left_arr, right_arr in zip(left, right): # count up differences - if not array_equivalent(l, r, strict_nan=strict_nan): + if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan): diff += 1 diff = diff * 100.0 / left.size @@ -1161,6 +1212,13 @@ def assert_extension_array_equal( Missing values are checked separately from valid values. A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. + + Examples + -------- + >>> from pandas.testing import assert_extension_array_equal + >>> a = pd.Series([1, 2, 3, 4]) + >>> b, c = a.array, a.array + >>> assert_extension_array_equal(b, c) """ if check_less_precise is not no_default: warnings.warn( @@ -1227,6 +1285,7 @@ def assert_series_equal( check_categorical=True, check_category_order=True, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", @@ -1273,6 +1332,11 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. + + ..
versionadded:: 1.2.0 + rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -1284,6 +1348,13 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. + + Examples + -------- + >>> from pandas.testing import assert_series_equal + >>> a = pd.Series([1, 2, 3, 4]) + >>> b = pd.Series([1, 2, 3, 4]) + >>> assert_series_equal(a, b) """ __tracebackhide__ = True @@ -1309,6 +1380,9 @@ def assert_series_equal( msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + # index comparison assert_index_equal( left.index, @@ -1339,10 +1413,8 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact: - if not is_numeric_dtype(left.dtype): - raise AssertionError("check_exact may only be used with numeric Series") - + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, @@ -1379,12 +1451,27 @@ def assert_series_equal( ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) - elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): + elif is_extension_array_dtype_and_needs_i8_conversion( + left.dtype, right.dtype + ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype): + assert_extension_array_equal( + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), + ) + elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( @@ -1427,6 +1514,7 @@ def assert_frame_equal( check_categorical=True, check_like=False, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="DataFrame", @@ -1488,6 +1576,8 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
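Taken together with the hunk below, check_like=True now compares indexes and columns with check_order=False and only then reindexes, while the new check_flags guard catches diverging DataFrame.flags. A sketch of both behaviours (the frames are illustrative):

```python
import pandas as pd
from pandas.testing import assert_frame_equal

a = pd.DataFrame({"x": [1, 2]}, index=["r1", "r2"])
b = a.loc[["r2", "r1"]]                      # same content, rows reordered

assert_frame_equal(a, b, check_like=True)    # passes: row order ignored

b.flags.allows_duplicate_labels = False      # flags now differ
# assert_frame_equal(a, b, check_like=True)  # raises on the flags assertion
```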
@@ -1555,11 +1645,11 @@ def assert_frame_equal( # shape comparison if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}" ) - if check_like: - left, right = left.reindex_like(right), right + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" # index comparison assert_index_equal( @@ -1569,6 +1659,7 @@ check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, + check_order=not check_like, rtol=rtol, atol=atol, obj=f"{obj}.index", @@ -1582,11 +1673,15 @@ check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, + check_order=not check_like, rtol=rtol, atol=atol, obj=f"{obj}.columns", ) + if check_like: + left, right = left.reindex_like(right), right + # compare by blocks if by_blocks: rblocks = right._to_dict_of_blocks() @@ -1682,7 +1777,7 @@ def box_expected(expected, box_cls, transpose=True): elif box_cls is pd.DataFrame: expected = pd.Series(expected).to_frame() if transpose: - # for vector operations, we we need a DataFrame to be a single-row, + # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. expected = expected.T @@ -1780,6 +1875,20 @@ def assert_copy(iter1, iter2, **eql_kwargs): assert elem1 is not elem2, msg +def is_extension_array_dtype_and_needs_i8_conversion(left_dtype, right_dtype) -> bool: + """ + Checks that we have the combination of an ExtensionArray dtype and + a dtype that should be converted to int64 + + Returns + ------- + bool + + Related to issue #37609 + """ + return is_extension_array_dtype(left_dtype) and needs_i8_conversion(right_dtype) + + def getCols(k): return string.ascii_uppercase[:k] @@ -1844,8 +1953,7 @@ def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): def makePeriodIndex(k=10, name=None, **kwargs): dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) - return dr + return pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) def makeMultiIndex(k=10, names=None, **kwargs): @@ -1943,8 +2051,7 @@ def index_subclass_makers_generator(): makeCategoricalIndex, makeMultiIndex, ] - for make_index_func in make_index_funcs: - yield make_index_func + yield from make_index_funcs def all_timeseries_index_generator(k=10): @@ -1958,7 +2065,8 @@ def all_timeseries_index_generator(k=10): """ make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] for make_index_func in make_index_funcs: - yield make_index_func(k=k) + # pandas\_testing.py:1986: error: Cannot call function of unknown type + yield make_index_func(k=k) # type: ignore[operator] # make series @@ -2082,17 +2190,18 @@ def makeCustomIndex( names = [names] # specific 1D index type requested?
- idx_func = dict( - i=makeIntIndex, - f=makeFloatIndex, - s=makeStringIndex, - u=makeUnicodeIndex, - dt=makeDateIndex, - td=makeTimedeltaIndex, - p=makePeriodIndex, - ).get(idx_type) + idx_func = { + "i": makeIntIndex, + "f": makeFloatIndex, + "s": makeStringIndex, + "u": makeUnicodeIndex, + "dt": makeDateIndex, + "td": makeTimedeltaIndex, + "p": makePeriodIndex, + }.get(idx_type) if idx_func: - idx = idx_func(nentries) + # pandas\_testing.py:2120: error: Cannot call function of unknown type + idx = idx_func(nentries) # type: ignore[operator] # but we need to fill in the name if names: idx.name = names[0] @@ -2120,7 +2229,8 @@ def keyfunc(x): # build a list of lists to create the index from div_factor = nentries // ndupe_l[i] + 1 - cnt = Counter() + # pandas\_testing.py:2148: error: Need type annotation for 'cnt' + cnt = Counter() # type: ignore[var-annotated] for j in range(div_factor): label = f"{prefix}_l{i}_g{j}" cnt[label] = ndupe_l[i] @@ -2278,7 +2388,14 @@ def _gen_unique_rand(rng, _extra_size): def makeMissingDataframe(density=0.9, random_state=None): df = makeDataFrame() - i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) + # pandas\_testing.py:2306: error: "_create_missing_idx" gets multiple + # values for keyword argument "density" [misc] + + # pandas\_testing.py:2306: error: "_create_missing_idx" gets multiple + # values for keyword argument "random_state" [misc] + i, j = _create_missing_idx( # type: ignore[misc] + *df.shape, density=density, random_state=random_state + ) df.values[i, j] = np.nan return df @@ -2303,7 +2420,10 @@ def dec(f): is_decorating = not kwargs and len(args) == 1 and callable(args[0]) if is_decorating: f = args[0] - args = [] + # pandas\_testing.py:2331: error: Incompatible types in assignment + # (expression has type "List[]", variable has type + # "Tuple[Any, ...]") + args = [] # type: ignore[assignment] return dec(f) else: return dec @@ -2387,7 +2507,7 @@ def can_connect(url, error_classes=None): @optional_args def network( t, - url="http://www.google.com", + url="https://www.google.com", raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, check_before_test=False, error_classes=None, @@ -2411,7 +2531,7 @@ def network( The test requiring network connectivity. url : path The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'http://www.google.com'. + for connectivity. Defaults to 'https://www.google.com'. raise_on_error : bool If True, never catches errors. check_before_test : bool @@ -2455,7 +2575,7 @@ def network( You can specify alternative URLs:: - >>> @network("http://www.yahoo.com") + >>> @network("https://www.yahoo.com") ... def test_something_with_yahoo(): ... 
raise IOError("Failure Message") >>> test_something_with_yahoo() @@ -2485,15 +2605,20 @@ def network( @wraps(t) def wrapper(*args, **kwargs): - if check_before_test and not raise_on_error: - if not can_connect(url, error_classes): - skip() + if ( + check_before_test + and not raise_on_error + and not can_connect(url, error_classes) + ): + skip() try: return t(*args, **kwargs) except Exception as err: errno = getattr(err, "errno", None) if not errno and hasattr(errno, "reason"): - errno = getattr(err.reason, "errno", None) + # pandas\_testing.py:2521: error: "Exception" has no attribute + # "reason" + errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined] if errno in skip_errnos: skip(f"Skipping test due to known errno and error {err}") @@ -2521,10 +2646,11 @@ def wrapper(*args, **kwargs): @contextmanager def assert_produces_warning( - expected_warning=Warning, + expected_warning: Optional[Union[Type[Warning], bool]] = Warning, filter_level="always", - check_stacklevel=True, - raise_on_extra_warnings=True, + check_stacklevel: bool = True, + raise_on_extra_warnings: bool = True, + match: Optional[str] = None, ): """ Context manager for running code expected to either raise a specific @@ -2559,6 +2685,8 @@ class for all warnings. To check that no warning is returned, raise_on_extra_warnings : bool, default True Whether extra warnings not of the type `expected_warning` should cause the test to fail. + match : str, optional + Match warning message. Examples -------- @@ -2585,28 +2713,28 @@ class for all warnings. To check that no warning is returned, with warnings.catch_warnings(record=True) as w: saw_warning = False + matched_message = False + warnings.simplefilter(filter_level) yield w extra_warnings = [] for actual_warning in w: - if expected_warning and issubclass( - actual_warning.category, expected_warning - ): + if not expected_warning: + continue + + expected_warning = cast(Type[Warning], expected_warning) + if issubclass(actual_warning.category, expected_warning): saw_warning = True if check_stacklevel and issubclass( actual_warning.category, (FutureWarning, DeprecationWarning) ): - from inspect import getframeinfo, stack + _assert_raised_with_correct_stacklevel(actual_warning) + + if match is not None and re.search(match, str(actual_warning.message)): + matched_message = True - caller = getframeinfo(stack()[2][0]) - msg = ( - "Warning not set with correct stacklevel. " - f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" - ) - assert actual_warning.filename == caller.filename, msg else: extra_warnings.append( ( @@ -2616,18 +2744,41 @@ class for all warnings. 
To check that no warning is returned, actual_warning.lineno, ) ) + if expected_warning: - msg = ( - f"Did not see expected warning of class " - f"{repr(expected_warning.__name__)}" - ) - assert saw_warning, msg + expected_warning = cast(Type[Warning], expected_warning) + if not saw_warning: + raise AssertionError( + f"Did not see expected warning of class " + f"{repr(expected_warning.__name__)}" + ) + + if match and not matched_message: + raise AssertionError( + f"Did not see warning {repr(expected_warning.__name__)} " + f"matching {match}" + ) + if raise_on_extra_warnings and extra_warnings: raise AssertionError( f"Caused unexpected warning(s): {repr(extra_warnings)}" ) +def _assert_raised_with_correct_stacklevel( + actual_warning: warnings.WarningMessage, +) -> None: + from inspect import getframeinfo, stack + + caller = getframeinfo(stack()[3][0]) + msg = ( + "Warning not set with correct stacklevel. " + f"File where warning is raised: {actual_warning.filename} != " + f"{caller.filename}. Warning message: {actual_warning.message}" + ) + assert actual_warning.filename == caller.filename, msg + + class RNGContext: """ Context manager to set the numpy random number generator seed. Returns @@ -2696,7 +2847,7 @@ def use_numexpr(use, min_elements=None): if min_elements is None: min_elements = expr._MIN_ELEMENTS - olduse = expr._USE_NUMEXPR + olduse = expr.USE_NUMEXPR oldmin = expr._MIN_ELEMENTS expr.set_use_numexpr(use) expr._MIN_ELEMENTS = min_elements @@ -2876,13 +3027,10 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): Expected output of to_csv() in current OS. """ sep = os.linesep - expected = sep.join(rows_list) + sep - return expected + return sep.join(rows_list) + sep -def external_error_raised( - expected_exception: Type[Exception], -) -> Callable[[Type[Exception], None], None]: +def external_error_raised(expected_exception: Type[Exception]) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message.
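The new match parameter behaves like the one on pytest.warns: besides seeing the expected category, at least one of its messages must re.search the pattern. A usage sketch; legacy_call and the message text are illustrative:

```python
import warnings
import pandas._testing as tm

def legacy_call():
    warnings.warn("old_api is deprecated, use new_api", FutureWarning)

with tm.assert_produces_warning(FutureWarning, match="deprecated"):
    legacy_call()   # passes: right class, message matches the pattern

# With match="unrelated" the block would raise:
# AssertionError: Did not see warning 'FutureWarning' matching unrelated
```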
diff --git a/pandas/_typing.py b/pandas/_typing.py index 8e98833ad37f7..09c490e64957d 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,5 +1,7 @@ from datetime import datetime, timedelta, tzinfo -from pathlib import Path +from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper +from mmap import mmap +from os import PathLike from typing import ( IO, TYPE_CHECKING, @@ -12,6 +14,8 @@ List, Mapping, Optional, + Sequence, + Tuple, Type, TypeVar, Union, @@ -23,14 +27,27 @@ # and use a string literal forward reference to it in subsequent types # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: - from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 + from typing import final + + from pandas._libs import Period, Timedelta, Timestamp + + from pandas.core.dtypes.dtypes import ExtensionDtype + + from pandas import Interval from pandas.core.arrays.base import ExtensionArray # noqa: F401 - from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 - from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame # noqa: F401 - from pandas import Interval # noqa: F401 - from pandas.core.series import Series # noqa: F401 - from pandas.core.frame import DataFrame # noqa: F401 + from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy + from pandas.core.indexes.base import Index + from pandas.core.resample import Resampler + from pandas.core.series import Series + from pandas.core.window.rolling import BaseWindow + + from pandas.io.formats.format import EngFormatter +else: + # typing.final does not exist until py38 + final = lambda x: x + # array-like @@ -57,10 +74,9 @@ # other Dtype = Union[ - "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] -FilePathOrBuffer = Union[str, Path, IO[AnyStr]] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -76,7 +92,9 @@ Axis = Union[str, int] Label = Optional[Hashable] +IndexLabel = Union[Label, Sequence[Label]] Level = Union[Label, int] +Shape = Tuple[int, ...] 
Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Axes = Collection @@ -99,8 +117,34 @@ # types of `func` kwarg for DataFrame.aggregate and Series.aggregate AggFuncTypeBase = Union[Callable, str] +AggFuncTypeDict = Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] AggFuncType = Union[ AggFuncTypeBase, List[AggFuncTypeBase], - Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], + AggFuncTypeDict, +] +AggObjType = Union[ + "Series", + "DataFrame", + "SeriesGroupBy", + "DataFrameGroupBy", + "BaseWindow", + "Resampler", ] + +# filenames and file-like-objects +Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] +FileOrBuffer = Union[str, Buffer[T]] +FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[T]] + +# for arbitrary kwargs passed during reading/writing files +StorageOptions = Optional[Dict[str, Any]] + + +# compression keywords and compression +CompressionDict = Dict[str, Any] +CompressionOptions = Optional[Union[str, CompressionDict]] + + +# type of float formatter in DataFrameFormatter +FloatFormatType = Union[str, Callable, "EngFormatter"] diff --git a/pandas/_version.py b/pandas/_version.py index 66e756a4744c8..14c2b5c6e7603 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -5,32 +5,36 @@ # that just contains the computed version number. # This file is released into the public domain. Generated by -# versioneer-0.15 (https://github.com/warner/python-versioneer) +# versioneer-0.19 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" import errno import os import re import subprocess import sys -from typing import Callable, Dict def get_keywords(): + """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = "$Format:%d$" git_full = "$Format:%H$" - keywords = {"refnames": git_refnames, "full": git_full} + git_date = "$Format:%ci$" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: - pass + """Container for Versioneer configuration parameters.""" def get_config(): + """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() @@ -44,14 +48,17 @@ def get_config(): class NotThisMethod(Exception): - pass + """Exception raised if a method is not valid for the current scenario.""" + +HANDLERS = {} -HANDLERS: Dict[str, Dict[str, Callable]] = {} +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" -def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator - def decorate(f: Callable) -> Callable: + def decorate(f): + """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f @@ -60,7 +67,8 @@ def decorate(f: Callable) -> Callable: return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): + """Call the given command(s).""" assert isinstance(commands, list) p = None for c in commands: @@ -70,58 +78,73 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): p = subprocess.Popen( [c] + args, cwd=cwd, + env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), ) break - except EnvironmentError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue if verbose: - print(f"unable to run {dispcmd}") + print("unable to run %s" % dispcmd) print(e) - return None + return None, None else: if verbose: print(f"unable to find command, tried {commands}") - return None + return None, None stdout = p.communicate()[0].strip().decode() if p.returncode != 0: if verbose: - print(f"unable to run {dispcmd} (error)") - return None - return stdout + print("unable to run %s (error)" % dispcmd) + print("stdout was %s" % stdout) + return None, p.returncode + return stdout, p.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): - # Source tarballs conventionally unpack into a directory that includes - # both the project name and a version string. - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print( - f"guessing rootdir is '{root}', but '{dirname}' " - f"doesn't start with prefix '{parentdir_prefix}'" - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - } + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. 
We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for i in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } + else: + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. keywords = {} try: - f = open(versionfile_abs, "r") + f = open(versionfile_abs) for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) @@ -131,16 +154,34 @@ def git_get_keywords(versionfile_abs): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) f.close() - except EnvironmentError: + except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" if not keywords: raise NotThisMethod("no keywords at all, weird") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: @@ -161,20 +202,21 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print(f"discarding '{','.join(refs - tags)}', no digits") + print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: - print(f"likely tags: {','.join(sorted(tags))}") + print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] if verbose: - print(f"picking {r}") + print("picking %s" % r) return { "version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, + "date": date, } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: @@ -184,34 +226,48 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", + "date": None, } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - # this runs 'git' from the root of the source tree. This only gets called - # if the git-archive 'subst' keywords were *not* expanded, and - # _version.py hasn't already been rewritten with a short version string, - # meaning we're inside a checked out source tree. - - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print(f"no .git in {root}") - raise NotThisMethod("no .git directory") + """Get version from 'git describe' in the root of the source tree. + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. + """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - # if there is a tag, this yields TAG-NUM-gHEX[-dirty] - # if there are no tags, this yields HEX[-dirty] (no NUM) - describe_out = run_command( - GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root + + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() @@ -238,18 +294,20 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = f"unable to parse git-describe output: '{describe_out}'" + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - msg = f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" if verbose: - print(msg) - pieces["error"] = msg + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag @@ -261,114 +319,129 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + return pieces def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces): - # now build up version string, with post-release "local version - # identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - # get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + """Build up version string, with post-release "local version identifier". - # exceptions: - # 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) - rendered += f"{pieces['distance']:d}.g{pieces['short']}" + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 - rendered = f"0+untagged.{pieces['distance']:d}.g{pieces['short']}" + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_pre(pieces): - # TAG[.post.devDISTANCE] . No -dirty - - # exceptions: - # 1: no tags. 0.post.devDISTANCE + """TAG[.post0.devDISTANCE] -- No -dirty. + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += f".post.dev{pieces['distance']:d}" + rendered += ".post0.dev%d" % pieces["distance"] else: # exception #1 - rendered = f"0.post.dev{pieces['distance']:d}" + rendered = "0.post0.dev%d" % pieces["distance"] return rendered def render_pep440_post(pieces): - # TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that - # .dev0 sorts backwards (a dirty tree will appear "older" than the - # corresponding clean one), but you shouldn't be releasing software with - # -dirty anyways. + """TAG[.postDISTANCE[.dev0]+gHEX] . 
- # exceptions: - # 1: no tags. 0.postDISTANCE[.dev0] + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += f".post{pieces['distance']:d}" + rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) - rendered += f"g{pieces['short']}" + rendered += "g%s" % pieces["short"] else: # exception #1 - rendered = f"0.pos{pieces['distance']:d}" + rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" - rendered += f"+g{pieces['short']}" + rendered += "+g%s" % pieces["short"] return rendered def render_pep440_old(pieces): - # TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. + """TAG[.postDISTANCE[.dev0]] . - # exceptions: - # 1: no tags. 0.postDISTANCE[.dev0] + The ".dev0" means dirty. + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += f".post{pieces['distance']:d}" + rendered += ".post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 - rendered = f"0.post{pieces['distance']:d}" + rendered = "0.post%d" % pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces): - # TAG[-DISTANCE-gHEX][-dirty], like 'git describe --tags --dirty - # --always' + """TAG[-DISTANCE-gHEX][-dirty]. - # exceptions: - # 1: no tags. HEX[-dirty] (note: no 'g' prefix) + Like 'git describe --tags --dirty --always'. + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += f"-{pieces['distance']:d}-g{pieces['short']}" + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] @@ -378,15 +451,17 @@ def render_git_describe(pieces): def render_git_describe_long(pieces): - # TAG-DISTANCE-gHEX[-dirty], like 'git describe --tags --dirty - # --always -long'. The distance/hash is unconditional. + """TAG-DISTANCE-gHEX[-dirty]. - # exceptions: - # 1: no tags. HEX[-dirty] (note: no 'g' prefix) + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] - rendered += f"-{pieces['distance']:d}-g{pieces['short']}" + rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] @@ -396,12 +471,14 @@ def render_git_describe_long(pieces): def render(pieces, style): + """Render the given version pieces into the requested style.""" if pieces["error"]: return { "version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], + "date": None, } if not style or style == "default": @@ -420,17 +497,19 @@ def render(pieces, style): elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: - raise ValueError(f"unknown style '{style}'") + raise ValueError("unknown style '%s'" % style) return { "version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date"), } def get_versions(): + """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which @@ -457,6 +536,7 @@ def get_versions(): "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", + "date": None, } try: @@ -476,4 +556,5 @@ def get_versions(): "full-revisionid": None, "dirty": None, "error": "unable to compute version", + "date": None, } diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index 3495b493707c2..fb1abdd5b18ec 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -4,7 +4,7 @@ from pandas._libs.lib import infer_dtype -from pandas.core.dtypes.api import * # noqa: F403, F401 +from pandas.core.dtypes.api import * # noqa: F401, F403 from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ( CategoricalDtype, diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 61832a8b6d621..0fa070b6e4fc4 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -7,6 +7,7 @@ BooleanArray, Categorical, DatetimeArray, + FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -20,6 +21,7 @@ "BooleanArray", "Categorical", "DatetimeArray", + "FloatingArray", "IntegerArray", "IntervalArray", "PandasArray", diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index f7bb73b916ce0..2ac9b9e2c875c 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -8,24 +8,15 @@ * platform checker """ import platform -import struct import sys import warnings from pandas._typing import F -PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) PYPY = platform.python_implementation() == "PyPy" - - -# ---------------------------------------------------------------------------- -# functions largely based / taken from the six module - -# Much of the code in this module comes from Benjamin Peterson's six library. 
-# The license for this library can be found in LICENSES/SIX and the code can be -# found at https://bitbucket.org/gutworth/six +IS64 = sys.maxsize > 2 ** 32 def set_function_name(f: F, name: str, cls) -> F: @@ -38,7 +29,6 @@ def set_function_name(f: F, name: str, cls) -> F: return f -# https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian() -> bool: """ Checking if the running platform is little endian. @@ -60,7 +50,7 @@ def is_platform_windows() -> bool: bool True if the running platform is windows. """ - return sys.platform == "win32" or sys.platform == "cygwin" + return sys.platform in ["win32", "cygwin"] def is_platform_linux() -> bool: @@ -72,7 +62,7 @@ def is_platform_linux() -> bool: bool True if the running platform is linux. """ - return sys.platform == "linux2" + return sys.platform == "linux" def is_platform_mac() -> bool: @@ -87,19 +77,7 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" -def is_platform_32bit() -> bool: - """ - Checking if the running platform is 32-bit. - - Returns - ------- - bool - True if the running platform is 32-bit. - """ - return struct.calcsize("P") * 8 < 64 - - -def _import_lzma(): +def import_lzma(): """ Importing the `lzma` module. @@ -119,7 +97,7 @@ def _import_lzma(): warnings.warn(msg) -def _get_lzma_file(lzma): +def get_lzma_file(lzma): """ Importing the `LZMAFile` class from the `lzma` module. diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6423064732def..533e67acfa2f4 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -11,28 +11,40 @@ "fsspec": "0.7.4", "fastparquet": "0.3.2", "gcsfs": "0.6.0", - "lxml.etree": "3.8.0", - "matplotlib": "2.2.2", - "numexpr": "2.6.2", + "lxml.etree": "4.3.0", + "matplotlib": "2.2.3", + "numexpr": "2.6.8", "odfpy": "1.3.0", "openpyxl": "2.5.7", "pandas_gbq": "0.12.0", - "pyarrow": "0.13.0", - "pytables": "3.4.3", + "pyarrow": "0.15.0", "pytest": "5.0.1", "pyxlsb": "1.0.6", "s3fs": "0.4.0", "scipy": "1.2.0", - "sqlalchemy": "1.1.4", - "tables": "3.4.3", + "sqlalchemy": "1.2.8", + "tables": "3.5.1", "tabulate": "0.8.3", - "xarray": "0.8.2", - "xlrd": "1.1.0", - "xlwt": "1.2.0", - "xlsxwriter": "0.9.8", + "xarray": "0.12.3", + "xlrd": "1.2.0", + "xlwt": "1.3.0", + "xlsxwriter": "1.0.2", "numba": "0.46.0", } +# A mapping from import name to package name (on PyPI) for packages where +# these two names are different. + +INSTALL_MAPPING = { + "bs4": "beautifulsoup4", + "bottleneck": "Bottleneck", + "lxml.etree": "lxml", + "odf": "odfpy", + "pandas_gbq": "pandas-gbq", + "sqlalchemy": "SQLAlchemy", + "jinja2": "Jinja2", +} + def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -82,9 +94,13 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + + package_name = INSTALL_MAPPING.get(name) + install_name = package_name if package_name is not None else name + msg = ( - f"Missing optional dependency '{name}'. {extra} " - f"Use pip or conda to install {name}." + f"Missing optional dependency '{install_name}'. {extra} " + f"Use pip or conda to install {install_name}." 
) try: module = importlib.import_module(name) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 789a4668b6fee..a2444b7ba5a0d 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,19 +8,19 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p16 = _nlv < LooseVersion("1.16") -_np_version_under1p17 = _nlv < LooseVersion("1.17") -_np_version_under1p18 = _nlv < LooseVersion("1.18") +np_version_under1p17 = _nlv < LooseVersion("1.17") +np_version_under1p18 = _nlv < LooseVersion("1.18") _np_version_under1p19 = _nlv < LooseVersion("1.19") _np_version_under1p20 = _nlv < LooseVersion("1.20") -_is_numpy_dev = ".dev" in str(_nlv) +is_numpy_dev = ".dev" in str(_nlv) +_min_numpy_ver = "1.16.5" -if _nlv < "1.15.4": +if _nlv < _min_numpy_ver: raise ImportError( - "this version of pandas is incompatible with numpy < 1.15.4\n" + f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n" f"your numpy version is {_np_version}.\n" - "Please upgrade numpy to >= 1.15.4 to use this pandas version" + f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version" ) @@ -65,7 +65,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = [ "np", "_np_version", - "_np_version_under1p16", - "_np_version_under1p17", - "_is_numpy_dev", + "np_version_under1p17", + "is_numpy_dev", ] diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index d7a14c28cc9ca..c47c31fabeb70 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -1,27 +1,24 @@ """ -For compatibility with numpy libraries, pandas functions or -methods have to accept '*args' and '**kwargs' parameters to -accommodate numpy arguments that are not actually used or -respected in the pandas implementation. - -To ensure that users do not abuse these parameters, validation -is performed in 'validators.py' to make sure that any extra -parameters passed correspond ONLY to those in the numpy signature. -Part of that validation includes whether or not the user attempted -to pass in non-default values for these extraneous parameters. As we -want to discourage users from relying on these parameters when calling -the pandas implementation, we want them only to pass in the default values -for these parameters. - -This module provides a set of commonly used default arguments for functions -and methods that are spread throughout the codebase. This module will make it +For compatibility with numpy libraries, pandas functions or methods have to +accept '*args' and '**kwargs' parameters to accommodate numpy arguments that +are not actually used or respected in the pandas implementation. + +To ensure that users do not abuse these parameters, validation is performed in +'validators.py' to make sure that any extra parameters passed correspond ONLY +to those in the numpy signature. Part of that validation includes whether or +not the user attempted to pass in non-default values for these extraneous +parameters. As we want to discourage users from relying on these parameters +when calling the pandas implementation, we want them only to pass in the +default values for these parameters. + +This module provides a set of commonly used default arguments for functions and +methods that are spread throughout the codebase. This module will make it easier to adjust to future upstream changes in the analogous numpy signatures. 
""" -from collections import OrderedDict from distutils.version import LooseVersion from typing import Any, Dict, Optional, Union -from numpy import __version__ as _np_version, ndarray +from numpy import __version__, ndarray from pandas._libs.lib import is_bool, is_integer from pandas.errors import UnsupportedFunctionCall @@ -74,7 +71,7 @@ def __call__( raise ValueError(f"invalid validation method '{method}'") -ARGMINMAX_DEFAULTS = dict(out=None) +ARGMINMAX_DEFAULTS = {"out": None} validate_argmin = CompatValidator( ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 ) @@ -93,11 +90,10 @@ def process_skipna(skipna, args): def validate_argmin_with_skipna(skipna, args, kwargs): """ - If 'Series.argmin' is called via the 'numpy' library, - the third parameter in its signature is 'out', which - takes either an ndarray or 'None', so check if the - 'skipna' parameter is either an instance of ndarray or - is None, since 'skipna' itself should be a boolean + If 'Series.argmin' is called via the 'numpy' library, the third parameter + in its signature is 'out', which takes either an ndarray or 'None', so + check if the 'skipna' parameter is either an instance of ndarray or is + None, since 'skipna' itself should be a boolean """ skipna, args = process_skipna(skipna, args) validate_argmin(args, kwargs) @@ -106,23 +102,22 @@ def validate_argmin_with_skipna(skipna, args, kwargs): def validate_argmax_with_skipna(skipna, args, kwargs): """ - If 'Series.argmax' is called via the 'numpy' library, - the third parameter in its signature is 'out', which - takes either an ndarray or 'None', so check if the - 'skipna' parameter is either an instance of ndarray or - is None, since 'skipna' itself should be a boolean + If 'Series.argmax' is called via the 'numpy' library, the third parameter + in its signature is 'out', which takes either an ndarray or 'None', so + check if the 'skipna' parameter is either an instance of ndarray or is + None, since 'skipna' itself should be a boolean """ skipna, args = process_skipna(skipna, args) validate_argmax(args, kwargs) return skipna -ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() +ARGSORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None -if LooseVersion(_np_version) >= LooseVersion("1.17.0"): +if LooseVersion(__version__) >= LooseVersion("1.17.0"): # GH-26361. NumPy added radix sort and changed default to None. 
ARGSORT_DEFAULTS["kind"] = None @@ -131,9 +126,9 @@ def validate_argmax_with_skipna(skipna, args, kwargs): ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both" ) -# two different signatures of argsort, this second validation -# for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() +# two different signatures of argsort, this second validation for when the +# `kind` param is supported +ARGSORT_DEFAULTS_KIND: Dict[str, Optional[int]] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -143,11 +138,10 @@ def validate_argmax_with_skipna(skipna, args, kwargs): def validate_argsort_with_ascending(ascending, args, kwargs): """ - If 'Categorical.argsort' is called via the 'numpy' library, the - first parameter in its signature is 'axis', which takes either - an integer or 'None', so check if the 'ascending' parameter has - either integer type or is None, since 'ascending' itself should - be a boolean + If 'Categorical.argsort' is called via the 'numpy' library, the first + parameter in its signature is 'axis', which takes either an integer or + 'None', so check if the 'ascending' parameter has either integer type or is + None, since 'ascending' itself should be a boolean """ if is_integer(ascending) or ascending is None: args = (ascending,) + args @@ -157,7 +151,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs): return ascending -CLIP_DEFAULTS: Dict[str, Any] = dict(out=None) +CLIP_DEFAULTS: Dict[str, Any] = {"out": None} validate_clip = CompatValidator( CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 ) @@ -165,10 +159,10 @@ def validate_argsort_with_ascending(ascending, args, kwargs): def validate_clip_with_axis(axis, args, kwargs): """ - If 'NDFrame.clip' is called via the numpy library, the third - parameter in its signature is 'out', which can takes an ndarray, - so check if the 'axis' parameter is an instance of ndarray, since - 'axis' itself should either be an integer or None + If 'NDFrame.clip' is called via the numpy library, the third parameter in + its signature is 'out', which can takes an ndarray, so check if the 'axis' + parameter is an instance of ndarray, since 'axis' itself should either be + an integer or None """ if isinstance(axis, ndarray): args = (axis,) + args @@ -178,7 +172,7 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() +CUM_FUNC_DEFAULTS: Dict[str, Any] = {} CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -191,10 +185,9 @@ def validate_clip_with_axis(axis, args, kwargs): def validate_cum_func_with_skipna(skipna, args, kwargs, name): """ - If this function is called via the 'numpy' library, the third - parameter in its signature is 'dtype', which takes either a - 'numpy' dtype or 'None', so check if the 'skipna' parameter is - a boolean or not + If this function is called via the 'numpy' library, the third parameter in + its signature is 'dtype', which takes either a 'numpy' dtype or 'None', so + check if the 'skipna' parameter is a boolean or not """ if not is_bool(skipna): args = (skipna,) + args @@ -204,7 +197,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() +ALLANY_DEFAULTS: Dict[str, Optional[bool]] = {} ALLANY_DEFAULTS["dtype"] = None 
ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -215,10 +208,10 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 ) -LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) +LOGICAL_FUNC_DEFAULTS = {"out": None, "keepdims": False} validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") -MINMAX_DEFAULTS = dict(axis=None, out=None, keepdims=False) +MINMAX_DEFAULTS = {"axis": None, "out": None, "keepdims": False} validate_min = CompatValidator( MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 ) @@ -226,28 +219,28 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") +RESHAPE_DEFAULTS: Dict[str, str] = {"order": "C"} validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 ) -REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) +REPEAT_DEFAULTS: Dict[str, Any] = {"axis": None} validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) +ROUND_DEFAULTS: Dict[str, Any] = {"out": None} validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() +SORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = OrderedDict() +STAT_FUNC_DEFAULTS: Dict[str, Optional[Any]] = {} STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -281,13 +274,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() +STAT_DDOF_FUNC_DEFAULTS: Dict[str, Optional[bool]] = {} STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() +TAKE_DEFAULTS: Dict[str, Optional[str]] = {} TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") @@ -295,10 +288,9 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): def validate_take_with_convert(convert, args, kwargs): """ - If this function is called via the 'numpy' library, the third - parameter in its signature is 'axis', which takes either an - ndarray or 'None', so check if the 'convert' parameter is either - an instance of ndarray or is None + If this function is called via the 'numpy' library, the third parameter in + its signature is 'axis', which takes either an ndarray or 'None', so check + if the 'convert' parameter is either an instance of ndarray or is None """ if isinstance(convert, ndarray) or convert is None: args = (convert,) + args @@ -308,7 +300,7 @@ def validate_take_with_convert(convert, args, kwargs): return convert -TRANSPOSE_DEFAULTS = dict(axes=None) +TRANSPOSE_DEFAULTS = {"axes": None} validate_transpose = CompatValidator( 
TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 ) @@ -361,10 +353,9 @@ def validate_expanding_func(name, args, kwargs) -> None: def validate_groupby_func(name, args, kwargs, allowed=None) -> None: """ - 'args' and 'kwargs' should be empty, except for allowed - kwargs because all of - their necessary parameters are explicitly listed in - the function signature + 'args' and 'kwargs' should be empty, except for allowed kwargs because all + of their necessary parameters are explicitly listed in the function + signature """ if allowed is None: allowed = [] @@ -383,9 +374,8 @@ def validate_groupby_func(name, args, kwargs, allowed=None) -> None: def validate_resampler_func(method: str, args, kwargs) -> None: """ - 'args' and 'kwargs' should be empty because all of - their necessary parameters are explicitly listed in - the function signature + 'args' and 'kwargs' should be empty because all of their necessary + parameters are explicitly listed in the function signature """ if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: @@ -397,20 +387,20 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int]) -> None: +def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: """ - Ensure that the axis argument passed to min, max, argmin, or argmax is - zero or None, as otherwise it will be incorrectly ignored. + Ensure that the axis argument passed to min, max, argmin, or argmax is zero + or None, as otherwise it will be incorrectly ignored. Parameters ---------- axis : int or None + ndim : int, default 1 Raises ------ ValueError """ - ndim = 1 # hard-coded for Index if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 0484de3fa165d..80ee1f2e20154 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -14,7 +14,7 @@ from pandas import Index if TYPE_CHECKING: - from pandas import Series, DataFrame + from pandas import DataFrame, Series def load_reduce(self): @@ -64,7 +64,7 @@ class _LoadSparseSeries: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "Series", but must return # a subtype of "_LoadSparseSeries") - def __new__(cls) -> "Series": # type: ignore + def __new__(cls) -> "Series": # type: ignore[misc] from pandas import Series warnings.warn( @@ -82,7 +82,7 @@ class _LoadSparseFrame: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "DataFrame", but must # return a subtype of "_LoadSparseFrame") - def __new__(cls) -> "DataFrame": # type: ignore + def __new__(cls) -> "DataFrame": # type: ignore[misc] from pandas import DataFrame warnings.warn( @@ -181,7 +181,7 @@ def __new__(cls) -> "DataFrame": # type: ignore # functions for compat and uses a non-public class of the pickle module. 
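# (note: the error-code-scoped "type: ignore[name-defined]" below silences
# only that specific mypy error, unlike the previous bare "type: ignore")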
# error: Name 'pkl._Unpickler' is not defined -class Unpickler(pkl._Unpickler): # type: ignore +class Unpickler(pkl._Unpickler): # type: ignore[name-defined] def find_class(self, module, name): # override superclass key = (module, name) @@ -274,7 +274,7 @@ def patch_pickle(): """ orig_loads = pkl.loads try: - pkl.loads = loads + setattr(pkl, "loads", loads) yield finally: - pkl.loads = orig_loads + setattr(pkl, "loads", orig_loads) diff --git a/pandas/conftest.py b/pandas/conftest.py index e0adb37e7d2f5..2bac2ed198789 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -33,8 +33,10 @@ import pandas.util._test_decorators as td +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype + import pandas as pd -from pandas import DataFrame +from pandas import DataFrame, Interval, Period, Series, Timedelta, Timestamp import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex @@ -55,6 +57,9 @@ def pytest_configure(config): ) config.addinivalue_line("markers", "high_memory: mark a test as a high-memory only") config.addinivalue_line("markers", "clipboard: mark a pd.read_clipboard test") + config.addinivalue_line( + "markers", "arm_slow: mark a test as slow for arm64 architecture" + ) def pytest_addoption(parser): @@ -171,14 +176,6 @@ def axis(request): axis_frame = axis -@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") -def axis_series(request): - """ - Fixture for returning the axis numbers of a Series. - """ - return request.param - - @pytest.fixture(params=[True, False, None]) def observed(request): """ @@ -291,11 +288,22 @@ def unique_nulls_fixture(request): # Generate cartesian product of unique_nulls_fixture: unique_nulls_fixture2 = unique_nulls_fixture - # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- -@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) + + +@pytest.fixture(params=[pd.DataFrame, pd.Series]) +def frame_or_series(request): + """ + Fixture to parametrize over DataFrame and Series. 
+ """ + return request.param + + +@pytest.fixture( + params=[pd.Index, pd.Series], ids=["index", "series"] # type: ignore[list-item] +) def index_or_series(request): """ Fixture to parametrize over Index and Series, made necessary by a mypy @@ -312,6 +320,16 @@ def index_or_series(request): index_or_series2 = index_or_series +@pytest.fixture( + params=[pd.Index, pd.Series, pd.array], ids=["index", "series", "array"] +) +def index_or_series_or_array(request): + """ + Fixture to parametrize over Index, Series, and ExtensionArray + """ + return request.param + + @pytest.fixture def dict_subclass(): """ @@ -359,11 +377,24 @@ def multiindex_year_month_day_dataframe_random_data(): tdf = tm.makeTimeDataFrame(100) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index = ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels]) ymd.index.set_names(["year", "month", "day"], inplace=True) return ymd +@pytest.fixture +def multiindex_dataframe_random_data(): + """DataFrame with 2 level MultiIndex with random data""" + index = MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["first", "second"], + ) + return DataFrame( + np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") + ) + + def _create_multiindex(): """ MultiIndex used to test the general functionality of this object @@ -376,13 +407,12 @@ def _create_multiindex(): major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ["first", "second"] - mi = MultiIndex( + return MultiIndex( levels=[major_axis, minor_axis], codes=[major_codes, minor_codes], names=index_names, verify_integrity=False, ) - return mi def _create_mi_with_dt64tz_level(): @@ -437,6 +467,29 @@ def index(request): index_fixture2 = index +@pytest.fixture(params=indices_dict.keys()) +def index_with_missing(request): + """ + Fixture for indices with missing values + """ + if request.param in ["int", "uint", "range", "empty", "repeats"]: + pytest.xfail("missing values not supported") + # GH 35538. Use deep copy to avoid illusive bug on np-dev + # Azure pipeline that writes into indices_dict despite copy + ind = indices_dict[request.param].copy(deep=True) + vals = ind.values + if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: + # For setting missing values in the top level of MultiIndex + vals = ind.tolist() + vals[0] = (None,) + vals[0][1:] + vals[-1] = (None,) + vals[-1][1:] + return MultiIndex.from_tuples(vals) + else: + vals[0] = None + vals[-1] = None + return type(ind)(vals) + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- @@ -496,6 +549,23 @@ def series_with_simple_index(index): return _create_series(index) +@pytest.fixture +def series_with_multilevel_index(): + """ + Fixture with a Series with a 2-level MultiIndex. 
+ """ + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + data = np.random.randn(8) + ser = Series(data, index=index) + ser[3] = np.NaN + return ser + + _narrow_dtypes = [ np.float16, np.float32, @@ -628,6 +698,26 @@ def float_frame(): return DataFrame(tm.getSeriesData()) +# ---------------------------------------------------------------- +# Scalars +# ---------------------------------------------------------------- +@pytest.fixture( + params=[ + (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0.1, right=0.5), IntervalDtype("float64")), + (Period("2012-01", freq="M"), "period[M]"), + (Period("2012-02-01", freq="D"), "period[D]"), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + (Timedelta(seconds=500), "timedelta64[ns]"), + ] +) +def ea_scalar_and_dtype(request): + return request.param + + # ---------------------------------------------------------------- # Operators & Operations # ---------------------------------------------------------------- @@ -657,6 +747,43 @@ def all_arithmetic_operators(request): return request.param +@pytest.fixture( + params=[ + operator.add, + ops.radd, + operator.sub, + ops.rsub, + operator.mul, + ops.rmul, + operator.truediv, + ops.rtruediv, + operator.floordiv, + ops.rfloordiv, + operator.mod, + ops.rmod, + operator.pow, + ops.rpow, + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.and_, + ops.rand_, + operator.xor, + ops.rxor, + operator.or_, + ops.ror_, + ] +) +def all_binary_operators(request): + """ + Fixture for operator and roperator arithmetic, comparison, and logical ops. + """ + return request.param + + @pytest.fixture( params=[ operator.add, @@ -837,6 +964,10 @@ def iris(datapath): "Asia/Tokyo", "dateutil/US/Pacific", "dateutil/Asia/Singapore", + "+01:15", + "-02:15", + "UTC+01:15", + "UTC-02:15", tzutc(), tzlocal(), FixedOffset(300), @@ -958,6 +1089,31 @@ def float_dtype(request): return request.param +@pytest.fixture(params=tm.FLOAT_EA_DTYPES) +def float_ea_dtype(request): + """ + Parameterized fixture for float dtypes. + + * 'Float32' + * 'Float64' + """ + return request.param + + +@pytest.fixture(params=tm.FLOAT_DTYPES + tm.FLOAT_EA_DTYPES) +def any_float_allowed_nullable_dtype(request): + """ + Parameterized fixture for float dtypes. + + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + """ + return request.param + + @pytest.fixture(params=tm.COMPLEX_DTYPES) def complex_dtype(request): """ @@ -1032,6 +1188,39 @@ def any_nullable_int_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_EA_INT_DTYPES + tm.FLOAT_EA_DTYPES) +def any_numeric_dtype(request): + """ + Parameterized fixture for any nullable integer dtype and + any float ea dtypes. + + * 'UInt8' + * 'Int8' + * 'UInt16' + * 'Int16' + * 'UInt32' + * 'Int32' + * 'UInt64' + * 'Int64' + * 'Float32' + * 'Float64' + """ + return request.param + + +@pytest.fixture(params=tm.SIGNED_EA_INT_DTYPES) +def any_signed_nullable_int_dtype(request): + """ + Parameterized fixture for any signed nullable integer dtype. 
+ + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + """ + return request.param + + @pytest.fixture(params=tm.ALL_REAL_DTYPES) def any_real_dtype(request): """ @@ -1181,7 +1370,13 @@ def ip(): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.interactiveshell import InteractiveShell - return InteractiveShell() + # GH#35711 make sure sqlite history file handle is not leaked + from traitlets.config import Config # isort:skip + + c = Config() + c.HistoryManager.hist_file = ":memory:" + + return InteractiveShell(config=c) @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) @@ -1194,15 +1389,6 @@ def spmatrix(request): return getattr(sparse, request.param + "_matrix") -@pytest.fixture(params=list(tm.cython_table)) -def cython_table_items(request): - """ - Yields a tuple of a function and its corresponding name. Correspond to - the list of aggregator "Cython functions" used on selected table items. - """ - return request.param - - @pytest.fixture( params=[ getattr(pd.offsets, o) @@ -1224,3 +1410,39 @@ def sort_by_key(request): Tests None (no key) and the identity key. """ return request.param + + +@pytest.fixture() +def fsspectest(): + pytest.importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry + + class TestMemoryFS(MemoryFileSystem): + protocol = "testmem" + test = [None] + + def __init__(self, **kwargs): + self.test[0] = kwargs.pop("test", None) + super().__init__(**kwargs) + + register_implementation("testmem", TestMemoryFS, clobber=True) + yield TestMemoryFS() + registry.pop("testmem", None) + TestMemoryFS.test[0] = None + TestMemoryFS.store.clear() + + +@pytest.fixture( + params=[ + ("foo", None, None), + ("Egon", "Venkman", None), + ("NCC1701D", "NCC1701D", "NCC1701D"), + ] +) +def names(request): + """ + A 3-tuple of names, the first two for operands, the last for a result. + """ + return request.param diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 2caf1f75f3da1..15c2a4a6c5c04 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -4,7 +4,7 @@ that can be mixed into or pinned onto other pandas classes. """ -from typing import FrozenSet, Set +from typing import FrozenSet, List, Set import warnings from pandas.util._decorators import doc @@ -12,28 +12,21 @@ class DirNamesMixin: _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset() + _hidden_attrs: FrozenSet[str] = frozenset() - def _dir_deletions(self): + def _dir_deletions(self) -> Set[str]: """ Delete unwanted __dir__ for this object. """ - return self._accessors | self._deprecations + return self._accessors | self._hidden_attrs - def _dir_additions(self): + def _dir_additions(self) -> Set[str]: """ Add additional __dir__ for this object. """ - rv = set() - for accessor in self._accessors: - try: - getattr(self, accessor) - rv.add(accessor) - except AttributeError: - pass - return rv + return {accessor for accessor in self._accessors if hasattr(self, accessor)} - def __dir__(self): + def __dir__(self) -> List[str]: """ Provide method name lookup and completion. @@ -41,7 +34,7 @@ def __dir__(self): ----- Only provide 'public' methods. 
""" - rv = set(dir(type(self))) + rv = set(super().__dir__()) rv = (rv - self._dir_deletions()) | self._dir_additions() return sorted(rv) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 891048ae82dfd..c64f0bd71cf84 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -6,32 +6,46 @@ from collections import defaultdict from functools import partial from typing import ( + TYPE_CHECKING, Any, Callable, DefaultDict, Dict, + Iterable, List, Optional, Sequence, Tuple, Union, + cast, ) -from pandas._typing import AggFuncType, Label +from pandas._typing import ( + AggFuncType, + AggFuncTypeBase, + AggFuncTypeDict, + AggObjType, + Axis, + FrameOrSeries, + FrameOrSeriesUnion, + Label, +) +from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import is_dict_like, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCNDFrame, ABCSeries -from pandas.core.base import SpecificationError +from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index -from pandas.core.series import FrameOrSeriesUnion, Series + +if TYPE_CHECKING: + from pandas.core.series import Series def reconstruct_func( - func: Optional[AggFuncType], **kwargs, -) -> Tuple[ - bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], -]: + func: Optional[AggFuncType], **kwargs +) -> Tuple[bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -63,7 +77,7 @@ def reconstruct_func( Examples -------- >>> reconstruct_func(None, **{"foo": ("col", "min")}) - (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) + (True, defaultdict(, {'col': ['min']}), ('foo',), array([0])) >>> reconstruct_func("min") (False, 'min', None, None) @@ -87,7 +101,6 @@ def reconstruct_func( if relabeling: func, columns, order = normalize_keyword_aggregation(kwargs) - func = maybe_mangle_lambdas(func) return relabeling, func, columns, order @@ -278,12 +291,13 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: def relabel_result( - result: FrameOrSeriesUnion, + result: FrameOrSeries, func: Dict[str, List[Union[Callable, str]]], - columns: Tuple, - order: List[int], -) -> Dict[Label, Series]: - """Internal function to reorder result if relabelling is True for + columns: Iterable[Label], + order: Iterable[int], +) -> Dict[Label, "Series"]: + """ + Internal function to reorder result if relabelling is True for dataframe.agg, and return the reordered result in dict. Parameters: @@ -308,10 +322,10 @@ def relabel_result( reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] - reordered_result_in_dict: Dict[Label, Series] = {} + reordered_result_in_dict: Dict[Label, "Series"] = {} idx = 0 - reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 + reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1 for col, fun in func.items(): s = result[col].dropna() @@ -374,7 +388,7 @@ def validate_func_kwargs( (['one', 'two'], ['min', 'max']) """ no_arg_message = "Must provide 'func' or named aggregation **kwargs." - tuple_given_message = "func is expected but recieved {} in **kwargs." + tuple_given_message = "func is expected but received {} in **kwargs." 
columns = list(kwargs) func = [] for col_func in kwargs.values(): @@ -384,3 +398,390 @@ def validate_func_kwargs( if not columns: raise TypeError(no_arg_message) return columns, func + + +def transform( + obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs +) -> FrameOrSeriesUnion: + """ + Transform a DataFrame or Series + + Parameters + ---------- + obj : DataFrame or Series + Object to compute the transform on. + func : string, function, list, or dictionary + Function(s) to compute the transform with. + axis : {0 or 'index', 1 or 'columns'} + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + Returns + ------- + DataFrame or Series + Result of applying ``func`` along the given axis of the + Series or DataFrame. + + Raises + ------ + ValueError + If the transform function fails or does not transform. + """ + is_series = obj.ndim == 1 + + if obj._get_axis_number(axis) == 1: + assert not is_series + return transform(obj.T, func, 0, *args, **kwargs).T + + if is_list_like(func) and not is_dict_like(func): + func = cast(List[AggFuncTypeBase], func) + # Convert func equivalent dict + if is_series: + func = {com.get_callable_name(v) or v: v for v in func} + else: + func = {col: func for col in obj} + + if is_dict_like(func): + func = cast(AggFuncTypeDict, func) + return transform_dict_like(obj, func, *args, **kwargs) + + # func is either str or callable + func = cast(AggFuncTypeBase, func) + try: + result = transform_str_or_callable(obj, func, *args, **kwargs) + except Exception: + raise ValueError("Transform function failed") + + # Functions that transform may return empty Series/DataFrame + # when the dtype is not appropriate + if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: + raise ValueError("Transform function failed") + if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( + obj.index + ): + raise ValueError("Function did not transform") + + return result + + +def transform_dict_like( + obj: FrameOrSeries, + func: AggFuncTypeDict, + *args, + **kwargs, +): + """ + Compute transform in the case of a dict-like func + """ + from pandas.core.reshape.concat import concat + + if len(func) == 0: + raise ValueError("No transform functions were provided") + + if obj.ndim != 1: + # Check for missing columns on a frame + cols = sorted(set(func.keys()) - set(obj.columns)) + if len(cols) > 0: + raise SpecificationError(f"Column(s) {cols} do not exist") + + # Can't use func.values(); wouldn't work for a Series + if any(is_dict_like(v) for _, v in func.items()): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested renamer is not supported") + + results: Dict[Label, FrameOrSeriesUnion] = {} + for name, how in func.items(): + colg = obj._gotitem(name, ndim=1) + try: + results[name] = transform(colg, how, 0, *args, **kwargs) + except Exception as err: + if ( + str(err) == "Function did not transform" + or str(err) == "No transform functions were provided" + ): + raise err + + # combine results + if len(results) == 0: + raise ValueError("Transform function failed") + return concat(results, axis=1) + + +def transform_str_or_callable( + obj: FrameOrSeries, func: AggFuncTypeBase, *args, **kwargs +) -> FrameOrSeriesUnion: + """ + Compute transform in the case of a string or callable func + """ + if isinstance(func, str): + return obj._try_aggregate_string_function(func, *args, **kwargs) + + if not args and not kwargs: + f = 
obj._get_cython_func(func) + if f: + return getattr(obj, f)() + + # Two possible ways to use a UDF - apply or call directly + try: + return obj.apply(func, args=args, **kwargs) + except Exception: + return func(obj, *args, **kwargs) + + +def aggregate( + obj: AggObjType, + arg: AggFuncType, + *args, + **kwargs, +): + """ + Provide an implementation for the aggregators. + + Parameters + ---------- + obj : Pandas object to compute aggregation on. + arg : string, dict, function. + *args : args to pass on to the function. + **kwargs : kwargs to pass on to the function. + + Returns + ------- + tuple of result, how. + + Notes + ----- + how can be a string describe the required post-processing, or + None if not required. + """ + _axis = kwargs.pop("_axis", None) + if _axis is None: + _axis = getattr(obj, "axis", 0) + + if isinstance(arg, str): + return obj._try_aggregate_string_function(arg, *args, **kwargs), None + elif is_dict_like(arg): + arg = cast(AggFuncTypeDict, arg) + return agg_dict_like(obj, arg, _axis), True + elif is_list_like(arg): + # we require a list, but not an 'str' + arg = cast(List[AggFuncTypeBase], arg) + return agg_list_like(obj, arg, _axis=_axis), None + else: + result = None + + if callable(arg): + f = obj._get_cython_func(arg) + if f and not args and not kwargs: + return getattr(obj, f)(), None + + # caller can react + return result, True + + +def agg_list_like( + obj: AggObjType, + arg: List[AggFuncTypeBase], + _axis: int, +) -> FrameOrSeriesUnion: + """ + Compute aggregation in the case of a list-like argument. + + Parameters + ---------- + obj : Pandas object to compute aggregation on. + arg : list + Aggregations to compute. + _axis : int, 0 or 1 + Axis to compute aggregation on. + + Returns + ------- + Result of aggregation. + """ + from pandas.core.reshape.concat import concat + + if _axis != 0: + raise NotImplementedError("axis other than 0 is not supported") + + if obj._selected_obj.ndim == 1: + selected_obj = obj._selected_obj + else: + selected_obj = obj._obj_with_exclusions + + results = [] + keys = [] + + # degenerate case + if selected_obj.ndim == 1: + for a in arg: + colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) + try: + new_res = colg.aggregate(a) + + except TypeError: + pass + else: + results.append(new_res) + + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + # multiples + else: + for index, col in enumerate(selected_obj): + colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) + try: + new_res = colg.aggregate(arg) + except (TypeError, DataError): + pass + except ValueError as err: + # cannot aggregate + if "Must produce aggregated value" in str(err): + # raised directly in _aggregate_named + pass + elif "no results" in str(err): + # raised directly in _aggregate_multiple_funcs + pass + else: + raise + else: + results.append(new_res) + keys.append(col) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + return concat(results, keys=keys, axis=1, sort=False) + except TypeError as err: + + # we are concatting non-NDFrame objects, + # e.g. 
a list of scalars + + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + if is_nested_object(result): + raise ValueError( + "cannot combine transform and aggregation operations" + ) from err + return result + + +def agg_dict_like( + obj: AggObjType, + arg: AggFuncTypeDict, + _axis: int, +) -> FrameOrSeriesUnion: + """ + Compute aggregation in the case of a dict-like argument. + + Parameters + ---------- + obj : Pandas object to compute aggregation on. + arg : dict + label-aggregation pairs to compute. + _axis : int, 0 or 1 + Axis to compute aggregation on. + + Returns + ------- + Result of aggregation. + """ + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + + if _axis != 0: # pragma: no cover + raise ValueError("Can only pass dict with axis=0") + + selected_obj = obj._selected_obj + + # if we have a dict of any non-scalars + # eg. {'A' : ['mean']}, normalize all to + # be list-likes + if any(is_aggregator(x) for x in arg.values()): + new_arg: AggFuncTypeDict = {} + for k, v in arg.items(): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + + # the keys must be in the columns + # for ndim=2, or renamers for ndim=1 + + # ok for now, but deprecated + # {'A': { 'ra': 'mean' }} + # {'A': { 'ra': ['mean'] }} + # {'ra': ['mean']} + + # not ok + # {'ra' : { 'A' : 'mean' }} + if isinstance(v, dict): + raise SpecificationError("nested renamer is not supported") + elif isinstance(selected_obj, ABCSeries): + raise SpecificationError("nested renamer is not supported") + elif ( + isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns + ): + raise KeyError(f"Column '{k}' does not exist!") + + arg = new_arg + + else: + # deprecation of renaming keys + # GH 15931 + keys = list(arg.keys()) + if isinstance(selected_obj, ABCDataFrame) and len( + selected_obj.columns.intersection(keys) + ) != len(keys): + cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys))) + raise SpecificationError(f"Column(s) {cols} do not exist") + + from pandas.core.reshape.concat import concat + + if selected_obj.ndim == 1: + # key only used for output + colg = obj._gotitem(obj._selection, ndim=1) + results = {key: colg.agg(how) for key, how in arg.items()} + else: + # key used for column selection and output + results = {key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()} + + # set the final keys + keys = list(arg.keys()) + + # Avoid making two isinstance calls in all and any below + is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] + + # combine results + if all(is_ndframe): + keys_to_use = [k for k in keys if not results[k].empty] + # Have to check, if at least one DataFrame is not empty. 
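# (when every result is empty, fall back to the full key list so the
# concat below still returns a correctly-labelled empty result)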
+ keys_to_use = keys_to_use if keys_to_use != [] else keys + axis = 0 if isinstance(obj, ABCSeries) else 1 + result = concat({k: results[k] for k in keys_to_use}, axis=axis) + elif any(is_ndframe): + # There is a mix of NDFrames and scalars + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) + else: + from pandas import Series + + # we have a dict of scalars + # GH 36212 use name only if obj is a series + if obj.ndim == 1: + obj = cast("Series", obj) + name = obj.name + else: + name = None + + result = Series(results, name=name) + + return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e3ca4cc53363..67a0e02fc2d4d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2,15 +2,17 @@ Generic data algorithms. This module is experimental at the moment and not intended for public consumption """ +from __future__ import annotations + import operator from textwrap import dedent -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast from warnings import catch_warnings, simplefilter, warn import numpy as np from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib -from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj +from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -46,11 +48,13 @@ pandas_dtype, ) from pandas.core.dtypes.generic import ( + ABCDatetimeArray, ABCExtensionArray, - ABCIndex, ABCIndexClass, ABCMultiIndex, + ABCRangeIndex, ABCSeries, + ABCTimedeltaArray, ) from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -58,7 +62,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Series + from pandas import Categorical, DataFrame, Index, Series _shared_docs: Dict[str, str] = {} @@ -67,7 +71,7 @@ # dtype access # # --------------- # def _ensure_data( - values, dtype: Optional[DtypeObj] = None + values: ArrayLike, dtype: Optional[DtypeObj] = None ) -> Tuple[np.ndarray, DtypeObj]: """ routine to ensure that our data is of the correct @@ -93,6 +97,12 @@ def _ensure_data( pandas_dtype : np.dtype or ExtensionDtype """ + if dtype is not None: + # We only have non-None dtype when called from `isin`, and + # both Datetimelike and Categorical dispatch before getting here. 
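# (isin() itself dispatches datetimelike and Categorical comps before
# calling _ensure_data, which is what these asserts document)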
+ assert not needs_i8_conversion(dtype) + assert not is_categorical_dtype(dtype) + if not isinstance(values, ABCMultiIndex): # extract_array would raise values = extract_array(values, extract_numpy=True) @@ -129,21 +139,20 @@ def _ensure_data( return ensure_object(values), np.dtype("object") # datetimelike - vals_dtype = getattr(values, "dtype", None) - if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype): - if is_period_dtype(vals_dtype) or is_period_dtype(dtype): + if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype): + if is_period_dtype(values.dtype) or is_period_dtype(dtype): from pandas import PeriodIndex - values = PeriodIndex(values) + values = PeriodIndex(values)._data dtype = values.dtype - elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex - values = TimedeltaIndex(values) + values = TimedeltaIndex(values)._data dtype = values.dtype else: # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype): + if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 # TODO(EA2D): special case not needed with 2D EAs @@ -153,14 +162,15 @@ def _ensure_data( from pandas import DatetimeIndex - values = DatetimeIndex(values) + values = DatetimeIndex(values)._data dtype = values.dtype return values.asi8, dtype - elif is_categorical_dtype(vals_dtype) and ( + elif is_categorical_dtype(values.dtype) and ( is_categorical_dtype(dtype) or dtype is None ): + values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") @@ -191,8 +201,16 @@ def _reconstruct_data( ------- ExtensionArray or np.ndarray """ + if isinstance(values, ABCExtensionArray) and values.dtype == dtype: + # Catch DatetimeArray/TimedeltaArray + return values + if is_extension_array_dtype(dtype): - values = dtype.construct_array_type()._from_sequence(values) + cls = dtype.construct_array_type() + if isinstance(values, cls) and values.dtype == dtype: + return values + + values = cls._from_sequence(values) elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) @@ -216,7 +234,8 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ["mixed", "string"]: + if inferred in ["mixed", "string", "mixed-integer"]: + # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160 if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -234,11 +253,11 @@ def _ensure_arraylike(values): } -def _get_hashtable_algo(values): +def _get_hashtable_algo(values: np.ndarray): """ Parameters ---------- - values : arraylike + values : np.ndarray Returns ------- @@ -252,15 +271,15 @@ def _get_hashtable_algo(values): return htable, values -def _get_values_for_rank(values): +def _get_values_for_rank(values: ArrayLike): if is_categorical_dtype(values): - values = values._values_for_rank() + values = cast("Categorical", values)._values_for_rank() values, _ = _ensure_data(values) return values -def _get_data_algo(values): +def get_data_algo(values: ArrayLike): values = _get_values_for_rank(values) ndtype = _check_object_for_strings(values) @@ -276,7 +295,6 @@ def _check_object_for_strings(values) -> str: Parameters ---------- values : ndarray - ndtype : str Returns ------- @@ -419,49 +437,64 @@ def 
isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): - values = construct_1d_object_array_from_listlike(list(values)) - # TODO: could use ensure_arraylike here + if not isinstance( + values, (ABCIndexClass, ABCSeries, ABCExtensionArray, np.ndarray) + ): + values = _ensure_arraylike(list(values)) + elif isinstance(values, ABCMultiIndex): + # Avoid raising in extract_array + values = np.array(values) + else: + values = extract_array(values, extract_numpy=True) + comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) - if is_categorical_dtype(comps): + if is_categorical_dtype(comps.dtype): # TODO(extension) # handle categoricals - return comps.isin(values) # type: ignore - - comps, dtype = _ensure_data(comps) - values, _ = _ensure_data(values, dtype=dtype) - - # faster for larger cases to use np.in1d - f = htable.ismember_object + return cast("Categorical", comps).isin(values) + + if needs_i8_conversion(comps.dtype): + # Dispatch to DatetimeLikeArrayMixin.isin + return array(comps).isin(values) + elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype): + # e.g. comps are integers and values are datetime64s + return np.zeros(comps.shape, dtype=bool) + # TODO: not quite right ... Sparse/Categorical + elif needs_i8_conversion(values.dtype): + return isin(comps, values.astype(object)) + + elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype( + values.dtype + ): + return isin(np.asarray(comps), np.asarray(values)) # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - if len(comps) > 1_000_000 and not is_object_dtype(comps): - f = np.in1d - elif is_integer_dtype(comps): - try: - values = values.astype("int64", copy=False) - comps = comps.astype("int64", copy=False) - f = htable.ismember_int64 - except (TypeError, ValueError, OverflowError): - values = values.astype(object) - comps = comps.astype(object) - - elif is_float_dtype(comps): - try: - values = values.astype("float64", copy=False) - comps = comps.astype("float64", copy=False) - f = htable.ismember_float64 - except (TypeError, ValueError): - values = values.astype(object) - comps = comps.astype(object) + # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), + # in1d is faster for small sizes + if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps): + # If the values include nan we need to check for nan explicitly + # since np.nan it not equal to np.nan + if isna(values).any(): + f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) + else: + f = np.in1d + + else: + common = np.find_common_type([values.dtype, comps.dtype], []) + values = values.astype(common, copy=False) + comps = comps.astype(common, copy=False) + name = common.name + if name == "bool": + name = "uint8" + f = getattr(htable, f"ismember_{name}") return f(comps, values) -def _factorize_array( - values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None, +def factorize_array( + values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. 
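The NaN handling on the fast path above is needed because `np.in1d` compares elements with `==` and `np.nan != np.nan`; a small sketch of the check:

import numpy as np

comps = np.array([1.0, 2.5, np.nan])
values = np.array([2.5, np.nan])

np.in1d(comps, values)  # array([False,  True, False]) -- NaN is missed
np.logical_or(np.in1d(comps, values), np.isnan(comps))  # array([False,  True,  True])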
@@ -489,7 +522,7 @@ def _factorize_array( codes : ndarray uniques : ndarray """ - hash_klass, values = _get_data_algo(values) + hash_klass, values = get_data_algo(values) table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( @@ -525,10 +558,9 @@ def _factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int = -1, + na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, - dropna: bool = True, -) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: +) -> Tuple[np.ndarray, Union[np.ndarray, "Index"]]: """ Encode the object as an enumerated type or categorical variable. @@ -540,8 +572,11 @@ def factorize( Parameters ---------- {values}{sort} - na_sentinel : int, default -1 - Value to mark "not found". + na_sentinel : int or None, default -1 + Value to mark "not found". If None, will not drop the NaN + from the uniques of the values. + + .. versionchanged:: 1.1.2 {size_hint}\ Returns @@ -619,6 +654,22 @@ def factorize( array([0, 0, 1]...) >>> uniques Index(['a', 'c'], dtype='object') + + If NaN is in the values, and we want to include NaN in the uniques of the + values, it can be achieved by setting ``na_sentinel=None``. + + >>> values = np.array([1, 2, 1, np.nan]) + >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 + >>> codes + array([ 0, 1, 0, -1]) + >>> uniques + array([1., 2.]) + + >>> codes, uniques = pd.factorize(values, na_sentinel=None) + >>> codes + array([0, 1, 0, 2]) + >>> uniques + array([ 1., 2., nan]) """ # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -629,11 +680,35 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. + if isinstance(values, ABCRangeIndex): + return values.factorize(sort=sort) + values = _ensure_arraylike(values) original = values + if not isinstance(values, ABCMultiIndex): + values = extract_array(values, extract_numpy=True) + + # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques + # of values, assign na_sentinel=-1 to replace code value for NaN. + dropna = True + if na_sentinel is None: + na_sentinel = -1 + dropna = False + + if ( + isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) + and values.freq is not None + ): + codes, uniques = values.factorize(sort=sort) + if isinstance(original, ABCIndexClass): + uniques = original._shallow_copy(uniques, name=None) + elif isinstance(original, ABCSeries): + from pandas import Index + + uniques = Index(uniques) + return codes, uniques if is_extension_array_dtype(values.dtype): - values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: @@ -644,7 +719,7 @@ def factorize( else: na_value = None - codes, uniques = _factorize_array( + codes, uniques = factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value ) @@ -665,6 +740,8 @@ def factorize( # return original tenor if isinstance(original, ABCIndexClass): + if original.dtype.kind in ["m", "M"] and isinstance(uniques, np.ndarray): + uniques = type(original._data)._simple_new(uniques, dtype=original.dtype) uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index @@ -681,7 +758,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -) -> "Series": +) -> Series: """ Compute a histogram of the counts of non-null values. 
@@ -740,7 +817,7 @@ def value_counts( counts = result._values else: - keys, counts = _value_counts_arraylike(values, dropna) + keys, counts = value_counts_arraylike(values, dropna) result = Series(counts, index=keys, name=name) @@ -753,8 +830,8 @@ def value_counts( return result -# Called once from SparseArray -def _value_counts_arraylike(values, dropna: bool): +# Called once from SparseArray, otherwise could be private +def value_counts_arraylike(values, dropna: bool): """ Parameters ---------- @@ -798,7 +875,7 @@ def _value_counts_arraylike(values, dropna: bool): return keys, counts -def duplicated(values, keep="first") -> np.ndarray: +def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray: """ Return boolean ndarray denoting duplicate values. @@ -823,7 +900,7 @@ def duplicated(values, keep="first") -> np.ndarray: return f(values, keep=keep) -def mode(values, dropna: bool = True) -> "Series": +def mode(values, dropna: bool = True) -> Series: """ Returns the mode(s) of an array. @@ -991,11 +1068,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() - or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() - ) + (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] + ).any() or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() if to_raise: raise OverflowError("Overflow in int64 addition") @@ -1100,6 +1176,9 @@ def __init__(self, obj, n: int, keep: str): if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') + def compute(self, method: str) -> FrameOrSeriesUnion: + raise NotImplementedError + def nlargest(self): return self.compute("nlargest") @@ -1132,7 +1211,7 @@ class SelectNSeries(SelectN): nordered : Series """ - def compute(self, method): + def compute(self, method: str) -> Series: n = self.n dtype = self.obj.dtype @@ -1146,10 +1225,8 @@ def compute(self, method): # slow method if n >= len(self.obj): - reverse_it = self.keep == "last" or method == "nlargest" ascending = method == "nsmallest" - slc = np.s_[::-1] if reverse_it else np.s_[:] - return dropped[slc].sort_values(ascending=ascending).head(n) + return dropped.sort_values(ascending=ascending).head(n) # fast method arr, pandas_dtype = _ensure_data(dropped.values) @@ -1206,7 +1283,7 @@ def __init__(self, obj, n: int, keep: str, columns): columns = list(columns) self.columns = columns - def compute(self, method): + def compute(self, method: str) -> DataFrame: from pandas import Int64Index @@ -1494,8 +1571,6 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) """ Take elements from an array. - .. versionadded:: 0.23.0 - Parameters ---------- arr : sequence @@ -1513,7 +1588,7 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. + negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. 
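The overflow check reformatted above keeps its original logic: instead of adding and inspecting the (already wrapped) result, it rearranges `arr + b > int64 max` into a subtraction that cannot itself overflow. A minimal sketch:

import numpy as np

i64_max = np.iinfo(np.int64).max
arr = np.array([i64_max - 1], dtype=np.int64)
b = np.array([2], dtype=np.int64)

# arr + b would wrap around; "i64_max - b < arr" detects this safely
print((i64_max - b < arr).any())  # True -> would raise OverflowError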
@@ -1619,7 +1694,8 @@ def take_nd( """ mask_info = None - if is_extension_array_dtype(arr): + if isinstance(arr, ABCExtensionArray): + # Check for EA to catch DatetimeArray, TimedeltaArray return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) arr = extract_array(arr) @@ -1750,7 +1826,7 @@ def func(arr, indexer, out, fill_value=np.nan): # ------------ # -def searchsorted(arr, value, side="left", sorter=None): +def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: """ Find indices where elements should be inserted to maintain order. @@ -1799,7 +1875,7 @@ def searchsorted(arr, value, side="left", sorter=None): if ( isinstance(arr, np.ndarray) - and is_integer_dtype(arr) + and is_integer_dtype(arr.dtype) and (is_integer(value) or is_integer_dtype(value)) ): # if `arr` and `value` have different dtypes, `arr` would be @@ -1877,6 +1953,8 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if is_extension_array_dtype(dtype): if hasattr(arr, f"__{op.__name__}__"): + if axis != 0: + raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") return op(arr, arr.shift(n)) else: warn( @@ -1891,18 +1969,26 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False is_bool = False if needs_i8_conversion(arr.dtype): - dtype = np.float64 + dtype = np.int64 arr = arr.view("i8") na = iNaT is_timedelta = True elif is_bool_dtype(dtype): + # We have to cast in order to be able to hold np.nan dtype = np.object_ is_bool = True elif is_integer_dtype(dtype): + # We have to cast in order to be able to hold np.nan dtype = np.float64 + orig_ndim = arr.ndim + if orig_ndim == 1: + # reshape so we can always use algos.diff_2d + arr = arr.reshape(-1, 1) + # TODO: require axis == 0 + dtype = np.dtype(dtype) out_arr = np.empty(arr.shape, dtype=dtype) @@ -1913,7 +1999,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if arr.ndim == 2 and arr.dtype.name in _diff_special: # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis) + algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. 
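Switching the intermediate dtype to int64 lets datetimelike diffs round-trip through views with no float conversion; roughly (a standalone sketch, not the pandas helper itself):

import numpy as np

dti = np.array(["2021-01-01", "2021-01-02", "2021-01-04"], dtype="datetime64[ns]")

i8 = dti.view("i8")                    # nanoseconds since the epoch
out = np.empty(len(i8), dtype=np.int64)
out[0] = np.iinfo(np.int64).min        # iNaT: the first slot has no lag value
out[1:] = i8[1:] - i8[:-1]
print(out.view("timedelta64[ns]"))     # leading NaT, then the gaps in nanoseconds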
@@ -1947,8 +2033,10 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3):

        out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    if is_timedelta:
-        out_arr = out_arr.astype("int64").view("timedelta64[ns]")
+        out_arr = out_arr.view("timedelta64[ns]")

+    if orig_ndim == 1:
+        out_arr = out_arr[:, 0]
    return out_arr

@@ -2012,32 +2100,30 @@ def safe_sort(
        "Only list-like objects are allowed to be passed to safe_sort as values"
    )

-    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
+    if not isinstance(values, (np.ndarray, ABCExtensionArray)):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

-    def sort_mixed(values):
-        # order ints before strings, safe in py3
-        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-        nums = np.sort(values[~str_pos])
-        strs = np.sort(values[str_pos])
-        return np.concatenate([nums, np.asarray(strs, dtype=object)])
-
    sorter = None
+
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
-        # unorderable in py3 if mixed str/int
-        ordered = sort_mixed(values)
+        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
-            # try this anyway
-            ordered = sort_mixed(values)
+            # Previous sorters failed or were not applicable; try `_sort_mixed`,
+            # which works except for the special case of 1d arrays
+            # with tuples.
+            if values.size and isinstance(values[0], tuple):
+                ordered = _sort_tuples(values)
+            else:
+                ordered = _sort_mixed(values)

    # codes:

@@ -2056,7 +2142,7 @@ def sort_mixed(values):
    if sorter is None:
        # mixed types
-        hash_klass, values = _get_data_algo(values)
+        hash_klass, values = get_data_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

@@ -2084,3 +2170,47 @@ def sort_mixed(values):
        np.putmask(new_codes, mask, na_sentinel)

    return ordered, ensure_platform_int(new_codes)
+
+
+def _sort_mixed(values):
+    """ order ints before strings in 1d arrays, safe in py3 """
+    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
+    nums = np.sort(values[~str_pos])
+    strs = np.sort(values[str_pos])
+    return np.concatenate([nums, np.asarray(strs, dtype=object)])
+
+
+def _sort_tuples(values: np.ndarray[tuple]):
+    """
+    Convert an array of tuples (1d) to an array of arrays (2d).
+    We need to keep the columns separately, as they contain different types and
+    nans (we can't use `np.sort`, as it may fail when str and nan are mixed in a
+    column, because the types cannot be compared).
+    """
+    from pandas.core.internals.construction import to_arrays
+    from pandas.core.sorting import lexsort_indexer
+
+    arrays, _ = to_arrays(values, None)
+    indexer = lexsort_indexer(arrays, orders=True)
+    return values[indexer]
+
+
+def make_duplicates_of_left_unique_in_right(
+    left: np.ndarray, right: np.ndarray
+) -> np.ndarray:
+    """
+    If `left` has duplicates that are also duplicated in `right`, those
+    duplicated values are dropped from `right`, so that every duplicate
+    value from `left` exists only once in `right`.
+ + Parameters + ---------- + left: ndarray + right: ndarray + + Returns + ------- + Duplicates of left are unique in right + """ + left_duplicates = unique(left[duplicated(left)]) + return right[~(duplicated(right) & isin(right, left_duplicates))] diff --git a/pandas/core/api.py b/pandas/core/api.py index b0b65f9d0be34..67e86c2076329 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -14,6 +14,7 @@ from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -26,6 +27,7 @@ ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array +from pandas.core.flags import Flags from pandas.core.groupby import Grouper, NamedAgg from pandas.core.indexes.api import ( CategoricalIndex, diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d4be660939773..801d4665f9a1b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,21 +1,26 @@ import abc import inspect -from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type import numpy as np from pandas._config import option_context -from pandas._typing import Axis +from pandas._typing import Axis, FrameOrSeriesUnion from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence +from pandas.core.dtypes.common import ( + is_dict_like, + is_extension_array_dtype, + is_list_like, + is_sequence, +) from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype if TYPE_CHECKING: - from pandas import DataFrame, Series, Index + from pandas import DataFrame, Index, Series ResType = Dict[int, Any] @@ -26,7 +31,6 @@ def frame_apply( axis: Axis = 0, raw: bool = False, result_type: Optional[str] = None, - ignore_failures: bool = False, args=None, kwds=None, ): @@ -43,7 +47,6 @@ def frame_apply( func, raw=raw, result_type=result_type, - ignore_failures=ignore_failures, args=args, kwds=kwds, ) @@ -73,7 +76,7 @@ def series_generator(self) -> Iterator["Series"]: @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: pass # --------------------------------------------------------------- @@ -84,13 +87,11 @@ def __init__( func, raw: bool, result_type: Optional[str], - ignore_failures: bool, args, kwds, ): self.obj = obj self.raw = raw - self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} @@ -141,7 +142,11 @@ def get_result(self): """ compute the results """ # dispatch to agg if is_list_like(self.f) or is_dict_like(self.f): - return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds) + # pandas\core\apply.py:144: error: "aggregate" of "DataFrame" gets + # multiple values for keyword argument "axis" + return self.obj.aggregate( # type: ignore[misc] + self.f, axis=self.axis, *self.args, **self.kwds + ) # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -216,7 +221,23 @@ def apply_empty_result(self): def apply_raw(self): """ apply to the values as a numpy array """ - result = np.apply_along_axis(self.f, self.axis, self.values) + + def wrap_function(func): + """ + Wrap user supplied function to work 
around numpy issue. + + see https://github.com/numpy/numpy/issues/8352 + """ + + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + if isinstance(result, str): + result = np.array(result, dtype=object) + return result + + return wrapper + + result = np.apply_along_axis(wrap_function(self.f), self.axis, self.values) # TODO: mixed type case if result.ndim == 2: @@ -252,59 +273,29 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": return result def apply_standard(self): - - # partial result that may be returned from reduction - partial_result = None - - # compute the result using the series generator, - # use the result computed while trying to reduce if available. - results, res_index = self.apply_series_generator(partial_result) + results, res_index = self.apply_series_generator() # wrap results return self.wrap_results(results, res_index) - def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]: + def apply_series_generator(self) -> Tuple[ResType, "Index"]: series_gen = self.series_generator res_index = self.result_index results = {} - # If a partial result was already computed, - # use it instead of running on the first element again - series_gen_enumeration = enumerate(series_gen) - if partial_result is not None: - i, v = next(series_gen_enumeration) - results[i] = partial_result - - if self.ignore_failures: - successes = [] - for i, v in series_gen_enumeration: - try: - results[i] = self.f(v) - except Exception: - pass - else: - successes.append(i) - - # so will work with MultiIndex - if len(successes) < len(res_index): - res_index = res_index.take(successes) - - else: - with option_context("mode.chained_assignment", None): - for i, v in series_gen_enumeration: - # ignore SettingWithCopy here in case the user mutates - results[i] = self.f(v) - if isinstance(results[i], ABCSeries): - # If we have a view on v, we need to make a copy because - # series_generator will swap out the underlying data - results[i] = results[i].copy(deep=False) + with option_context("mode.chained_assignment", None): + for i, v in enumerate(series_gen): + # ignore SettingWithCopy here in case the user mutates + results[i] = self.f(v) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index - def wrap_results( - self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + def wrap_results(self, results: ResType, res_index: "Index") -> FrameOrSeriesUnion: from pandas import Series # see if we can infer the results @@ -348,23 +339,28 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: """ return the results for the rows """ if self.result_type == "reduce": # e.g. test_apply_dict GH#8735 - return self.obj._constructor_sliced(results) + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + elif self.result_type is None and all( isinstance(x, dict) for x in results.values() ): # Our operation was a to_dict op e.g. 
- # test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196 - return self.obj._constructor_sliced(results) + # test_apply_dict GH#8735, test_apply_reduce_to_dict GH#25196 #37544 + res = self.obj._constructor_sliced(results) + res.index = res_index + return res try: result = self.obj._constructor(data=results) except ValueError as err: - if "arrays must all be same length" in str(err): + if "All arrays must be of the same length" in str(err): # e.g. result = [[2, 3], [1.5], ['foo', 'bar']] # see test_agg_listlike_result GH#29587 res = self.obj._constructor_sliced(results) @@ -401,10 +397,20 @@ def series_generator(self): mgr = ser._mgr blk = mgr.blocks[0] - for (arr, name) in zip(values, self.index): - blk.values = arr - ser.name = name - yield ser + if is_extension_array_dtype(blk.dtype): + # values will be incorrect for this block + # TODO(EA2D): special case would be unnecessary with 2D EAs + obj = self.obj + for i in range(len(obj)): + yield obj._ixs(i, axis=0) + + else: + for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr + blk.values = arr + ser.name = name + yield ser @property def result_index(self) -> "Index": @@ -416,9 +422,9 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: """ return the results for the columns """ - result: Union["Series", "DataFrame"] + result: FrameOrSeriesUnion # we have requested to expand if self.result_type == "expand": diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 1b9ed014f27b7..bce6f1aafb2c5 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -8,7 +8,7 @@ import numpy as np from pandas._libs import missing as libmissing -from pandas.compat.numpy import _np_version_under1p17 +from pandas.compat.numpy import np_version_under1p17 from pandas.core.nanops import check_below_min_count @@ -17,6 +17,7 @@ def _sumprod( func: Callable, values: np.ndarray, mask: np.ndarray, + *, skipna: bool = True, min_count: int = 0, ): @@ -46,25 +47,31 @@ def _sumprod( if check_below_min_count(values.shape, mask, min_count): return libmissing.NA - if _np_version_under1p17: + if np_version_under1p17: return func(values[~mask]) else: return func(values, where=~mask) -def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): +def sum( + values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 +): return _sumprod( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count ) -def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): +def prod( + values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 +): return _sumprod( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count ) -def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def _minmax( + func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True +): """ Reduction for 1D masked array. 
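The `where=` keyword used in `_sumprod` above (NumPy >= 1.17) skips masked entries without materializing a filtered copy; older NumPy falls back to fancy indexing. The two are equivalent:

import numpy as np

values = np.array([1, 2, 3], dtype=np.int64)
mask = np.array([False, True, False])  # True marks a missing entry

print(np.sum(values, where=~mask))  # 4, no intermediate copy
print(np.sum(values[~mask]))        # 4, via a filtered copy (pre-1.17 path)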
@@ -94,9 +101,9 @@ def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = return libmissing.NA -def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): return _minmax(np.min, values=values, mask=mask, skipna=skipna) -def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): +def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py new file mode 100644 index 0000000000000..76d723beac7e6 --- /dev/null +++ b/pandas/core/array_algos/replace.py @@ -0,0 +1,133 @@ +""" +Methods used by Block.replace and related methods. +""" +import operator +import re +from typing import Optional, Pattern, Union + +import numpy as np + +from pandas._typing import ArrayLike, Scalar + +from pandas.core.dtypes.common import ( + is_datetimelike_v_numeric, + is_numeric_v_string_like, + is_re, + is_scalar, +) +from pandas.core.dtypes.missing import isna + + +def compare_or_regex_search( + a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike +) -> Union[ArrayLike, bool]: + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. + + Parameters + ---------- + a : array_like + b : scalar or regex pattern + regex : bool + mask : array_like + + Returns + ------- + mask : array_like of bool + """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] + ): + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_scalar(result) and isinstance(a, np.ndarray): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, (str, Pattern)) + else False + ) + + # GH#32621 use mask to avoid comparing to NAs + if isinstance(a, np.ndarray): + a = a[mask] + + if is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + return np.zeros(a.shape, dtype=bool) + + elif is_datetimelike_v_numeric(a, b): + # GH#29553 avoid deprecation warnings from numpy + _check_comparison_types(False, a, b) + return False + + result = op(a) + + if isinstance(result, np.ndarray) and mask is not None: + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + tmp[mask] = result + result = tmp + + _check_comparison_types(result, a, b) + return result + + +def replace_regex(values: ArrayLike, rx: re.Pattern, value, mask: Optional[np.ndarray]): + """ + Parameters + ---------- + values : ArrayLike + Object dtype. + rx : re.Pattern + value : Any + mask : np.ndarray[bool], optional + + Notes + ----- + Alters values in-place. 
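A stripped-down standalone version of the vectorized regex replacement described above (object-dtype input assumed; a sketch, not the pandas helper itself):

import re
import numpy as np

values = np.array(["foo-1", "bar-2", None], dtype=object)
rx = re.compile(r"-\d+")

def re_replacer(s):
    # only strings participate in the substitution; other objects pass through
    return rx.sub("", s) if isinstance(s, str) else s

f = np.vectorize(re_replacer, otypes=[values.dtype])
values[:] = f(values)  # alters values in-place
print(values)          # ['foo' 'bar' None]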
+ """ + + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, str): + + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return value if rx.search(s) is not None else s + else: + return s + + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return rx.sub(value, s) + else: + return s + + f = np.vectorize(re_replacer, otypes=[values.dtype]) + + if mask is None: + values[:] = f(values) + else: + values[mask] = f(values[mask]) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py new file mode 100644 index 0000000000000..6b28f8f135769 --- /dev/null +++ b/pandas/core/arraylike.py @@ -0,0 +1,284 @@ +""" +Methods that can be shared by many array-like classes or subclasses: + Series + Index + ExtensionArray +""" +import operator +from typing import Any, Callable +import warnings + +import numpy as np + +from pandas._libs import lib + +from pandas.core.construction import extract_array +from pandas.core.ops import maybe_dispatch_ufunc_to_dunder_op, roperator +from pandas.core.ops.common import unpack_zerodim_and_defer + + +class OpsMixin: + # ------------------------------------------------------------- + # Comparisons + + def _cmp_method(self, other, op): + return NotImplemented + + @unpack_zerodim_and_defer("__eq__") + def __eq__(self, other): + return self._cmp_method(other, operator.eq) + + @unpack_zerodim_and_defer("__ne__") + def __ne__(self, other): + return self._cmp_method(other, operator.ne) + + @unpack_zerodim_and_defer("__lt__") + def __lt__(self, other): + return self._cmp_method(other, operator.lt) + + @unpack_zerodim_and_defer("__le__") + def __le__(self, other): + return self._cmp_method(other, operator.le) + + @unpack_zerodim_and_defer("__gt__") + def __gt__(self, other): + return self._cmp_method(other, operator.gt) + + @unpack_zerodim_and_defer("__ge__") + def __ge__(self, other): + return self._cmp_method(other, operator.ge) + + # ------------------------------------------------------------- + # Logical Methods + + def _logical_method(self, other, op): + return NotImplemented + + @unpack_zerodim_and_defer("__and__") + def __and__(self, other): + return self._logical_method(other, operator.and_) + + @unpack_zerodim_and_defer("__rand__") + def __rand__(self, other): + return self._logical_method(other, roperator.rand_) + + @unpack_zerodim_and_defer("__or__") + def __or__(self, other): + return self._logical_method(other, operator.or_) + + @unpack_zerodim_and_defer("__ror__") + def __ror__(self, other): + return self._logical_method(other, roperator.ror_) + + @unpack_zerodim_and_defer("__xor__") + def __xor__(self, other): + return self._logical_method(other, operator.xor) + + @unpack_zerodim_and_defer("__rxor__") + def __rxor__(self, other): + return self._logical_method(other, roperator.rxor) + + # ------------------------------------------------------------- + # Arithmetic Methods + + def _arith_method(self, other, op): + return NotImplemented + + @unpack_zerodim_and_defer("__add__") + def __add__(self, other): + return self._arith_method(other, operator.add) + + @unpack_zerodim_and_defer("__radd__") + def __radd__(self, other): + return self._arith_method(other, roperator.radd) + + @unpack_zerodim_and_defer("__sub__") + def __sub__(self, other): + return self._arith_method(other, operator.sub) + + 
@unpack_zerodim_and_defer("__rsub__") + def __rsub__(self, other): + return self._arith_method(other, roperator.rsub) + + @unpack_zerodim_and_defer("__mul__") + def __mul__(self, other): + return self._arith_method(other, operator.mul) + + @unpack_zerodim_and_defer("__rmul__") + def __rmul__(self, other): + return self._arith_method(other, roperator.rmul) + + @unpack_zerodim_and_defer("__truediv__") + def __truediv__(self, other): + return self._arith_method(other, operator.truediv) + + @unpack_zerodim_and_defer("__rtruediv__") + def __rtruediv__(self, other): + return self._arith_method(other, roperator.rtruediv) + + @unpack_zerodim_and_defer("__floordiv__") + def __floordiv__(self, other): + return self._arith_method(other, operator.floordiv) + + @unpack_zerodim_and_defer("__rfloordiv") + def __rfloordiv__(self, other): + return self._arith_method(other, roperator.rfloordiv) + + @unpack_zerodim_and_defer("__mod__") + def __mod__(self, other): + return self._arith_method(other, operator.mod) + + @unpack_zerodim_and_defer("__rmod__") + def __rmod__(self, other): + return self._arith_method(other, roperator.rmod) + + @unpack_zerodim_and_defer("__divmod__") + def __divmod__(self, other): + return self._arith_method(other, divmod) + + @unpack_zerodim_and_defer("__rdivmod__") + def __rdivmod__(self, other): + return self._arith_method(other, roperator.rdivmod) + + @unpack_zerodim_and_defer("__pow__") + def __pow__(self, other): + return self._arith_method(other, operator.pow) + + @unpack_zerodim_and_defer("__rpow__") + def __rpow__(self, other): + return self._arith_method(other, roperator.rpow) + + +def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): + """ + Compatibility with numpy ufuncs. + + See also + -------- + numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ + """ + from pandas.core.generic import NDFrame + from pandas.core.internals import BlockManager + + cls = type(self) + + # for binary ops, use our custom dunder methods + result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + types = tuple(type(x) for x in inputs) + alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + + if len(set(types)) > 1: + # We currently don't handle ufunc(DataFrame, Series) + # well. Previously this raised an internal ValueError. We might + # support it someday, so raise a NotImplementedError. + raise NotImplementedError( + "Cannot apply ufunc {} to mixed DataFrame and Series " + "inputs.".format(ufunc) + ) + axes = self.axes + for obj in alignable[1:]: + # this relies on the fact that we aren't handling mixed + # series / frame ufuncs. 
+            for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
+                axes[i] = ax1.union(ax2)
+
+        reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
+        inputs = tuple(
+            x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
+            for x, t in zip(inputs, types)
+        )
+    else:
+        reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
+
+    if self.ndim == 1:
+        names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
+        name = names[0] if len(set(names)) == 1 else None
+        reconstruct_kwargs = {"name": name}
+    else:
+        reconstruct_kwargs = {}
+
+    def reconstruct(result):
+        if lib.is_scalar(result):
+            return result
+        if result.ndim != self.ndim:
+            if method == "outer":
+                if self.ndim == 2:
+                    # we already deprecated for Series
+                    msg = (
+                        "outer method for ufunc {} is not implemented on "
+                        "pandas objects. Returning an ndarray, but in the "
+                        "future this will raise a 'NotImplementedError'. "
+                        "Consider explicitly converting the DataFrame "
+                        "to an array with '.to_numpy()' first."
+                    )
+                    warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4)
+                    return result
+                raise NotImplementedError
+            return result
+        if isinstance(result, BlockManager):
+            # we went through BlockManager.apply
+            result = self._constructor(result, **reconstruct_kwargs, copy=False)
+        else:
+            # we converted an array, lost our axes
+            result = self._constructor(
+                result, **reconstruct_axes, **reconstruct_kwargs, copy=False
+            )
+        # TODO: When we support multiple values in __finalize__, this
+        # should pass alignable to `__finalize__` instead of self.
+        # Then `np.add(a, b)` would consider attrs from both a and b
+        # when a and b are NDFrames.
+        if len(alignable) == 1:
+            result = result.__finalize__(self)
+        return result
+
+    if self.ndim > 1 and (
+        len(inputs) > 1 or ufunc.nout > 1  # type: ignore[attr-defined]
+    ):
+        # Just give up on preserving types in the complex case.
+        # In theory we could preserve them in these cases:
+        # * nout>1 is doable if BlockManager.apply took nout and
+        #   returned a Tuple[BlockManager].
+        # * len(inputs) > 1 is doable when we know that we have
+        #   aligned blocks / dtypes.
+        inputs = tuple(np.asarray(x) for x in inputs)
+        result = getattr(ufunc, method)(*inputs)
+    elif self.ndim == 1:
+        # ufunc(series, ...)
+ inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + else: + # ufunc(dataframe) + mgr = inputs[0]._mgr + result = mgr.apply(getattr(ufunc, method)) + + if ufunc.nout > 1: # type: ignore[attr-defined] + result = tuple(reconstruct(x) for x in result) + else: + result = reconstruct(result) + return result diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 1d538824e6d82..e5258a6aecd30 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -6,8 +6,10 @@ from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.floating import FloatingArray from pandas.core.arrays.integer import IntegerArray, integer_array from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.numpy_ import PandasArray, PandasDtype from pandas.core.arrays.period import PeriodArray, period_array from pandas.core.arrays.sparse import SparseArray @@ -18,9 +20,11 @@ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", + "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", + "FloatingArray", "IntegerArray", "integer_array", "IntervalArray", diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 4a33e0e841f7f..c89f5554d0715 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -4,7 +4,7 @@ import numpy as np import pyarrow -from pandas.core.arrays.interval import _VALID_CLOSED +from pandas.core.arrays.interval import VALID_CLOSED _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") @@ -83,7 +83,7 @@ class ArrowIntervalType(pyarrow.ExtensionType): def __init__(self, subtype, closed): # attributes need to be set first before calling # super init (as that calls serialize) - assert closed in _VALID_CLOSED + assert closed in VALID_CLOSED self._closed = closed if not isinstance(subtype, pyarrow.DataType): subtype = pyarrow.type_for_alias(str(subtype)) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 832d09b062265..02214ff51b02a 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -1,15 +1,30 @@ -from typing import Any, Sequence, Tuple, TypeVar +from __future__ import annotations + +from typing import Any, Optional, Sequence, Type, TypeVar, Union import numpy as np +from pandas._libs import lib +from pandas._typing import Shape from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import cache_readonly, doc +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.missing import array_equivalent +from pandas.core import missing from pandas.core.algorithms import take, unique +from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray +from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer -_T = TypeVar("_T", bound="NDArrayBackedExtensionArray") +NDArrayBackedExtensionArrayT = TypeVar( + "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray" +) class 
NDArrayBackedExtensionArray(ExtensionArray): @@ -19,7 +34,9 @@ class NDArrayBackedExtensionArray(ExtensionArray): _ndarray: np.ndarray - def _from_backing_data(self: _T, arr: np.ndarray) -> _T: + def _from_backing_data( + self: NDArrayBackedExtensionArrayT, arr: np.ndarray + ) -> NDArrayBackedExtensionArrayT: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. @@ -28,26 +45,42 @@ def _from_backing_data(self: _T, arr: np.ndarray) -> _T: """ raise AbstractMethodError(self) + def _box_func(self, x): + """ + Wrap numpy type in our dtype.type if necessary. + """ + return x + + def _validate_scalar(self, value): + # used by NDArrayBackedExtensionIndex.insert + raise AbstractMethodError(self) + # ------------------------------------------------------------------------ def take( - self: _T, + self: NDArrayBackedExtensionArrayT, indices: Sequence[int], + *, allow_fill: bool = False, fill_value: Any = None, - ) -> _T: + axis: int = 0, + ) -> NDArrayBackedExtensionArrayT: if allow_fill: fill_value = self._validate_fill_value(fill_value) new_data = take( - self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value, + self._ndarray, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + axis=axis, ) return self._from_backing_data(new_data) def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to a representation - suitable for self._ndarray, raising ValueError if this is not possible. + suitable for self._ndarray, raising TypeError if this is not possible. Parameters ---------- @@ -59,7 +92,7 @@ def _validate_fill_value(self, fill_value): Raises ------ - ValueError + TypeError """ raise AbstractMethodError(self) @@ -68,7 +101,7 @@ def _validate_fill_value(self, fill_value): # TODO: make this a cache_readonly; for that to work we need to remove # the _index_data kludge in libreduction @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: return self._ndarray.shape def __len__(self) -> int: @@ -86,26 +119,42 @@ def size(self) -> int: def nbytes(self) -> int: return self._ndarray.nbytes - def reshape(self: _T, *args, **kwargs) -> _T: + def reshape( + self: NDArrayBackedExtensionArrayT, *args, **kwargs + ) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.reshape(*args, **kwargs) return self._from_backing_data(new_data) - def ravel(self: _T, *args, **kwargs) -> _T: + def ravel( + self: NDArrayBackedExtensionArrayT, *args, **kwargs + ) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.ravel(*args, **kwargs) return self._from_backing_data(new_data) @property - def T(self: _T) -> _T: + def T(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.T return self._from_backing_data(new_data) # ------------------------------------------------------------------------ - def copy(self: _T) -> _T: + def equals(self, other) -> bool: + if type(self) is not type(other): + return False + if not is_dtype_equal(self.dtype, other.dtype): + return False + return bool(array_equivalent(self._ndarray, other._ndarray)) + + def _values_for_argsort(self): + return self._ndarray + + def copy(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.copy() return self._from_backing_data(new_data) - def repeat(self: _T, repeats, axis=None) -> _T: + def repeat( + self: NDArrayBackedExtensionArrayT, repeats, axis=None + ) -> NDArrayBackedExtensionArrayT: """ Repeat elements of an array. 
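The contract of this mixin is that subclasses wrap a plain ndarray and rebuild themselves through `_from_backing_data`, so ndarray operations preserve the subclass type. A minimal sketch with a hypothetical class:

import numpy as np

class BackedArray:
    # hypothetical sketch of the backing-ndarray pattern
    def __init__(self, ndarray):
        self._ndarray = ndarray

    def _from_backing_data(self, arr):
        # same subclass, new backing data
        return type(self)(arr)

    @property
    def T(self):
        return self._from_backing_data(self._ndarray.T)

    def ravel(self, *args, **kwargs):
        return self._from_backing_data(self._ndarray.ravel(*args, **kwargs))

arr = BackedArray(np.arange(6).reshape(2, 3))
print(type(arr.T).__name__, arr.T._ndarray.shape)  # BackedArray (3, 2)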
@@ -113,10 +162,181 @@ def repeat(self: _T, repeats, axis=None) -> _T: -------- numpy.ndarray.repeat """ - nv.validate_repeat(tuple(), dict(axis=axis)) + nv.validate_repeat((), {"axis": axis}) new_data = self._ndarray.repeat(repeats, axis=axis) return self._from_backing_data(new_data) - def unique(self: _T) -> _T: + def unique(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = unique(self._ndarray) return self._from_backing_data(new_data) + + @classmethod + @doc(ExtensionArray._concat_same_type) + def _concat_same_type( + cls: Type[NDArrayBackedExtensionArrayT], + to_concat: Sequence[NDArrayBackedExtensionArrayT], + axis: int = 0, + ) -> NDArrayBackedExtensionArrayT: + dtypes = {str(x.dtype) for x in to_concat} + if len(dtypes) != 1: + raise ValueError("to_concat must have the same dtype (tz)", dtypes) + + new_values = [x._ndarray for x in to_concat] + new_values = np.concatenate(new_values, axis=axis) + return to_concat[0]._from_backing_data(new_values) + + @doc(ExtensionArray.searchsorted) + def searchsorted(self, value, side="left", sorter=None): + value = self._validate_searchsorted_value(value) + return self._ndarray.searchsorted(value, side=side, sorter=sorter) + + def _validate_searchsorted_value(self, value): + return value + + @doc(ExtensionArray.shift) + def shift(self, periods=1, fill_value=None, axis=0): + + fill_value = self._validate_shift_value(fill_value) + new_values = shift(self._ndarray, periods, axis, fill_value) + + return self._from_backing_data(new_values) + + def _validate_shift_value(self, fill_value): + # TODO: after deprecation in datetimelikearraymixin is enforced, + # we can remove this and ust validate_fill_value directly + return self._validate_fill_value(fill_value) + + def __setitem__(self, key, value): + key = check_array_indexer(self, key) + value = self._validate_setitem_value(value) + self._ndarray[key] = value + + def _validate_setitem_value(self, value): + return value + + def __getitem__( + self: NDArrayBackedExtensionArrayT, key: Union[int, slice, np.ndarray] + ) -> Union[NDArrayBackedExtensionArrayT, Any]: + if lib.is_integer(key): + # fast-path + result = self._ndarray[key] + if self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + key = extract_array(key, extract_numpy=True) + key = check_array_indexer(self, key) + result = self._ndarray[key] + if lib.is_scalar(result): + return self._box_func(result) + + result = self._from_backing_data(result) + return result + + @doc(ExtensionArray.fillna) + def fillna( + self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None + ) -> NDArrayBackedExtensionArrayT: + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + # TODO: share this with EA base class implementation + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. 
Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = missing.get_fill_func(method) + new_values = func(self._ndarray.copy(), limit=limit, mask=mask) + # TODO: PandasArray didnt used to copy, need tests for this + new_values = self._from_backing_data(new_values) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + # ------------------------------------------------------------------------ + # Reductions + + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + meth = getattr(self, name, None) + if meth: + return meth(skipna=skipna, **kwargs) + else: + msg = f"'{type(self).__name__}' does not implement reduction '{name}'" + raise TypeError(msg) + + def _wrap_reduction_result(self, axis: Optional[int], result): + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + # ------------------------------------------------------------------------ + + def __repr__(self) -> str: + if self.ndim == 1: + return super().__repr__() + + from pandas.io.formats.printing import format_object_summary + + # the short repr has no trailing newline, while the truncated + # repr does. So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + lines = [ + format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( + ", \n" + ) + for x in self + ] + data = ",\n".join(lines) + class_name = f"<{type(self).__name__}>" + return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" + + # ------------------------------------------------------------------------ + # __array_function__ methods + + def putmask(self, mask, value): + """ + Analogue to np.putmask(self, mask, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be cast to self.dtype. + """ + value = self._validate_setitem_value(value) + + np.putmask(self._ndarray, mask, value) + + def where(self, mask, value): + """ + Analogue to np.where(mask, self, value) + + Parameters + ---------- + mask : np.ndarray[bool] + value : scalar or listlike + + Raises + ------ + TypeError + If value cannot be cast to self.dtype. + """ + value = self._validate_setitem_value(value) + + res_values = np.where(mask, self._ndarray, value) + return self._from_backing_data(res_values) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 32a2a30fcfd43..95470422f2ccd 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -6,13 +6,26 @@ This is an experimental API and subject to breaking changes without warning. 
""" +from __future__ import annotations + import operator -from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, +) import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Shape from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -20,17 +33,25 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype +from pandas.core.dtypes.common import ( + is_array_like, + is_dtype_equal, + is_list_like, + is_scalar, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.algorithms import _factorize_array, unique -from pandas.core.missing import backfill_1d, pad_1d +from pandas.core.algorithms import factorize_array, unique +from pandas.core.missing import get_fill_func from pandas.core.sorting import nargminmax, nargsort -_extension_array_shared_docs: Dict[str, str] = dict() +_extension_array_shared_docs: Dict[str, str] = {} + +ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray") class ExtensionArray: @@ -41,8 +62,6 @@ class ExtensionArray: with a custom type and will not attempt to coerce them to objects. They may be stored directly inside a :class:`DataFrame` or :class:`Series`. - .. versionadded:: 0.23.0 - Attributes ---------- dtype @@ -170,7 +189,7 @@ class ExtensionArray: # ------------------------------------------------------------------------ @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): """ Construct a new ExtensionArray from a sequence of scalars. @@ -192,7 +211,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): raise AbstractMethodError(cls) @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + def _from_sequence_of_strings(cls, strings, *, dtype=None, copy=False): """ Construct a new ExtensionArray from a sequence of strings. @@ -229,8 +248,8 @@ def _from_factorized(cls, values, original): See Also -------- - factorize - ExtensionArray.factorize + factorize : Top-level factorize method that dispatches here. + ExtensionArray.factorize : Encode the extension array as an enumerated type. """ raise AbstractMethodError(cls) @@ -238,8 +257,9 @@ def _from_factorized(cls, values, original): # Must be a Sequence # ------------------------------------------------------------------------ - def __getitem__(self, item): - # type (Any) -> Any + def __getitem__( + self, item: Union[int, slice, np.ndarray] + ) -> Union[ExtensionArray, Any]: """ Select a subset of self. @@ -335,6 +355,23 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def __contains__(self, item) -> bool: + """ + Return for `item in self`. + """ + # GH37867 + # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] + # would raise a TypeError. The implementation below works around that. 
+ if is_scalar(item) and isna(item): + if not self._can_hold_na: + return False + elif item is self.dtype.na_value or isinstance(item, self.dtype.type): + return self.isna().any() + else: + return False + else: + return (item == self).any() + def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). @@ -400,7 +437,7 @@ def dtype(self) -> ExtensionDtype: raise AbstractMethodError(self) @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: """ Return a tuple of the array dimensions. """ @@ -452,9 +489,19 @@ def astype(self, dtype, copy=True): NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype + from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if isinstance(dtype, StringDtype): # allow conversion to StringArrays + if is_dtype_equal(dtype, self.dtype): + if not copy: + return self + else: + return self.copy() + + # FIXME: Really hard-code here? + if isinstance( + dtype, (ArrowStringDtype, StringDtype) + ): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -493,13 +540,18 @@ def _values_for_argsort(self) -> np.ndarray: See Also -------- - ExtensionArray.argsort + ExtensionArray.argsort : Return the indices that would sort this array. """ # Note: this is used in `ExtensionArray.argsort`. return np.array(self) def argsort( - self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs + self, + ascending: bool = True, + kind: str = "quicksort", + na_position: str = "last", + *args, + **kwargs, ) -> np.ndarray: """ Return the indices that would sort this array. @@ -530,8 +582,14 @@ def argsort( # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - result = nargsort(self, kind=kind, ascending=ascending, na_position="last") - return result + values = self._values_for_argsort() + return nargsort( + values, + kind=kind, + ascending=ascending, + na_position=na_position, + mask=np.asarray(self.isna()), + ) def argmin(self): """ @@ -608,7 +666,7 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: - func = pad_1d if method == "pad" else backfill_1d + func = get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: @@ -629,7 +687,7 @@ def dropna(self): """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": + def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: """ Shift values by desired number. @@ -742,7 +800,7 @@ def searchsorted(self, value, side="left", sorter=None): arr = self.astype(object) return arr.searchsorted(value, side=side, sorter=sorter) - def equals(self, other: "ExtensionArray") -> bool: + def equals(self, other: object) -> bool: """ Return if another array is equivalent to this array. @@ -760,11 +818,12 @@ def equals(self, other: "ExtensionArray") -> bool: boolean Whether the arrays are equivalent. 
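A minimal sketch of the membership semantics the new `__contains__` encodes, assuming pandas 1.2+ and a nullable Int64 array; NA membership is resolved through `isna()` and `_can_hold_na` rather than elementwise equality, because any comparison against `pd.NA` yields `pd.NA`:

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, pd.NA], dtype="Int64")

    print(pd.NA in arr)   # True  -- checked via arr.isna().any(), not `==`
    print(np.nan in arr)  # False -- np.nan is not this dtype's na_value
    print(2 in arr)       # True  -- ordinary path: (2 == arr).any()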
""" - if not type(self) == type(other): + if type(self) != type(other): return False - elif not self.dtype == other.dtype: + other = cast(ExtensionArray, other) + if not is_dtype_equal(self.dtype, other.dtype): return False - elif not len(self) == len(other): + elif len(self) != len(other): return False else: equal_values = self == other @@ -798,7 +857,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: """ Encode the extension array as an enumerated type. @@ -839,7 +898,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray" # Complete control over factorization. arr, na_value = self._values_for_factorize() - codes, uniques = _factorize_array( + codes, uniques = factorize_array( arr, na_sentinel=na_sentinel, na_value=na_value ) @@ -893,7 +952,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray" @Substitution(klass="ExtensionArray") @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), dict(axis=axis)) + nv.validate_repeat((), {"axis": axis}) ind = np.arange(len(self)).repeat(repeats) return self.take(ind) @@ -902,8 +961,12 @@ def repeat(self, repeats, axis=None): # ------------------------------------------------------------------------ def take( - self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> "ExtensionArray": + self, + indices: Sequence[int], + *, + allow_fill: bool = False, + fill_value: Any = None, + ) -> ExtensionArray: """ Take elements from an array. @@ -947,8 +1010,8 @@ def take( See Also -------- - numpy.take - api.extensions.take + numpy.take : Take elements from an array along an axis. + api.extensions.take : Take elements from an array. Notes ----- @@ -992,7 +1055,7 @@ def take(self, indices, allow_fill=False, fill_value=None): # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self) -> "ExtensionArray": + def copy(self: ExtensionArrayT) -> ExtensionArrayT: """ Return a copy of the array. @@ -1072,7 +1135,20 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: # Reshaping # ------------------------------------------------------------------------ - def ravel(self, order="C") -> "ExtensionArray": + def transpose(self, *axes) -> ExtensionArray: + """ + Return a transposed view on this array. + + Because ExtensionArrays are always 1D, this is a no-op. It is included + for compatibility with np.ndarray. + """ + return self[:] + + @property + def T(self) -> ExtensionArray: + return self.transpose() + + def ravel(self, order="C") -> ExtensionArray: """ Return a flattened view on this array. @@ -1093,8 +1169,8 @@ def ravel(self, order="C") -> "ExtensionArray": @classmethod def _concat_same_type( - cls, to_concat: Sequence["ExtensionArray"] - ) -> "ExtensionArray": + cls: Type[ExtensionArrayT], to_concat: Sequence[ExtensionArrayT] + ) -> ExtensionArrayT: """ Concatenate multiple array of this dtype. @@ -1120,7 +1196,7 @@ def _concat_same_type( # of objects _can_hold_na = True - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. @@ -1161,42 +1237,54 @@ class ExtensionOpsMixin: with NumPy arrays. 
""" + @classmethod + def _create_arithmetic_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_arithmetic_ops(cls): - cls.__add__ = cls._create_arithmetic_method(operator.add) - cls.__radd__ = cls._create_arithmetic_method(ops.radd) - cls.__sub__ = cls._create_arithmetic_method(operator.sub) - cls.__rsub__ = cls._create_arithmetic_method(ops.rsub) - cls.__mul__ = cls._create_arithmetic_method(operator.mul) - cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) - cls.__pow__ = cls._create_arithmetic_method(operator.pow) - cls.__rpow__ = cls._create_arithmetic_method(ops.rpow) - cls.__mod__ = cls._create_arithmetic_method(operator.mod) - cls.__rmod__ = cls._create_arithmetic_method(ops.rmod) - cls.__floordiv__ = cls._create_arithmetic_method(operator.floordiv) - cls.__rfloordiv__ = cls._create_arithmetic_method(ops.rfloordiv) - cls.__truediv__ = cls._create_arithmetic_method(operator.truediv) - cls.__rtruediv__ = cls._create_arithmetic_method(ops.rtruediv) - cls.__divmod__ = cls._create_arithmetic_method(divmod) - cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod) + setattr(cls, "__add__", cls._create_arithmetic_method(operator.add)) + setattr(cls, "__radd__", cls._create_arithmetic_method(ops.radd)) + setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub)) + setattr(cls, "__rsub__", cls._create_arithmetic_method(ops.rsub)) + setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul)) + setattr(cls, "__rmul__", cls._create_arithmetic_method(ops.rmul)) + setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow)) + setattr(cls, "__rpow__", cls._create_arithmetic_method(ops.rpow)) + setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) + setattr(cls, "__rmod__", cls._create_arithmetic_method(ops.rmod)) + setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) + setattr(cls, "__rfloordiv__", cls._create_arithmetic_method(ops.rfloordiv)) + setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) + setattr(cls, "__rtruediv__", cls._create_arithmetic_method(ops.rtruediv)) + setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod)) + setattr(cls, "__rdivmod__", cls._create_arithmetic_method(ops.rdivmod)) + + @classmethod + def _create_comparison_method(cls, op): + raise AbstractMethodError(cls) @classmethod def _add_comparison_ops(cls): - cls.__eq__ = cls._create_comparison_method(operator.eq) - cls.__ne__ = cls._create_comparison_method(operator.ne) - cls.__lt__ = cls._create_comparison_method(operator.lt) - cls.__gt__ = cls._create_comparison_method(operator.gt) - cls.__le__ = cls._create_comparison_method(operator.le) - cls.__ge__ = cls._create_comparison_method(operator.ge) + setattr(cls, "__eq__", cls._create_comparison_method(operator.eq)) + setattr(cls, "__ne__", cls._create_comparison_method(operator.ne)) + setattr(cls, "__lt__", cls._create_comparison_method(operator.lt)) + setattr(cls, "__gt__", cls._create_comparison_method(operator.gt)) + setattr(cls, "__le__", cls._create_comparison_method(operator.le)) + setattr(cls, "__ge__", cls._create_comparison_method(operator.ge)) + + @classmethod + def _create_logical_method(cls, op): + raise AbstractMethodError(cls) @classmethod def _add_logical_ops(cls): - cls.__and__ = cls._create_logical_method(operator.and_) - cls.__rand__ = cls._create_logical_method(ops.rand_) - cls.__or__ = cls._create_logical_method(operator.or_) - cls.__ror__ = cls._create_logical_method(ops.ror_) - cls.__xor__ = 
cls._create_logical_method(operator.xor) - cls.__rxor__ = cls._create_logical_method(ops.rxor) + setattr(cls, "__and__", cls._create_logical_method(operator.and_)) + setattr(cls, "__rand__", cls._create_logical_method(ops.rand_)) + setattr(cls, "__or__", cls._create_logical_method(operator.or_)) + setattr(cls, "__ror__", cls._create_logical_method(ops.ror_)) + setattr(cls, "__xor__", cls._create_logical_method(operator.xor)) + setattr(cls, "__rxor__", cls._create_logical_method(ops.rxor)) class ExtensionScalarOpsMixin(ExtensionOpsMixin): @@ -1273,7 +1361,7 @@ def convert_values(param): ovalues = [param] * len(self) return ovalues - if isinstance(other, (ABCSeries, ABCIndexClass)): + if isinstance(other, (ABCSeries, ABCIndexClass, ABCDataFrame)): # rely on pandas to unbox and dispatch to us return NotImplemented diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dbce71b77a425..44cc108ed9cfd 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -6,7 +6,6 @@ from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike -from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.core.dtypes.common import ( @@ -20,7 +19,6 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -28,7 +26,7 @@ from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow @register_extension_dtype @@ -59,8 +57,9 @@ class BooleanDtype(BaseMaskedDtype): name = "boolean" + # mypy: https://github.com/python/mypy/issues/4125 @property - def type(self) -> Type[np.bool_]: + def type(self) -> Type: # type: ignore[override] return np.bool_ @property @@ -99,7 +98,7 @@ def __from_arrow__( """ Construct BooleanArray from pyarrow Array/ChunkedArray. 
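A runnable toy version of the factory pattern above, assuming nothing beyond the standard library; `setattr` is used because directly assigning to `cls.__add__` trips mypy's method-assignment check:

    import operator

    class OpsDemo:
        def __init__(self, value):
            self.value = value

        @classmethod
        def _create_arithmetic_method(cls, op):
            def method(self, other):
                # each generated dunder closes over one operator
                return cls(op(self.value, other))
            return method

        @classmethod
        def _add_arithmetic_ops(cls):
            setattr(cls, "__add__", cls._create_arithmetic_method(operator.add))
            setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub))

    OpsDemo._add_arithmetic_ops()
    assert (OpsDemo(3) + 4).value == 7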
""" - import pyarrow # noqa: F811 + import pyarrow if isinstance(array, pyarrow.Array): chunks = [array] @@ -171,12 +170,13 @@ def coerce_to_array( values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's - if inferred_dtype in integer_like: - if not np.all( + if (inferred_dtype in integer_like) and not ( + np.all( values[~mask_values].astype(float) == values_object[~mask_values].astype(float) - ): - raise TypeError("Need to pass bool-like values") + ) + ): + raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) @@ -194,9 +194,9 @@ def coerce_to_array( if mask_values is not None: mask = mask | mask_values - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D list-like") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D list-like") return values, mask @@ -273,7 +273,9 @@ def dtype(self) -> BooleanDtype: return self._dtype @classmethod - def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "BooleanArray": + def _from_sequence( + cls, scalars, *, dtype=None, copy: bool = False + ) -> "BooleanArray": if dtype: assert dtype == "boolean" values, mask = coerce_to_array(scalars, copy=copy) @@ -281,7 +283,7 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "BooleanArra @classmethod def _from_sequence_of_strings( - cls, strings: List[str], dtype=None, copy: bool = False + cls, strings: List[str], *, dtype=None, copy: bool = False ) -> "BooleanArray": def map_string(s): if isna(s): @@ -294,7 +296,7 @@ def map_string(s): raise ValueError(f"{s} cannot be cast to bool") scalars = [map_string(x) for x in strings] - return cls._from_sequence(scalars, dtype, copy) + return cls._from_sequence(scalars, dtype=dtype, copy=copy) _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) @@ -376,7 +378,10 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if isinstance(dtype, BooleanDtype): values, mask = coerce_to_array(self, copy=copy) - return BooleanArray(values, mask, copy=False) + if not copy: + return self + else: + return BooleanArray(values, mask, copy=False) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -393,9 +398,8 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False ) # for integer, error if there are missing values - if is_integer_dtype(dtype): - if self._hasna: - raise ValueError("cannot convert NA to integer") + if is_integer_dtype(dtype) and self._hasna: + raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) na_value = self._na_value @@ -416,13 +420,13 @@ def _values_for_argsort(self) -> np.ndarray: See Also -------- - ExtensionArray.argsort + ExtensionArray.argsort : Return the indices that would sort this array. """ data = self._data.copy() data[self._mask] = -1 return data - def any(self, skipna: bool = True, **kwargs): + def any(self, *, skipna: bool = True, **kwargs): """ Return whether any element is True. @@ -490,7 +494,7 @@ def any(self, skipna: bool = True, **kwargs): else: return self.dtype.na_value - def all(self, skipna: bool = True, **kwargs): + def all(self, *, skipna: bool = True, **kwargs): """ Return whether all elements are True. 
@@ -557,108 +561,135 @@ def all(self, skipna: bool = True, **kwargs): else: return self.dtype.na_value - @classmethod - def _create_logical_method(cls, op): - def logical_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented + def _logical_method(self, other, op): - assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} - other = lib.item_from_zerodim(other) - other_is_booleanarray = isinstance(other, BooleanArray) - other_is_scalar = lib.is_scalar(other) - mask = None - - if other_is_booleanarray: - other, mask = other._data, other._mask - elif is_list_like(other): - other = np.asarray(other, dtype="bool") - if other.ndim > 1: - raise NotImplementedError( - "can only perform ops with 1-d structures" - ) - other, mask = coerce_to_array(other, copy=False) - elif isinstance(other, np.bool_): - other = other.item() - - if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): - raise TypeError( - "'other' should be pandas.NA or a bool. " - f"Got {type(other).__name__} instead." - ) - - if not other_is_scalar and len(self) != len(other): - raise ValueError("Lengths must match to compare") + assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} + other_is_booleanarray = isinstance(other, BooleanArray) + other_is_scalar = lib.is_scalar(other) + mask = None - if op.__name__ in {"or_", "ror_"}: - result, mask = ops.kleene_or(self._data, other, self._mask, mask) - elif op.__name__ in {"and_", "rand_"}: - result, mask = ops.kleene_and(self._data, other, self._mask, mask) - elif op.__name__ in {"xor", "rxor"}: - result, mask = ops.kleene_xor(self._data, other, self._mask, mask) + if other_is_booleanarray: + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + other, mask = coerce_to_array(other, copy=False) + elif isinstance(other, np.bool_): + other = other.item() - return BooleanArray(result, mask) + if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other): + raise TypeError( + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." + ) - name = f"__{op.__name__}__" - return set_function_name(logical_method, name, cls) + if not other_is_scalar and len(self) != len(other): + raise ValueError("Lengths must match to compare") - @classmethod - def _create_comparison_method(cls, op): - def cmp_method(self, other): - from pandas.arrays import IntegerArray - - if isinstance( - other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) - ): - # Rely on pandas to unbox and dispatch to us. 
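The `ops.kleene_*` helpers that `_logical_method` dispatches to implement three-valued logic, where NA propagates only when the result is genuinely undecidable. An illustrative session (values abridged from the full array repr):

    import pandas as pd

    a = pd.array([True, False, None], dtype="boolean")

    print(a | pd.NA)  # [True, <NA>, <NA>]   True | unknown is True
    print(a & pd.NA)  # [<NA>, False, <NA>]  False & unknown is False
    print(a ^ pd.NA)  # [<NA>, <NA>, <NA>]   xor with unknown is unknown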
-                return NotImplemented
+        if op.__name__ in {"or_", "ror_"}:
+            result, mask = ops.kleene_or(self._data, other, self._mask, mask)
+        elif op.__name__ in {"and_", "rand_"}:
+            result, mask = ops.kleene_and(self._data, other, self._mask, mask)
+        elif op.__name__ in {"xor", "rxor"}:
+            result, mask = ops.kleene_xor(self._data, other, self._mask, mask)

-            other = lib.item_from_zerodim(other)
-            mask = None
+        return BooleanArray(result, mask)

-            if isinstance(other, BooleanArray):
-                other, mask = other._data, other._mask
+    def _cmp_method(self, other, op):
+        from pandas.arrays import FloatingArray, IntegerArray

-            elif is_list_like(other):
-                other = np.asarray(other)
-                if other.ndim > 1:
-                    raise NotImplementedError(
-                        "can only perform ops with 1-d structures"
-                    )
-                if len(self) != len(other):
-                    raise ValueError("Lengths must match to compare")
+        if isinstance(other, (IntegerArray, FloatingArray)):
+            return NotImplemented

-            if other is libmissing.NA:
-                # numpy does not handle pd.NA well as "other" scalar (it returns
-                # a scalar False instead of an array)
-                result = np.zeros_like(self._data)
-                mask = np.ones_like(self._data)
+        mask = None
+
+        if isinstance(other, BooleanArray):
+            other, mask = other._data, other._mask
+
+        elif is_list_like(other):
+            other = np.asarray(other)
+            if other.ndim > 1:
+                raise NotImplementedError("can only perform ops with 1-d structures")
+            if len(self) != len(other):
+                raise ValueError("Lengths must match to compare")
+
+        if other is libmissing.NA:
+            # numpy does not handle pd.NA well as "other" scalar (it returns
+            # a scalar False instead of an array)
+            result = np.zeros_like(self._data)
+            mask = np.ones_like(self._data)
+        else:
+            # numpy will show a DeprecationWarning on invalid elementwise
+            # comparisons, this will raise in the future
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
+                with np.errstate(all="ignore"):
+                    result = op(self._data, other)
+
+        # nans propagate
+        if mask is None:
+            mask = self._mask.copy()
+        else:
-            else:
-                # numpy will show a DeprecationWarning on invalid elementwise
-                # comparisons, this will raise in the future
-                with warnings.catch_warnings():
-                    warnings.filterwarnings("ignore", "elementwise", FutureWarning)
-                    with np.errstate(all="ignore"):
-                        result = op(self._data, other)
-
-            # nans propagate
-            if mask is None:
-                mask = self._mask.copy()
-            else:
-                mask = self._mask | mask
+            mask = self._mask | mask

-            return BooleanArray(result, mask, copy=False)
+        return BooleanArray(result, mask, copy=False)
+
+    def _arith_method(self, other, op):
+        mask = None
+        op_name = op.__name__
+
+        if isinstance(other, BooleanArray):
+            other, mask = other._data, other._mask
+
+        elif is_list_like(other):
+            other = np.asarray(other)
+            if other.ndim > 1:
+                raise NotImplementedError("can only perform ops with 1-d structures")
+            if len(self) != len(other):
+                raise ValueError("Lengths must match")
+
+        # nans propagate
+        if mask is None:
+            mask = self._mask
+            if other is libmissing.NA:
+                mask |= True
+        else:
+            mask = self._mask | mask
+
+        if other is libmissing.NA:
+            # if other is NA, the result will be all NA and we can't run the
+            # actual op, so we need to choose the resulting dtype manually
+            if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}:
+                dtype = "int8"
+            else:
+                dtype = "bool"
+            result = np.zeros(len(self._data), dtype=dtype)
+        else:
+            if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
+                # Avoid DeprecationWarning: In future, it will be an error
+                # for 'np.bool_' scalars to be interpreted as an index
+                other = bool(other)
+
+            with np.errstate(all="ignore"):
+                result = op(self._data, other)
+
+        # divmod returns a tuple
+        if op_name == "divmod":
+            div, mod = result
+            return (
+                self._maybe_mask_result(div, mask, other, "floordiv"),
+                self._maybe_mask_result(mod, mask, other, "mod"),
+            )

-            name = f"__{op.__name__}"
-            return set_function_name(cmp_method, name, cls)
+        return self._maybe_mask_result(result, mask, other, op_name)

-    def _reduce(self, name: str, skipna: bool = True, **kwargs):
+    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         if name in {"any", "all"}:
             return getattr(self, name)(skipna=skipna, **kwargs)
-        return super()._reduce(name, skipna, **kwargs)
+        return super()._reduce(name, skipna=skipna, **kwargs)

     def _maybe_mask_result(self, result, mask, other, op_name: str):
         """
@@ -675,10 +706,11 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         if (is_float_dtype(other) or is_float(other)) or (
             op_name in ["rtruediv", "truediv"]
         ):
-            result[mask] = np.nan
-            return result
+            from pandas.core.arrays import FloatingArray
+
+            return FloatingArray(result, mask, copy=False)

-        if is_bool_dtype(result):
+        elif is_bool_dtype(result):
             return BooleanArray(result, mask, copy=False)

         elif is_integer_dtype(result):
@@ -688,66 +720,3 @@ def _maybe_mask_result(self, result, mask, other, op_name: str):
         else:
             result[mask] = np.nan
             return result
-
-    @classmethod
-    def _create_arithmetic_method(cls, op):
-        op_name = op.__name__
-
-        def boolean_arithmetic_method(self, other):
-
-            if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
-                # Rely on pandas to unbox and dispatch to us.
-                return NotImplemented
-
-            other = lib.item_from_zerodim(other)
-            mask = None
-
-            if isinstance(other, BooleanArray):
-                other, mask = other._data, other._mask
-
-            elif is_list_like(other):
-                other = np.asarray(other)
-                if other.ndim > 1:
-                    raise NotImplementedError(
-                        "can only perform ops with 1-d structures"
-                    )
-                if len(self) != len(other):
-                    raise ValueError("Lengths must match")
-
-            # nans propagate
-            if mask is None:
-                mask = self._mask
-                if other is libmissing.NA:
-                    mask |= True
-            else:
-                mask = self._mask | mask
-
-            if other is libmissing.NA:
-                # if other is NA, the result will be all NA and we can't run the
-                # actual op, so we need to choose the resulting dtype manually
-                if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}:
-                    dtype = "int8"
-                else:
-                    dtype = "bool"
-                result = np.zeros(len(self._data), dtype=dtype)
-            else:
-                with np.errstate(all="ignore"):
-                    result = op(self._data, other)
-
-            # divmod returns a tuple
-            if op_name == "divmod":
-                div, mod = result
-                return (
-                    self._maybe_mask_result(div, mask, other, "floordiv"),
-                    self._maybe_mask_result(mod, mask, other, "mod"),
-                )
-
-            return self._maybe_mask_result(result, mask, other, op_name)
-
-        name = f"__{op_name}__"
-        return set_function_name(boolean_arithmetic_method, name, cls)
-
-
-BooleanArray._add_logical_ops()
-BooleanArray._add_comparison_ops()
-BooleanArray._add_arithmetic_ops()
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 1fedfa70cc469..3995e7b251184 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2,7 +2,7 @@
 from functools import partial
 import operator
 from shutil import get_terminal_size
-from typing import Dict, Hashable, List, Type, Union, cast
+from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast
 from warnings import warn

 import numpy as np
@@ -10,9 +10,10 @@
 from pandas._config import get_option

 from pandas._libs import NaT, algos as libalgos, hashtable as htable
+from pandas._libs.lib import no_default
 from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
 from pandas.compat.numpy import function as nv
-from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc
+from pandas.util._decorators import cache_readonly, deprecate_kwarg
 from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

 from pandas.core.dtypes.cast import (
@@ -28,6 +29,7 @@
     is_dict_like,
     is_dtype_equal,
     is_extension_array_dtype,
+    is_hashable,
     is_integer_dtype,
     is_list_like,
     is_object_dtype,
@@ -37,38 +39,36 @@
 )
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
-from pandas.core.dtypes.inference import is_hashable
 from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna

 from pandas.core import ops
 from pandas.core.accessor import PandasDelegate, delegate_names
 import pandas.core.algorithms as algorithms
-from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d
-from pandas.core.array_algos.transforms import shift
-from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray
-from pandas.core.base import (
-    ExtensionArray,
-    NoNewAttributesMixin,
-    PandasObject,
-    _shared_docs,
-)
+from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
-from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing
+from pandas.core.indexers import deprecate_ndim_indexing
 from pandas.core.missing import interpolate_2d
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.sorting import nargsort
+from pandas.core.strings.object_array import ObjectStringArrayMixin

 from pandas.io.formats import console

+CategoricalT = TypeVar("CategoricalT", bound="Categorical")
+

 def _cat_compare_op(op):
     opname = f"__{op.__name__}__"
+    fill_value = True if op is operator.ne else False

     @unpack_zerodim_and_defer(opname)
     def func(self, other):
-        if is_list_like(other) and len(other) != len(self):
-            # TODO: Could this fail if the categories are listlike objects?
+        hashable = is_hashable(other)
+        if is_list_like(other) and len(other) != len(self) and not hashable:
+            # in hashable case we may have a tuple that is itself a category
             raise ValueError("Lengths must match.")

         if not self.ordered:
@@ -77,58 +77,41 @@ def func(self, other):
                 "Unordered Categoricals can only compare equality or not"
             )
         if isinstance(other, Categorical):
-            # Two Categoricals can only be be compared if the categories are
+            # Two Categoricals can only be compared if the categories are
             # the same (maybe up to ordering, depending on ordered)

             msg = "Categoricals can only be compared if 'categories' are the same."
- if len(self.categories) != len(other.categories): - raise TypeError(msg + " Categories are different lengths") - elif self.ordered and not (self.categories == other.categories).all(): - raise TypeError(msg) - elif not set(self.categories) == set(other.categories): + if not self._categories_match_up_to_permutation(other): raise TypeError(msg) - if not (self.ordered == other.ordered): - raise TypeError( - "Categoricals can only be compared if 'ordered' is the same" - ) if not self.ordered and not self.categories.equals(other.categories): # both unordered and different order - other_codes = _get_codes_for_values(other, self.categories) + other_codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) else: other_codes = other._codes - f = getattr(self._codes, opname) - ret = f(other_codes) + ret = op(self._codes, other_codes) mask = (self._codes == -1) | (other_codes == -1) if mask.any(): - # In other series, the leads to False, so do that here too - if opname == "__ne__": - ret[(self._codes == -1) & (other_codes == -1)] = True - else: - ret[mask] = False + ret[mask] = fill_value return ret - if is_scalar(other): + if hashable: if other in self.categories: - i = self.categories.get_loc(other) - ret = getattr(self._codes, opname)(i) + i = self._unbox_scalar(other) + ret = op(self._codes, i) if opname not in {"__eq__", "__ge__", "__gt__"}: - # check for NaN needed if we are not equal or larger + # GH#29820 performance trick; get_loc will always give i>=0, + # so in the cases (__ne__, __le__, __lt__) the setting + # here is a no-op, so can be skipped. mask = self._codes == -1 - ret[mask] = False + ret[mask] = fill_value return ret else: - if opname == "__eq__": - return np.zeros(len(self), dtype=bool) - elif opname == "__ne__": - return np.ones(len(self), dtype=bool) - else: - raise TypeError( - f"Cannot compare a Categorical for op {opname} with a " - "scalar, which is not a category." - ) + return ops.invalid_comparison(self, other, op) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons @@ -201,7 +184,7 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject): +class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): """ Represent a categorical variable in classic R / S-plus fashion. @@ -280,6 +263,19 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject): ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] + Missing values are not included as a category. + + >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) + >>> c + [1, 2, 3, 1, 2, 3, NaN] + Categories (3, int64): [1, 2, 3] + + However, their presence is indicated in the `codes` attribute + by code `-1`. + + >>> c.codes + array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) + Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. 
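What the rewritten `_cat_compare_op` does in user terms: scalars that are categories compare through the integer codes, missing slots get `fill_value`, and non-category scalars fall back to `invalid_comparison` instead of raising for equality checks. A sketch assuming pandas 1.2+ behavior:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "a", None])

    print(cat == "a")  # [ True False  True False]  compared via codes
    print(cat != "a")  # [False  True False  True]  NA slot filled with True
    print(cat == "z")  # [False False False False]  "z" is not a category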
@@ -297,8 +293,9 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = PandasObject._deprecations | frozenset(["tolist"]) + _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) _typ = "categorical" + _can_hold_na = True def __init__( self, values, categories=None, ordered=None, dtype=None, fastpath=False @@ -328,7 +325,7 @@ def __init__( # sanitize_array coerces np.nan to a string under certain versions # of numpy values = maybe_infer_to_datetimelike(values, convert_dates=True) - if not isinstance(values, np.ndarray): + if not isinstance(values, (np.ndarray, ExtensionArray)): values = com.convert_to_list_like(values) # By convention, empty lists result in object dtype: @@ -362,9 +359,7 @@ def __init__( dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values.dtype): - old_codes = ( - values._values.codes if isinstance(values, ABCSeries) else values.codes - ) + old_codes = extract_array(values).codes codes = recode_for_categories( old_codes, values.dtype.categories, dtype.categories ) @@ -381,56 +376,6 @@ def __init__( self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) - @property - def categories(self): - """ - The categories of this categorical. - - Setting assigns new values to each category (effectively a rename of - each individual category). - - The assigned value has to be a list-like object. All items must be - unique and the number of items in the new categories must be the same - as the number of items in the old categories. - - Assigning to `categories` is a inplace operation! - - Raises - ------ - ValueError - If the new categories do not validate as categories or if the - number of new categories is unequal the number of old categories - - See Also - -------- - rename_categories : Rename categories. - reorder_categories : Reorder categories. - add_categories : Add new categories. - remove_categories : Remove the specified categories. - remove_unused_categories : Remove categories which are not used. - set_categories : Set the categories to the specified ones. - """ - return self.dtype.categories - - @categories.setter - def categories(self, categories): - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if self.dtype.categories is not None and len(self.dtype.categories) != len( - new_dtype.categories - ): - raise ValueError( - "new categories need to have the same number of " - "items as the old categories!" - ) - self._dtype = new_dtype - - @property - def ordered(self) -> Ordered: - """ - Whether the categories have an ordered relationship. - """ - return self.dtype.ordered - @property def dtype(self) -> CategoricalDtype: """ @@ -443,13 +388,9 @@ def _constructor(self) -> Type["Categorical"]: return Categorical @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) - def _formatter(self, boxed=False): - # Defer to CategoricalFormatter's formatter. - return None - def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype @@ -462,20 +403,42 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: If copy is set to False and dtype is categorical, the original object is returned. 
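Both the base `ExtensionArray.astype` and the categorical version above now short-circuit when the target dtype already matches, so `copy=False` can hand back `self` unchanged. A quick check of the intended contract (hedged: behavior as of this change):

    import pandas as pd

    cat = pd.Categorical(["a", "b"])

    assert cat.astype(cat.dtype, copy=False) is cat      # no cast, no copy
    assert cat.astype(cat.dtype, copy=True) is not cat   # copy still honored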
""" - if is_categorical_dtype(dtype): + if self.dtype is dtype: + result = self.copy() if copy else self + + elif is_categorical_dtype(dtype): dtype = cast(Union[str, CategoricalDtype], dtype) - # GH 10696/18593 + # GH 10696/18593/18630 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - if dtype == self.dtype: - return self - return self._set_dtype(dtype) - if is_extension_array_dtype(dtype): - return array(self, dtype=dtype, copy=copy) - if is_integer_dtype(dtype) and self.isna().any(): + result = self._set_dtype(dtype) + + # TODO: consolidate with ndarray case? + elif is_extension_array_dtype(dtype): + result = array(self, dtype=dtype, copy=copy) + + elif is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - return np.array(self, dtype=dtype, copy=copy) + + elif len(self.codes) == 0 or len(self.categories) == 0: + result = np.array(self, dtype=dtype, copy=copy) + + else: + # GH8628 (PERF): astype category codes instead of astyping array + try: + astyped_cats = self.categories.astype(dtype=dtype, copy=copy) + except ( + TypeError, # downstream error msg for CategoricalIndex is misleading + ValueError, + ): + msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" + raise ValueError(msg) + + astyped_cats = extract_array(astyped_cats, extract_numpy=True) + result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) + + return result @cache_readonly def itemsize(self) -> int: @@ -520,7 +483,7 @@ def _from_inferred_categories( ------- Categorical """ - from pandas import Index, to_numeric, to_datetime, to_timedelta + from pandas import Index, to_datetime, to_numeric, to_timedelta cats = Index(inferred_categories) known_categories = ( @@ -628,6 +591,59 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): return cls(codes, dtype=dtype, fastpath=True) + # ------------------------------------------------------------------ + # Categories/Codes/Ordered + + @property + def categories(self): + """ + The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" + ) + self._dtype = new_dtype + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. 
+ """ + return self.dtype.ordered + @property def codes(self) -> np.ndarray: """ @@ -732,8 +748,8 @@ def as_ordered(self, inplace=False): Returns ------- - Categorical - Ordered Categorical. + Categorical or None + Ordered Categorical or None if ``inplace=True``. """ inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(True, inplace=inplace) @@ -750,8 +766,8 @@ def as_unordered(self, inplace=False): Returns ------- - Categorical - Unordered Categorical. + Categorical or None + Unordered Categorical or None if ``inplace=True``. """ inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) @@ -850,8 +866,6 @@ def rename_categories(self, new_categories, inplace=False): * callable : a callable that is called on all items in the old categories and whose return values comprise the new categories. - .. versionadded:: 0.23.0. - inplace : bool, default False Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. @@ -859,8 +873,7 @@ def rename_categories(self, new_categories, inplace=False): Returns ------- cat : Categorical or None - With ``inplace=False``, the new categorical is returned. - With ``inplace=True``, there is no return value. + Categorical with removed categories or None if ``inplace=True``. Raises ------ @@ -928,7 +941,8 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): Returns ------- - cat : Categorical with reordered categories or None if inplace. + cat : Categorical or None + Categorical with removed categories or None if ``inplace=True``. Raises ------ @@ -968,7 +982,8 @@ def add_categories(self, new_categories, inplace=False): Returns ------- - cat : Categorical with new categories added or None if inplace. + cat : Categorical or None + Categorical with new categories added or None if ``inplace=True``. Raises ------ @@ -1018,7 +1033,8 @@ def remove_categories(self, removals, inplace=False): Returns ------- - cat : Categorical with removed categories or None if inplace. + cat : Categorical or None + Categorical with removed categories or None if ``inplace=True``. Raises ------ @@ -1053,7 +1069,7 @@ def remove_categories(self, removals, inplace=False): new_categories, ordered=self.ordered, rename=False, inplace=inplace ) - def remove_unused_categories(self, inplace=False): + def remove_unused_categories(self, inplace=no_default): """ Remove categories which are not used. @@ -1063,9 +1079,12 @@ def remove_unused_categories(self, inplace=False): Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. + .. deprecated:: 1.2.0 + Returns ------- - cat : Categorical with unused categories dropped or None if inplace. + cat : Categorical or None + Categorical with unused categories dropped or None if ``inplace=True``. See Also -------- @@ -1075,6 +1094,17 @@ def remove_unused_categories(self, inplace=False): remove_categories : Remove the specified categories. set_categories : Set the categories to the specified ones. """ + if inplace is not no_default: + warn( + "The `inplace` parameter in pandas.Categorical." 
+ "remove_unused_categories is deprecated and " + "will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) @@ -1092,6 +1122,8 @@ def remove_unused_categories(self, inplace=False): if not inplace: return cat + # ------------------------------------------------------------------ + def map(self, mapper): """ Map categories using input correspondence (dict, Series, or function). @@ -1180,39 +1212,23 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) - def shift(self, periods, fill_value=None): - """ - Shift Categorical by desired number of periods. - - Parameters - ---------- - periods : int - Number of periods to move, can be positive or negative - fill_value : object, optional - The scalar value to use for newly introduced missing values. + # ------------------------------------------------------------- + # Validators; ideally these can be de-duplicated - .. versionadded:: 0.24.0 - - Returns - ------- - shifted : Categorical - """ - # since categoricals always have ndim == 1, an axis parameter - # doesn't make any sense here. - codes = self.codes - if codes.ndim > 1: - raise NotImplementedError("Categorical with ndim > 1.") - - fill_value = self._validate_fill_value(fill_value) - - codes = shift(codes, periods, axis=0, fill_value=fill_value) - - return self._constructor(codes, dtype=self.dtype, fastpath=True) + def _validate_searchsorted_value(self, value): + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + if is_scalar(value): + codes = self._unbox_scalar(value) + else: + locs = [self.categories.get_loc(x) for x in value] + codes = np.array(locs, dtype=self.codes.dtype) + return codes def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our - underlying ndarray, raising ValueError if this is not possible. + underlying ndarray, raising TypeError if this is not possible. Parameters ---------- @@ -1224,20 +1240,24 @@ def _validate_fill_value(self, fill_value): Raises ------ - ValueError + TypeError """ - if isna(fill_value): + if is_valid_nat_for_dtype(fill_value, self.categories.dtype): fill_value = -1 elif fill_value in self.categories: - fill_value = self.categories.get_loc(fill_value) + fill_value = self._unbox_scalar(fill_value) else: - raise ValueError( + raise TypeError( f"'fill_value={fill_value}' is not present " "in this Categorical's categories" ) return fill_value + _validate_scalar = _validate_fill_value + + # ------------------------------------------------------------- + def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1249,15 +1269,13 @@ def __array__(self, dtype=None) -> np.ndarray: if dtype==None (default), the same dtype as categorical.categories.dtype. """ - ret = take_1d(self.categories.values, self._codes) + ret = take_1d(self.categories._values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) - if is_extension_array_dtype(ret): - # When we're a Categorical[ExtensionArray], like Interval, - # we need to ensure __array__ get's all the way to an - # ndarray. 
- ret = np.asarray(ret) - return ret + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ gets all the way to an + # ndarray. + return np.asarray(ret) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods @@ -1286,10 +1304,10 @@ def __setstate__(self, state): setattr(self, k, v) @property - def nbytes(self): + def nbytes(self) -> int: return self._codes.nbytes + self.dtype.categories.values.nbytes - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: """ Memory usage of my values @@ -1314,18 +1332,6 @@ def memory_usage(self, deep=False): """ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) - @doc(_shared_docs["searchsorted"], klass="Categorical") - def searchsorted(self, value, side="left", sorter=None): - # searchsorted is very performance sensitive. By converting codes - # to same dtype as self.codes, we get much faster performance. - if is_scalar(value): - codes = self.categories.get_loc(value) - codes = self.codes.dtype.type(codes) - else: - locs = [self.categories.get_loc(x) for x in value] - codes = np.array(locs, dtype=self.codes.dtype) - return self.codes.searchsorted(codes, side=side, sorter=sorter) - def isna(self): """ Detect missing values @@ -1343,8 +1349,7 @@ def isna(self): Categorical.notna : Boolean inverse of Categorical.isna. """ - ret = self._codes == -1 - return ret + return self._codes == -1 isnull = isna @@ -1370,20 +1375,6 @@ def notna(self): notnull = notna - def dropna(self): - """ - Return the Categorical without null values. - - Missing values (-1 in .codes) are detected. - - Returns - ------- - valid : Categorical - """ - result = self[self.notna()] - - return result - def value_counts(self, dropna=True): """ Return a Series containing counts of each category. @@ -1403,10 +1394,10 @@ def value_counts(self, dropna=True): -------- Series.value_counts """ - from pandas import Series, CategoricalIndex + from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories - ncat, mask = len(cat), 0 <= code + ncat, mask = (len(cat), code >= 0) ix, clean = np.arange(ncat), mask.all() if dropna or clean: @@ -1416,7 +1407,7 @@ def value_counts(self, dropna=True): count = np.bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, dtype=self.dtype, fastpath=True) + ix = self._from_backing_data(ix) return Series(count, index=CategoricalIndex(ix), dtype="int64") @@ -1448,9 +1439,6 @@ def check_for_ordered(self, op): "Categorical to an ordered one\n" ) - def _values_for_argsort(self): - return self._codes - def argsort(self, ascending=True, kind="quicksort", **kwargs): """ Return the indices that would sort the Categorical. 
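The `value_counts` change counts through the codes with a single `bincount`, remapping the `-1` NA sentinel to an extra bucket at position `ncat`. A standalone sketch; the `minlength` guard is my addition so the NA bucket exists even when no code is missing:

    import numpy as np

    codes = np.array([0, 1, 0, -1, 1, 1])  # -1 marks a missing value
    ncat = 2

    mask = codes >= 0
    counts = np.bincount(np.where(mask, codes, ncat), minlength=ncat + 1)
    print(counts)  # [2 3 1] -> two of category 0, three of category 1, one NA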
@@ -1505,7 +1493,7 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs): return super().argsort(ascending=ascending, kind=kind, **kwargs) def sort_values( - self, inplace: bool = False, ascending: bool = True, na_position: str = "last", + self, inplace: bool = False, ascending: bool = True, na_position: str = "last" ): """ Sort the Categorical by category value returning a new @@ -1587,11 +1575,10 @@ def sort_values( sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: - self._codes = self._codes[sorted_idx] + self._codes[:] = self._codes[sorted_idx] else: - return self._constructor( - values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True - ) + codes = self._codes[sorted_idx] + return self._from_backing_data(codes) def _values_for_rank(self): """ @@ -1626,7 +1613,7 @@ def _values_for_rank(self): def view(self, dtype=None): if dtype is not None: raise NotImplementedError(dtype) - return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True) + return self._from_backing_data(self._ndarray) def to_dense(self): """ @@ -1678,6 +1665,7 @@ def fillna(self, value=None, method=None, limit=None): value, method = validate_fillna_kwargs( value, method, validate_scalar_dict_value=False ) + value = extract_array(value, extract_numpy=True) if value is None: value = np.nan @@ -1686,130 +1674,31 @@ def fillna(self, value=None, method=None, limit=None): "specifying a limit for fillna has not been implemented yet" ) - codes = self._codes - - # pad / bfill if method is not None: + # pad / bfill # TODO: dispatch when self.categories is EA-dtype values = np.asarray(self).reshape(-1, len(self)) - values = interpolate_2d(values, method, 0, None, value).astype( + values = interpolate_2d(values, method, 0, None).astype( self.categories.dtype )[0] codes = _get_codes_for_values(values, self.categories) else: + # We copy even if there is nothing to fill + codes = self._ndarray.copy() + mask = self.isna() + + new_codes = self._validate_setitem_value(value) - # If value is a dict or a Series (a dict value has already - # been converted to a Series) - if isinstance(value, (np.ndarray, Categorical, ABCSeries)): + if isinstance(value, (np.ndarray, Categorical)): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here - - mask = ~algorithms.isin(value, self.categories) - if not isna(value[mask]).all(): - raise ValueError("fill value must be in categories") - - values_codes = _get_codes_for_values(value, self.categories) - indexer = np.where(codes == -1) - codes = codes.copy() - codes[indexer] = values_codes[indexer] - - # If value is not a dict or Series it should be a scalar - elif is_hashable(value): - if not isna(value) and value not in self.categories: - raise ValueError("fill value must be in categories") - - mask = codes == -1 - if mask.any(): - codes = codes.copy() - if isna(value): - codes[mask] = -1 - else: - codes[mask] = self.categories.get_loc(value) - + codes[mask] = new_codes[mask] else: - raise TypeError( - f"'value' parameter must be a scalar, dict " - f"or Series, but you passed a {type(value).__name__}" - ) - - return self._constructor(codes, dtype=self.dtype, fastpath=True) - - def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T: - """ - Take elements from the Categorical. - - Parameters - ---------- - indexer : sequence of int - The indices in `self` to take. The meaning of negative values in - `indexer` depends on the value of `allow_fill`. 
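The `fillna` rewrite above works entirely at the codes level: the fill value is validated into codes once via `_validate_setitem_value`, then written through the NA mask on a copy of the backing array. The idea in miniature:

    import numpy as np

    codes = np.array([0, -1, 1, -1], dtype=np.int8)  # -1 is the NA sentinel
    fill_code = np.int8(1)  # code of the already-validated fill value

    new_codes = codes.copy()            # copy even when nothing needs filling
    new_codes[codes == -1] = fill_code
    print(new_codes)                    # [0 1 1 1]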
- allow_fill : bool, default False - How to handle negative values in `indexer`. - - * False: negative values in `indices` indicate positional indices - from the right. This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate missing values - (the default). These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - .. versionchanged:: 1.0.0 - - Default value changed from ``True`` to ``False``. - - fill_value : object - The value to use for `indices` that are missing (-1), when - ``allow_fill=True``. This should be the category, i.e. a value - in ``self.categories``, not a code. - - Returns - ------- - Categorical - This Categorical will have the same categories and ordered as - `self`. - - See Also - -------- - Series.take : Similar method for Series. - numpy.ndarray.take : Similar method for NumPy arrays. - - Examples - -------- - >>> cat = pd.Categorical(['a', 'a', 'b']) - >>> cat - ['a', 'a', 'b'] - Categories (2, object): ['a', 'b'] - - Specify ``allow_fill==False`` to have negative indices mean indexing - from the right. - - >>> cat.take([0, -1, -2], allow_fill=False) - ['a', 'b', 'a'] - Categories (2, object): ['a', 'b'] - - With ``allow_fill=True``, indices equal to ``-1`` mean "missing" - values that should be filled with the `fill_value`, which is - ``np.nan`` by default. - - >>> cat.take([0, -1, -1], allow_fill=True) - ['a', NaN, NaN] - Categories (2, object): ['a', 'b'] + codes[mask] = new_codes - The fill value can be specified. - - >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') - ['a', 'a', 'a'] - Categories (2, object): ['a', 'b'] - - Specifying a fill value that's not in ``self.categories`` - will raise a ``ValueError``. - """ - return NDArrayBackedExtensionArray.take( - self, indexer, allow_fill=allow_fill, fill_value=fill_value - ) + return self._from_backing_data(codes) # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @@ -1821,6 +1710,18 @@ def _ndarray(self) -> np.ndarray: def _from_backing_data(self, arr: np.ndarray) -> "Categorical": return self._constructor(arr, dtype=self.dtype, fastpath=True) + def _box_func(self, i: int): + if i == -1: + return np.NaN + return self.categories[i] + + def _unbox_scalar(self, key) -> int: + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + code = self.categories.get_loc(key) + code = self._codes.dtype.type(code) + return code + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1848,6 +1749,13 @@ def __contains__(self, key) -> bool: return contains(self, key, container=self._codes) + # ------------------------------------------------------------------ + # Rendering Methods + + def _formatter(self, boxed=False): + # Defer to CategoricalFormatter's formatter. + return None + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default @@ -1946,59 +1854,34 @@ def __repr__(self) -> str: return result - def _maybe_coerce_indexer(self, indexer): - """ - return an indexer coerced to the codes dtype - """ - if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": - indexer = indexer.astype(self._codes.dtype) - return indexer + # ------------------------------------------------------------------ def __getitem__(self, key): """ Return an item. 
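`_box_func` and `_unbox_scalar` are the two directions of the scalar/code translation that scalar indexing now routes through; code `-1` boxes to NaN. Observable behavior:

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(["a", "b", None])

    assert cat[0] == "a"     # boxing: code 0 -> cat.categories[0]
    assert np.isnan(cat[2])  # code -1 boxes to NaN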
""" - if isinstance(key, (int, np.integer)): - i = self._codes[key] - if i == -1: - return np.nan - else: - return self.categories[i] - - key = check_array_indexer(self, key) - - result = self._codes[key] - if result.ndim > 1: + result = super().__getitem__(key) + if getattr(result, "ndim", 0) > 1: + result = result._ndarray deprecate_ndim_indexing(result) - return result - return self._constructor(result, dtype=self.dtype, fastpath=True) - - def __setitem__(self, key, value): - """ - Item assignment. + return result - Raises - ------ - ValueError - If (one or more) Value is not in categories or if a assigned - `Categorical` does not have the same categories - """ + def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) # require identical categories set if isinstance(value, Categorical): - if not is_dtype_equal(self, value): + if not is_dtype_equal(self.dtype, value.dtype): raise ValueError( "Cannot set a Categorical with another, " "without identical categories" ) - if not self.categories.equals(value.categories): - new_codes = recode_for_categories( - value.codes, value.categories, self.categories - ) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + # is_dtype_equal implies categories_match_up_to_permutation + value = self._encode_with_my_categories(value) + return value._codes - rvalue = value if is_list_like(value) else [value] + # wrap scalars and hashable-listlikes in list + rvalue = value if not is_hashable(value) else [value] from pandas import Index @@ -2012,34 +1895,8 @@ def __setitem__(self, key, value): "category, set the categories first" ) - # set by position - if isinstance(key, (int, np.integer)): - pass - - # tuple of indexers (dataframe) - elif isinstance(key, tuple): - # only allow 1 dimensional slicing, but can - # in a 2-d case be passed (slice(None),....) - if len(key) == 2: - if not com.is_null_slice(key[0]): - raise AssertionError("invalid slicing for a 1-ndim categorical") - key = key[1] - elif len(key) == 1: - key = key[0] - else: - raise AssertionError("invalid slicing for a 1-ndim categorical") - - # slicing in Series or Categorical - elif isinstance(key, slice): - pass - - # else: array of True/False in Series or Categorical - - lindexer = self.categories.get_indexer(rvalue) - lindexer = self._maybe_coerce_indexer(lindexer) - - key = check_array_indexer(self, key) - self._codes[key] = lindexer + codes = self.categories.get_indexer(rvalue) + return codes.astype(self._ndarray.dtype, copy=False) def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ @@ -2072,18 +1929,13 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, _result)) - return result + return dict(zip(categories, _result)) - # reduction ops # - def _reduce(self, name, axis=0, **kwargs): - func = getattr(self, name, None) - if func is None: - raise TypeError(f"Categorical cannot perform the operation {name}") - return func(**kwargs) + # ------------------------------------------------------------------ + # Reductions @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") - def min(self, skipna=True, **kwargs): + def min(self, *, skipna=True, **kwargs): """ The minimum value of the object. 
@@ -2102,6 +1954,7 @@ def min(self, skipna=True, **kwargs): ------- min : the minimum of this `Categorical` """ + nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_min((), kwargs) self.check_for_ordered("min") @@ -2116,10 +1969,10 @@ def min(self, skipna=True, **kwargs): return np.nan else: pointer = self._codes.min() - return self.categories[pointer] + return self._wrap_reduction_result(None, pointer) @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") - def max(self, skipna=True, **kwargs): + def max(self, *, skipna=True, **kwargs): """ The maximum value of the object. @@ -2138,6 +1991,7 @@ def max(self, skipna=True, **kwargs): ------- max : the maximum of this `Categorical` """ + nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_max((), kwargs) self.check_for_ordered("max") @@ -2152,7 +2006,7 @@ def max(self, skipna=True, **kwargs): return np.nan else: pointer = self._codes.max() - return self.categories[pointer] + return self._wrap_reduction_result(None, pointer) def mode(self, dropna=True): """ @@ -2176,7 +2030,10 @@ def mode(self, dropna=True): good = self._codes != -1 codes = self._codes[good] codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) - return self._constructor(values=codes, dtype=self.dtype, fastpath=True) + return self._from_backing_data(codes) + + # ------------------------------------------------------------------ + # ExtensionArray Interface def unique(self): """ @@ -2196,7 +2053,7 @@ def unique(self): -------- pandas.unique CategoricalIndex.unique - Series.unique + Series.unique : Return unique values of Series object. Examples -------- @@ -2233,8 +2090,7 @@ def unique(self): return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): - codes = self.codes.astype("int64") - return codes, -1 + return self._ndarray, -1 @classmethod def _from_factorized(cls, uniques, original): @@ -2242,7 +2098,7 @@ def _from_factorized(cls, uniques, original): original.categories.take(uniques), dtype=original.dtype ) - def equals(self, other): + def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. @@ -2254,18 +2110,41 @@ def equals(self, other): ------- bool """ - if self.is_dtype_equal(other): - if self.categories.equals(other.categories): - # fastpath to avoid re-coding - other_codes = other._codes - else: - other_codes = recode_for_categories( - other.codes, other.categories, self.categories - ) - return np.array_equal(self._codes, other_codes) + if not isinstance(other, Categorical): + return False + elif self._categories_match_up_to_permutation(other): + other = self._encode_with_my_categories(other) + return np.array_equal(self._codes, other._codes) return False - def is_dtype_equal(self, other): + @classmethod + def _concat_same_type( + cls: Type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 + ) -> CategoricalT: + from pandas.core.dtypes.concat import union_categoricals + + return union_categoricals(to_concat) + + # ------------------------------------------------------------------ + + def _encode_with_my_categories(self, other: "Categorical") -> "Categorical": + """ + Re-encode another categorical using this Categorical's categories. + + Notes + ----- + This assumes we have already checked + self._categories_match_up_to_permutation(other). + """ + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. 
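+        # Editorial illustration (assumed example, not in the original diff):
+        # encoding other = Categorical(["a"], categories=["b", "a"]) (code 1)
+        # against self.categories == ["a", "b"] recodes it to code 0 --
+        # the value is unchanged, only the code table differs.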
+ codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) + return self._from_backing_data(codes) + + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype same categories, and same ordered @@ -2278,8 +2157,17 @@ def is_dtype_equal(self, other): ------- bool """ + return hash(self.dtype) == hash(other.dtype) + + def is_dtype_equal(self, other) -> bool: + warn( + "Categorical.is_dtype_equal is deprecated and will be removed " + "in a future version", + FutureWarning, + stacklevel=2, + ) try: - return hash(self.dtype) == hash(other.dtype) + return self._categories_match_up_to_permutation(other) except (AttributeError, TypeError): return False @@ -2303,18 +2191,7 @@ def describe(self): return result - # Implement the ExtensionArray interface - @property - def _can_hold_na(self): - return True - - @classmethod - def _concat_same_type(self, to_concat): - from pandas.core.dtypes.concat import union_categoricals - - return union_categoricals(to_concat) - - def isin(self, values): + def isin(self, values) -> np.ndarray: """ Check whether `values` are contained in Categorical. @@ -2426,6 +2303,25 @@ def replace(self, to_replace, value, inplace: bool = False): if not inplace: return cat + # ------------------------------------------------------------------------ + # String methods interface + def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): + # Optimization to apply the callable `f` to the categories once + # and rebuild the result by `take`ing from the result with the codes. + # Returns the same type as the object-dtype implementation though. + from pandas.core.arrays import PandasArray + + categories = self.categories + codes = self.codes + result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) + return take_1d(result, codes, fill_value=na_value) + + def _str_get_dummies(self, sep="|"): + # sep may not be in categories. Just bail on this. + from pandas.core.arrays import PandasArray + + return PandasArray(self.astype(str))._str_get_dummies(sep) + # The Series.cat accessor @@ -2596,9 +2492,11 @@ def _delegate_method(self, name, *args, **kwargs): # utility routines -def _get_codes_for_values(values, categories): +def _get_codes_for_values(values, categories) -> np.ndarray: """ utility routine to turn values into codes given the specified categories + + If `values` is known to be a Categorical, use recode_for_categories instead. """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) @@ -2621,14 +2519,16 @@ def _get_codes_for_values(values, categories): # Only hit here when we've already coerced to object dtypee. - hash_klass, vals = _get_data_algo(values) - _, cats = _get_data_algo(categories) + hash_klass, vals = get_data_algo(values) + _, cats = get_data_algo(categories) t = hash_klass(len(cats)) t.map_locations(cats) return coerce_indexer_dtype(t.lookup(vals), cats) -def recode_for_categories(codes: np.ndarray, old_categories, new_categories): +def recode_for_categories( + codes: np.ndarray, old_categories, new_categories, copy: bool = True +) -> np.ndarray: """ Convert a set of codes for to a new set of categories @@ -2636,6 +2536,8 @@ def recode_for_categories(codes: np.ndarray, old_categories, new_categories): ---------- codes : np.ndarray old_categories, new_categories : Index + copy: bool, default True + Whether to copy if the codes are unchanged. 
Returns ------- @@ -2651,14 +2553,19 @@ def recode_for_categories(codes: np.ndarray, old_categories, new_categories): """ if len(old_categories) == 0: # All null anyway, so just retain the nulls - return codes.copy() + if copy: + return codes.copy() + return codes elif new_categories.equals(old_categories): # Same categories, so no need to actually recode - return codes.copy() + if copy: + return codes.copy() + return codes + indexer = coerce_indexer_dtype( new_categories.get_indexer(old_categories), new_categories ) - new_codes = take_1d(indexer, codes.copy(), fill_value=-1) + new_codes = take_1d(indexer, codes, fill_value=-1) return new_codes diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a306268cd8ede..be9864731842d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1,6 +1,19 @@ +from __future__ import annotations + from datetime import datetime, timedelta import operator -from typing import Any, Callable, Optional, Sequence, Tuple, Type, TypeVar, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, +) import warnings import numpy as np @@ -24,11 +37,9 @@ round_nsint64, ) from pandas._typing import DatetimeLikeScalar, DtypeObj -from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning -from pandas.util._decorators import Appender, Substitution -from pandas.util._validators import validate_fillna_kwargs +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -48,101 +59,64 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import missing, nanops, ops -from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts -from pandas.core.array_algos.transforms import shift -from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core import nanops, ops +from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray import pandas.core.common as com from pandas.core.construction import array, extract_array -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import check_array_indexer, check_setitem_lengths from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op from pandas.tseries import frequencies +if TYPE_CHECKING: + from pandas.core.arrays import DatetimeArray, TimedeltaArray + DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType] +DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") -def _datetimelike_array_cmp(cls, op): +class InvalidComparison(Exception): """ - Wrap comparison operations to convert Timestamp/Timedelta/Period-like to - boxed scalars/arrays. + Raised by _validate_comparison_value to indicate to caller it should + return invalid_comparison. 
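+
+    Editorial note (not in the original diff): e.g. ``dti == "not-a-date"``
+    funnels through this exception and, for ``==``, comes back from
+    ``invalid_comparison`` as an all-False ndarray instead of raising.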
""" - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - class InvalidComparison(Exception): - pass - - def _validate_comparison_value(self, other): - if isinstance(other, str): - try: - # GH#18435 strings get a pass from tzawareness compat - other = self._scalar_from_string(other) - except ValueError: - # failed to parse as Timestamp/Timedelta/Period - raise InvalidComparison(other) - - if isinstance(other, self._recognized_scalars) or other is NaT: - other = self._scalar_type(other) - self._check_compatible_with(other) - - elif not is_list_like(other): - raise InvalidComparison(other) - - elif len(other) != len(self): - raise ValueError("Lengths must match") - - else: - try: - other = self._validate_listlike(other, opname, allow_object=True) - except TypeError as err: - raise InvalidComparison(other) from err - - return other - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - if self.ndim > 1 and getattr(other, "shape", None) == self.shape: - # TODO: handle 2D-like listlikes - return op(self.ravel(), other.ravel()).reshape(self.shape) - - try: - other = _validate_comparison_value(self, other) - except InvalidComparison: - return invalid_comparison(self, other, op) - - dtype = getattr(other, "dtype", None) - if is_object_dtype(dtype): - # We have to use comp_method_OBJECT_ARRAY instead of numpy - # comparison otherwise it would fail to raise when - # comparing tz-aware and tz-naive - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY(op, self.astype(object), other) - return result - other_i8 = self._unbox(other) - result = op(self.asi8, other_i8) + pass - o_mask = isna(other) - if self._hasnans | np.any(o_mask): - result[self._isnan | o_mask] = nat_result - return result +class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): + """ + Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray - return set_function_name(wrapper, opname, cls) + Assumes that __new__/__init__ defines: + _data + _freq + and that the inheriting class has methods: + _generate_range + """ -class AttributesMixin: + # _infer_matches -> which infer_dtype strings are close enough to our own + _infer_matches: Tuple[str, ...] + _is_recognized_dtype: Callable[[DtypeObj], bool] + _recognized_scalars: Tuple[Type, ...] _data: np.ndarray + def __init__(self, data, dtype=None, freq=None, copy=False): + raise AbstractMethodError(self) + @classmethod - def _simple_new(cls, values: np.ndarray, **kwargs): + def _simple_new( + cls: Type[DatetimeLikeArrayT], + values: np.ndarray, + freq: Optional[BaseOffset] = None, + dtype=None, + ) -> DatetimeLikeArrayT: raise AbstractMethodError(cls) @property @@ -176,7 +150,9 @@ def _scalar_from_string(self, value: str) -> DTScalarOrNaT: """ raise AbstractMethodError(self) - def _unbox_scalar(self, value: DTScalarOrNaT) -> int: + def _unbox_scalar( + self, value: DTScalarOrNaT, setitem: bool = False + ) -> Union[np.int64, np.datetime64, np.timedelta64]: """ Unbox the integer value of a scalar `value`. @@ -184,6 +160,8 @@ def _unbox_scalar(self, value: DTScalarOrNaT) -> int: ---------- value : Period, Timestamp, Timedelta, or NaT Depending on subclass. + setitem : bool, default False + Whether to check compatibility with setitem strictness. Returns ------- @@ -221,272 +199,38 @@ def _check_compatible_with( """ raise AbstractMethodError(self) - -class DatelikeOps: - """ - Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. 
- """ - - @Substitution( - URL="https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior" - ) - def strftime(self, date_format): - """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format - doc <%(URL)s>`__. - - Parameters - ---------- - date_format : str - Date format string (e.g. "%%Y-%%m-%%d"). - - Returns - ------- - ndarray - NumPy ndarray of formatted strings. - - See Also - -------- - to_datetime : Convert the given argument to datetime. - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... periods=3, freq='s') - >>> rng.strftime('%%B %%d, %%Y, %%r') - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """ - result = self._format_native_types(date_format=date_format, na_rep=np.nan) - return result.astype(object) - - -class TimelikeOps: - """ - Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. - """ - - _round_doc = """ - Perform {op} operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - :ref:`frequency aliases ` for - a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times. - - .. versionadded:: 0.24.0 - - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times. - - .. versionadded:: 0.24.0 - - Returns - ------- - DatetimeIndex, TimedeltaIndex, or Series - Index of the same type for a DatetimeIndex or TimedeltaIndex, - or a Series with the same index for a Series. - - Raises - ------ - ValueError if the `freq` cannot be converted. 
- - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') - """ - - _round_example = """>>> rng.round('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.round("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - - _floor_example = """>>> rng.floor('H') - DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.floor("H") - 0 2018-01-01 11:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - - _ceil_example = """>>> rng.ceil('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 13:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.ceil("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 13:00:00 - dtype: datetime64[ns] - """ - - def _round(self, freq, mode, ambiguous, nonexistent): - # round the local times - if is_datetime64tz_dtype(self.dtype): - # operate on naive timestamps, then convert back to aware - naive = self.tz_localize(None) - result = naive._round(freq, mode, ambiguous, nonexistent) - aware = result.tz_localize( - self.tz, ambiguous=ambiguous, nonexistent=nonexistent - ) - return aware - - values = self.view("i8") - result = round_nsint64(values, mode, freq) - result = self._maybe_mask_results(result, fill_value=NaT) - return self._simple_new(result, dtype=self.dtype) - - @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous="raise", nonexistent="raise"): - return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) - - @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous="raise", nonexistent="raise"): - return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) - - @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, ambiguous="raise", nonexistent="raise"): - return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) - - def _with_freq(self, freq): - """ - Helper to get a view on the same data, with a new freq. - - Parameters - ---------- - freq : DateOffset, None, or "infer" - - Returns - ------- - Same type as self - """ - # GH#29843 - if freq is None: - # Always valid - pass - elif len(self) == 0 and isinstance(freq, BaseOffset): - # Always valid. In the TimedeltaArray case, we assume this - # is a Tick offset. - pass - else: - # As an internal method, we can ensure this assertion always holds - assert freq == "infer" - freq = to_offset(self.inferred_freq) - - arr = self.view() - arr._freq = freq - return arr - - -DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") - - -class DatetimeLikeArrayMixin( - ExtensionOpsMixin, AttributesMixin, NDArrayBackedExtensionArray -): - """ - Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray - - Assumes that __new__/__init__ defines: - _data - _freq - - and that the inheriting class has methods: - _generate_range - """ - - _is_recognized_dtype: Callable[[DtypeObj], bool] - _recognized_scalars: Tuple[Type, ...] 
- # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat - # TODO: make this a cache_readonly; need to get around _index_data - # kludge in libreduction - @property + @cache_readonly def _ndarray(self) -> np.ndarray: - # NB: A bunch of Interval tests fail if we use ._data - return self.asi8 + return self._data - def _from_backing_data(self: _T, arr: np.ndarray) -> _T: + def _from_backing_data( + self: DatetimeLikeArrayT, arr: np.ndarray + ) -> DatetimeLikeArrayT: # Note: we do not retain `freq` - return type(self)(arr, dtype=self.dtype) # type: ignore + return type(self)._simple_new(arr, dtype=self.dtype) # ------------------------------------------------------------------ - @property - def _box_func(self): + def _box_func(self, x): """ box function to get object from internal representation """ raise AbstractMethodError(self) - def _box_values(self, values): + def _box_values(self, values) -> np.ndarray: """ apply box func to passed values """ return lib.map_infer(values, self._box_func) def __iter__(self): - return (self._box_func(v) for v in self.asi8) + if self.ndim > 1: + return (self[n] for n in range(len(self))) + else: + return (self._box_func(v) for v in self.asi8) @property def asi8(self) -> np.ndarray: @@ -525,41 +269,21 @@ def __array__(self, dtype=None) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) - return self._data + return self._ndarray - def __getitem__(self, key): + def __getitem__( + self, key: Union[int, slice, np.ndarray] + ) -> Union[DatetimeLikeArrayMixin, DTScalarOrNaT]: """ This getitem defers to the underlying array, which by-definition can only handle list-likes, slices, and integer scalars """ - - if lib.is_integer(key): - # fast-path - result = self._data[key] - if self.ndim == 1: - return self._box_func(result) - return self._simple_new(result, dtype=self.dtype) - - if com.is_bool_indexer(key): - # first convert to boolean, because check_array_indexer doesn't - # allow object dtype - if is_object_dtype(key): - key = np.asarray(key, dtype=bool) - - key = check_array_indexer(self, key) - key = lib.maybe_booleans_to_slice(key.view(np.uint8)) - elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): - # see https://github.com/pandas-dev/pandas/issues/31299, need to allow - # this for now (would otherwise raise in check_array_indexer) - pass - else: - key = check_array_indexer(self, key) - - freq = self._get_getitem_freq(key) - result = self._data[key] + result = super().__getitem__(key) if lib.is_scalar(result): - return self._box_func(result) - return self._simple_new(result, dtype=self.dtype, freq=freq) + return result + + result._freq = self._get_getitem_freq(key) + return result def _get_getitem_freq(self, key): """ @@ -568,7 +292,10 @@ def _get_getitem_freq(self, key): is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq + elif self.ndim != 1: + freq = None else: + key = check_array_indexer(self, key) # maybe ndarray[bool] -> slice freq = None if isinstance(key, slice): if self.freq is not None and key.step is not None: @@ -579,6 +306,10 @@ def _get_getitem_freq(self, key): # GH#21282 indexing with Ellipsis is similar to a full slice, # should preserve `freq` attribute freq = self.freq + elif com.is_bool_indexer(key): + new_key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + if isinstance(new_key, slice): + return self._get_getitem_freq(new_key) return freq def 
__setitem__( @@ -591,27 +322,11 @@ def __setitem__( # to a period in from_sequence). For DatetimeArray, it's Timestamp... # I don't know if mypy can do that, possibly with Generics. # https://mypy.readthedocs.io/en/latest/generics.html - if is_list_like(value): - is_slice = isinstance(key, slice) - - if lib.is_scalar(key): - raise ValueError("setting an array element with a sequence.") - - if not is_slice: - key = cast(Sequence, key) - if len(key) != len(value) and not com.is_bool_indexer(key): - msg = ( - f"shape mismatch: value array of length '{len(key)}' " - "does not match indexing result of length " - f"'{len(value)}'." - ) - raise ValueError(msg) - elif not len(key): - return - - value = self._validate_setitem_value(value) - key = check_array_indexer(self, key) - self._data[key] = value + no_op = check_setitem_lengths(key, value, self) + if no_op: + return + + super().__setitem__(key, value) self._maybe_clear_freq() def _maybe_clear_freq(self): @@ -662,26 +377,23 @@ def astype(self, dtype, copy=True): def view(self, dtype=None): if dtype is None or dtype is self.dtype: - return type(self)(self._data, dtype=self.dtype) - return self._data.view(dtype=dtype) + return type(self)(self._ndarray, dtype=self.dtype) + return self._ndarray.view(dtype=dtype) # ------------------------------------------------------------------ # ExtensionArray Interface @classmethod - def _concat_same_type(cls, to_concat, axis: int = 0): - - # do not pass tz to set because tzlocal cannot be hashed - dtypes = {str(x.dtype) for x in to_concat} - if len(dtypes) != 1: - raise ValueError("to_concat must have the same dtype (tz)", dtypes) + def _concat_same_type( + cls: Type[DatetimeLikeArrayT], + to_concat: Sequence[DatetimeLikeArrayT], + axis: int = 0, + ) -> DatetimeLikeArrayT: + new_obj = super()._concat_same_type(to_concat, axis) obj = to_concat[0] dtype = obj.dtype - i8values = [x.asi8 for x in to_concat] - values = np.concatenate(i8values, axis=axis) - new_freq = None if is_period_dtype(dtype): new_freq = obj.freq @@ -695,38 +407,69 @@ def _concat_same_type(cls, to_concat, axis: int = 0): if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): new_freq = obj.freq - return cls._simple_new(values, dtype=dtype, freq=new_freq) + new_obj._freq = new_freq + return new_obj def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: - values = self.asi8.copy() - return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) + new_obj = super().copy() + new_obj._freq = self.freq + return new_obj def _values_for_factorize(self): - return self.asi8, iNaT + return self._ndarray, iNaT @classmethod - def _from_factorized(cls, values, original): + def _from_factorized( + cls: Type[DatetimeLikeArrayT], values, original + ) -> DatetimeLikeArrayT: return cls(values, dtype=original.dtype) - def _values_for_argsort(self): - return self._data + # ------------------------------------------------------------------ + # Validation Methods + # TODO: try to de-duplicate these, ensure identical behavior - @Appender(ExtensionArray.shift.__doc__) - def shift(self, periods=1, fill_value=None, axis=0): + def _validate_comparison_value(self, other): + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + raise InvalidComparison(other) - fill_value = self._validate_shift_value(fill_value) - new_values = shift(self._data, periods, axis, fill_value) + if isinstance(other, 
self._recognized_scalars) or other is NaT: + # pandas\core\arrays\datetimelike.py:432: error: Too many arguments + # for "object" [call-arg] + other = self._scalar_type(other) # type: ignore[call-arg] + try: + self._check_compatible_with(other) + except TypeError as err: + # e.g. tzawareness mismatch + raise InvalidComparison(other) from err - return type(self)._simple_new(new_values, dtype=self.dtype) + elif not is_list_like(other): + raise InvalidComparison(other) - # ------------------------------------------------------------------ - # Validation Methods - # TODO: try to de-duplicate these, ensure identical behavior + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + try: + other = self._validate_listlike(other, allow_object=True) + self._check_compatible_with(other) + except TypeError as err: + if is_object_dtype(getattr(other, "dtype", None)): + # We will have to operate element-wise + pass + else: + raise InvalidComparison(other) from err + + return other def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to an i8 representation, - raising ValueError if this is not possible. + raising TypeError if this is not possible. Parameters ---------- @@ -734,35 +477,31 @@ def _validate_fill_value(self, fill_value): Returns ------- - fill_value : np.int64 + fill_value : np.int64, np.datetime64, or np.timedelta64 Raises ------ - ValueError + TypeError """ - msg = ( - f"'fill_value' should be a {self._scalar_type}. " - f"Got '{str(fill_value)}'." - ) - try: - fill_value = self._validate_scalar(fill_value, msg) - except TypeError as err: - raise ValueError(msg) from err - return self._unbox(fill_value) + return self._validate_scalar(fill_value) def _validate_shift_value(self, fill_value): # TODO(2.0): once this deprecation is enforced, use _validate_fill_value if is_valid_nat_for_dtype(fill_value, self.dtype): fill_value = NaT elif isinstance(fill_value, self._recognized_scalars): - fill_value = self._scalar_type(fill_value) + # pandas\core\arrays\datetimelike.py:746: error: Too many arguments + # for "object" [call-arg] + fill_value = self._scalar_type(fill_value) # type: ignore[call-arg] else: # only warn if we're not going to raise if self._scalar_type is Period and lib.is_integer(fill_value): # kludge for #31971 since Period(integer) tries to cast to str - new_fill = Period._from_ordinal(fill_value, freq=self.dtype.freq) + new_fill = Period._from_ordinal(fill_value, freq=self.freq) else: - new_fill = self._scalar_type(fill_value) + # pandas\core\arrays\datetimelike.py:753: error: Too many + # arguments for "object" [call-arg] + new_fill = self._scalar_type(fill_value) # type: ignore[call-arg] # stacklevel here is chosen to be correct when called from # DataFrame.shift or Series.shift @@ -775,10 +514,15 @@ def _validate_shift_value(self, fill_value): ) fill_value = new_fill - return self._unbox(fill_value) + return self._unbox(fill_value, setitem=True) def _validate_scalar( - self, value, msg: Optional[str] = None, cast_str: bool = False + self, + value, + *, + allow_listlike: bool = False, + setitem: bool = True, + unbox: bool = True, ): """ Validate that the input value can be cast to our scalar_type. @@ -786,22 +530,25 @@ def _validate_scalar( Parameters ---------- value : object - msg : str, optional. - Message to raise in TypeError on invalid input. - If not provided, `value` is cast to a str and used - as the message. - cast_str : bool, default False - Whether to try to parse string input to scalar_type. 
+ allow_listlike: bool, default False + When raising an exception, whether the message should say + listlike inputs are allowed. + setitem : bool, default True + Whether to check compatibility with setitem strictness. + unbox : bool, default True + Whether to unbox the result before returning. Note: unbox=False + skips the setitem compatibility check. Returns ------- self._scalar_type or NaT """ - if cast_str and isinstance(value, str): + if isinstance(value, str): # NB: Careful about tzawareness try: value = self._scalar_from_string(value) except ValueError as err: + msg = self._validation_error_message(value, allow_listlike) raise TypeError(msg) from err elif is_valid_nat_for_dtype(value, self.dtype): @@ -809,18 +556,48 @@ def _validate_scalar( value = NaT elif isinstance(value, self._recognized_scalars): - value = self._scalar_type(value) # type: ignore + # error: Too many arguments for "object" [call-arg] + value = self._scalar_type(value) # type: ignore[call-arg] else: - if msg is None: - msg = str(value) + msg = self._validation_error_message(value, allow_listlike) raise TypeError(msg) - return value + if not unbox: + # NB: In general NDArrayBackedExtensionArray will unbox here; + # this option exists to prevent a performance hit in + # TimedeltaIndex.get_loc + return value + return self._unbox_scalar(value, setitem=setitem) - def _validate_listlike( - self, value, opname: str, cast_str: bool = False, allow_object: bool = False - ): + def _validation_error_message(self, value, allow_listlike: bool = False) -> str: + """ + Construct an exception message on validation error. + + Some methods allow only scalar inputs, while others allow either scalar + or listlike. + + Parameters + ---------- + allow_listlike: bool, default False + + Returns + ------- + str + """ + if allow_listlike: + msg = ( + f"value should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{type(value).__name__}' instead." + ) + else: + msg = ( + f"value should be a '{self._scalar_type.__name__}' or 'NaT'. " + f"Got '{type(value).__name__}' instead." + ) + return msg + + def _validate_listlike(self, value, allow_object: bool = False): if isinstance(value, type(self)): return value @@ -829,7 +606,7 @@ def _validate_listlike( value = array(value) value = extract_array(value, extract_numpy=True) - if cast_str and is_dtype_equal(value.dtype, "string"): + if is_dtype_equal(value.dtype, "string"): # We got a StringArray try: # TODO: Could use from_sequence_of_strings if implemented @@ -843,72 +620,45 @@ def _validate_listlike( if is_dtype_equal(value.categories.dtype, self.dtype): # TODO: do we need equal dtype or just comparable? value = value._internal_get_values() + value = extract_array(value, extract_numpy=True) if allow_object and is_object_dtype(value.dtype): pass elif not type(self)._is_recognized_dtype(value.dtype): - raise TypeError( - f"{opname} requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) + msg = self._validation_error_message(value, True) + raise TypeError(msg) return value def _validate_searchsorted_value(self, value): - msg = "searchsorted requires compatible dtype or scalar" if not is_list_like(value): - value = self._validate_scalar(value, msg, cast_str=True) + return self._validate_scalar(value, allow_listlike=True, setitem=False) else: - # TODO: cast_str? 
we accept it for scalar - value = self._validate_listlike(value, "searchsorted") + value = self._validate_listlike(value) return self._unbox(value) def _validate_setitem_value(self, value): - msg = ( - f"'value' should be a '{self._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{type(value).__name__}' instead." - ) if is_list_like(value): - value = self._validate_listlike(value, "setitem", cast_str=True) - else: - # TODO: cast_str for consistency? - value = self._validate_scalar(value, msg, cast_str=False) - - self._check_compatible_with(value, setitem=True) - return self._unbox(value) - - def _validate_insert_value(self, value): - msg = f"cannot insert {type(self).__name__} with incompatible label" - value = self._validate_scalar(value, msg, cast_str=False) - - self._check_compatible_with(value, setitem=True) - # TODO: if we dont have compat, should we raise or astype(object)? - # PeriodIndex does astype(object) - return value - - def _validate_where_value(self, other): - msg = f"Where requires matching dtype, not {type(other)}" - if not is_list_like(other): - other = self._validate_scalar(other, msg) + value = self._validate_listlike(value) else: - other = self._validate_listlike(other, "where") - self._check_compatible_with(other, setitem=True) + return self._validate_scalar(value, allow_listlike=True) - self._check_compatible_with(other, setitem=True) - return self._unbox(other) + return self._unbox(value, setitem=True) - def _unbox(self, other) -> Union[np.int64, np.ndarray]: + def _unbox( + self, other, setitem: bool = False + ) -> Union[np.int64, np.datetime64, np.timedelta64, np.ndarray]: """ Unbox either a scalar with _unbox_scalar or an instance of our own type. """ if lib.is_scalar(other): - other = self._unbox_scalar(other) + other = self._unbox_scalar(other, setitem=setitem) else: # same type as self - self._check_compatible_with(other) - other = other.view("i8") + self._check_compatible_with(other, setitem=setitem) + other = other._ndarray return other # ------------------------------------------------------------------ @@ -916,37 +666,7 @@ def _unbox(self, other) -> Union[np.int64, np.ndarray]: # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def searchsorted(self, value, side="left", sorter=None): - """ - Find indices where elements should be inserted to maintain order. - - Find the indices into a sorted array `self` such that, if the - corresponding elements in `value` were inserted before the indices, - the order of `self` would be preserved. - - Parameters - ---------- - value : array_like - Values to insert into `self`. - side : {'left', 'right'}, optional - If 'left', the index of the first suitable location found is given. - If 'right', return the last such index. If there is no suitable - index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional - Optional array of integer indices that sort `self` into ascending - order. They are typically the result of ``np.argsort``. - - Returns - ------- - indices : array of ints - Array of insertion points with the same shape as `value`. - """ - value = self._validate_searchsorted_value(value) - - # TODO: Use datetime64 semantics for sorting, xref GH#29844 - return self.asi8.searchsorted(value, side=side, sorter=sorter) - - def value_counts(self, dropna=False): + def value_counts(self, dropna: bool = False): """ Return a Series containing counts of unique values. 
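
# Editorial aside -- not part of the diff above. The _validate_* helpers
# consolidated here are what let datetime-like setitem/searchsorted accept
# strings and NaT through the public API. A minimal sketch, assuming
# pandas >= 1.2:
import pandas as pd

dti = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])
print(dti.searchsorted("2020-01-02"))  # 1 -- parsed via _scalar_from_string
ser = pd.Series(dti)
ser[0] = "2020-06-01"                  # routed through _validate_setitem_value
print(ser[0])                          # -> 2020-06-01 00:00:00
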
@@ -959,12 +679,12 @@ def value_counts(self, dropna=False): ------- Series """ - from pandas import Series, Index + from pandas import Index, Series if dropna: - values = self[~self.isna()]._data + values = self[~self.isna()]._ndarray else: - values = self._data + values = self._ndarray cls = type(self) @@ -984,31 +704,86 @@ def map(self, mapper): return Index(self).map(mapper).array + def isin(self, values) -> np.ndarray: + """ + Compute boolean array of whether each value is found in the + passed set of values. + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + ndarray[bool] + """ + if not hasattr(values, "dtype"): + values = np.asarray(values) + + if values.dtype.kind in ["f", "i", "u", "c"]: + # TODO: de-duplicate with equals, validate_comparison_value + return np.zeros(self.shape, dtype=bool) + + if not isinstance(values, type(self)): + inferrable = [ + "timedelta", + "timedelta64", + "datetime", + "datetime64", + "date", + "period", + ] + if values.dtype == object: + inferred = lib.infer_dtype(values, skipna=False) + if inferred not in inferrable: + if inferred == "string": + pass + + elif "mixed" in inferred: + return isin(self.astype(object), values) + else: + return np.zeros(self.shape, dtype=bool) + + try: + values = type(self)._from_sequence(values) + except ValueError: + return isin(self.astype(object), values) + + try: + self._check_compatible_with(values) + except (TypeError, ValueError): + # Includes tzawareness mismatch and IncompatibleFrequencyError + return np.zeros(self.shape, dtype=bool) + + return isin(self.asi8, values.asi8) + # ------------------------------------------------------------------ # Null Handling - def isna(self): + def isna(self) -> np.ndarray: return self._isnan @property # NB: override with cache_readonly in immutable subclasses - def _isnan(self): + def _isnan(self) -> np.ndarray: """ return if each value is nan """ return self.asi8 == iNaT @property # NB: override with cache_readonly in immutable subclasses - def _hasnans(self): + def _hasnans(self) -> np.ndarray: """ return if I have any nans; enables various perf speedups """ return bool(self._isnan.any()) - def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): + def _maybe_mask_results( + self, result: np.ndarray, fill_value=iNaT, convert=None + ) -> np.ndarray: """ Parameters ---------- - result : a ndarray + result : np.ndarray fill_value : object, default iNaT convert : str, dtype or None @@ -1026,56 +801,9 @@ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): result = result.astype(convert) if fill_value is None: fill_value = np.nan - result[self._isnan] = fill_value + np.putmask(result, self._isnan, fill_value) return result - def fillna(self, value=None, method=None, limit=None): - # TODO(GH-20300): remove this - # Just overriding to ensure that we avoid an astype(object). - # Either 20300 or a `_values_for_fillna` would avoid this duplication. - if isinstance(value, ABCSeries): - value = value.array - - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. 
Got ({len(value)}) " - f" expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - if method == "pad": - func = missing.pad_1d - else: - func = missing.backfill_1d - - values = self._data - if not is_period_dtype(self.dtype): - # For PeriodArray self._data is i8, which gets copied - # by `func`. Otherwise we need to make a copy manually - # to avoid modifying `self` in-place. - values = values.copy() - - new_values = func(values, limit=limit, mask=mask) - if is_datetime64tz_dtype(self.dtype): - # we need to pass int64 values to the constructor to avoid - # re-localizing incorrectly - new_values = new_values.view("i8") - new_values = type(self)(new_values, dtype=self.dtype) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - # ------------------------------------------------------------------ # Frequency Properties/Methods @@ -1129,7 +857,8 @@ def resolution(self) -> str: """ Returns day, hour, minute, second, millisecond or microsecond """ - return self._resolution_obj.attrname # type: ignore + # error: Item "None" of "Optional[Any]" has no attribute "attrname" + return self._resolution_obj.attrname # type: ignore[union-attr] @classmethod def _validate_frequency(cls, index, freq, **kwargs): @@ -1170,24 +899,62 @@ def _validate_frequency(cls, index, freq, **kwargs): f"does not conform to passed frequency {freq.freqstr}" ) from e + @classmethod + def _generate_range( + cls: Type[DatetimeLikeArrayT], start, end, periods, freq, *args, **kwargs + ) -> DatetimeLikeArrayT: + raise AbstractMethodError(cls) + # monotonicity/uniqueness properties are called via frequencies.infer_freq, # see GH#23789 @property - def _is_monotonic_increasing(self): + def _is_monotonic_increasing(self) -> bool: return algos.is_monotonic(self.asi8, timelike=True)[0] @property - def _is_monotonic_decreasing(self): + def _is_monotonic_decreasing(self) -> bool: return algos.is_monotonic(self.asi8, timelike=True)[1] @property - def _is_unique(self): + def _is_unique(self) -> bool: return len(unique1d(self.asi8)) == len(self) # ------------------------------------------------------------------ # Arithmetic Methods - _create_comparison_method = classmethod(_datetimelike_array_cmp) + + def _cmp_method(self, other, op): + if self.ndim > 1 and getattr(other, "shape", None) == self.shape: + # TODO: handle 2D-like listlikes + return op(self.ravel(), other.ravel()).reshape(self.shape) + + try: + other = self._validate_comparison_value(other) + except InvalidComparison: + return invalid_comparison(self, other, op) + + dtype = getattr(other, "dtype", None) + if is_object_dtype(dtype): + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would fail to raise when + # comparing tz-aware and tz-naive + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY( + op, np.asarray(self.astype(object)), other + ) + return result + + other_vals = self._unbox(other) + # GH#37462 comparison on i8 values is almost 2x faster than M8/m8 + result = op(self._ndarray.view("i8"), other_vals.view("i8")) + + o_mask = isna(other) + mask = self._isnan | o_mask + if mask.any(): + nat_result = op is operator.ne + np.putmask(result, mask, nat_result) + + return result # pow is invalid for all three subclasses; TimedeltaArray will override # the multiplication and division ops @@ -1222,7 +989,7 @@ def _sub_period(self, other): raise TypeError(f"cannot subtract Period from a 
{type(self).__name__}") def _add_period(self, other: Period): - # Overriden by TimedeltaArray + # Overridden by TimedeltaArray raise TypeError(f"cannot add Period to a {type(self).__name__}") def _add_offset(self, offset): @@ -1239,7 +1006,7 @@ def _add_timedeltalike_scalar(self, other): if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds new_values = np.empty(self.shape, dtype="i8") - new_values[:] = iNaT + new_values.fill(iNaT) return type(self)(new_values, dtype=self.dtype) inc = delta_to_nanoseconds(other) @@ -1253,7 +1020,7 @@ def _add_timedeltalike_scalar(self, other): # adding a scalar preserves freq new_freq = self.freq - return type(self)(new_values, dtype=self.dtype, freq=new_freq) + return type(self)._simple_new(new_values, dtype=self.dtype, freq=new_freq) def _add_timedelta_arraylike(self, other): """ @@ -1280,8 +1047,8 @@ def _add_timedelta_arraylike(self, other): self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan ) if self._hasnans or other._hasnans: - mask = (self._isnan) | (other._isnan) - new_values[mask] = iNaT + mask = self._isnan | other._isnan + np.putmask(new_values, mask, iNaT) return type(self)(new_values, dtype=self.dtype) @@ -1296,7 +1063,7 @@ def _add_nat(self): # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes - result = np.zeros(self.shape, dtype=np.int64) + result = np.empty(self.shape, dtype=np.int64) result.fill(iNaT) return type(self)(result, dtype=self.dtype, freq=None) @@ -1310,7 +1077,7 @@ def _sub_nat(self): # For datetime64 dtypes by convention we treat NaT as a datetime, so # this subtraction returns a timedelta64 dtype. # For period dtype, timedelta64 is a close-enough return dtype. - result = np.zeros(self.shape, dtype=np.int64) + result = np.empty(self.shape, dtype=np.int64) result.fill(iNaT) return result.view("timedelta64[ns]") @@ -1334,9 +1101,8 @@ def _addsub_object_array(self, other: np.ndarray, op): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1: + if len(other) == 1 and self.ndim == 1: # If both 1D then broadcasting is unambiguous - # TODO(EA2D): require self.ndim == other.ndim here return op(self, other[0]) warnings.warn( @@ -1372,11 +1138,10 @@ def _time_shift(self, periods, freq=None): if isinstance(freq, str): freq = to_offset(freq) offset = periods * freq - result = self + offset - return result + return self + offset - if periods == 0: - # immutable so OK + if periods == 0 or len(self) == 0: + # GH#14811 empty case return self.copy() if self.freq is None: @@ -1526,6 +1291,7 @@ def __rsub__(self, other): # TODO: Can we simplify/generalize these cases at all? raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}") elif is_timedelta64_dtype(self.dtype): + self = cast("TimedeltaArray", self) return (-self) + other # We get here with e.g. datetime objects @@ -1552,14 +1318,7 @@ def __isub__(self, other): # -------------------------------------------------------------- # Reductions - def _reduce(self, name, axis=0, skipna=True, **kwargs): - op = getattr(self, name, None) - if op: - return op(skipna=skipna, **kwargs) - else: - return super()._reduce(name, skipna, **kwargs) - - def min(self, axis=None, skipna=True, *args, **kwargs): + def min(self, *, axis=None, skipna=True, **kwargs): """ Return the minimum value of the Array or minimum along an axis. @@ -1570,16 +1329,25 @@ def min(self, axis=None, skipna=True, *args, **kwargs): Index.min : Return the minimum value in an Index. 
Series.min : Return the minimum value in a Series. """ - nv.validate_min(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_min((), kwargs) + nv.validate_minmax_axis(axis, self.ndim) + + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmin( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + if result is NaT: + return NaT + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) - result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) - if isna(result): - # Period._from_ordinal does not handle np.nan gracefully - return NaT - return self._box_func(result) + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def max(self, axis=None, skipna=True, *args, **kwargs): + def max(self, *, axis=None, skipna=True, **kwargs): """ Return the maximum value of the Array or maximum along an axis. @@ -1592,26 +1360,25 @@ def max(self, axis=None, skipna=True, *args, **kwargs): """ # TODO: skipna is broken with max. # See https://github.com/pandas-dev/pandas/issues/24265 - nv.validate_max(args, kwargs) - nv.validate_minmax_axis(axis) - - mask = self.isna() - if skipna: - values = self[~mask].asi8 - elif mask.any(): - return NaT - else: - values = self.asi8 + nv.validate_max((), kwargs) + nv.validate_minmax_axis(axis, self.ndim) - if not len(values): - # short-circuit for empty max / min - return NaT + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmax( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + if result is NaT: + return result + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) - result = nanops.nanmax(values, skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def mean(self, skipna=True): + def mean(self, *, skipna=True, axis: Optional[int] = 0): """ Return the mean value of the Array. @@ -1621,6 +1388,7 @@ def mean(self, skipna=True): ---------- skipna : bool, default True Whether to ignore any NaT elements. 
+ axis : int, optional, default 0 Returns ------- @@ -1644,24 +1412,268 @@ def mean(self, skipna=True): "obj.to_timestamp(how='start').mean()" ) - mask = self.isna() - if skipna: - values = self[~mask] - elif mask.any(): - return NaT + result = nanops.nanmean( + self._ndarray, axis=axis, skipna=skipna, mask=self.isna() + ) + return self._wrap_reduction_result(axis, result) + + def median(self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs): + nv.validate_median((), kwargs) + + if axis is not None and abs(axis) >= self.ndim: + raise ValueError("abs(axis) must be less than ndim") + + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmedian( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) + + +class DatelikeOps(DatetimeLikeArrayMixin): + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + @Substitution( + URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior" + ) + def strftime(self, date_format): + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format + doc <%(URL)s>`__. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%%Y-%%m-%%d"). + + Returns + ------- + ndarray + NumPy ndarray of formatted strings. + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%%B %%d, %%Y, %%r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object) + + +_round_doc = """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + .. versionadded:: 0.24.0 + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. 
+ + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0 + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. + + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """ + +_round_example = """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + +_floor_example = """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + +_ceil_example = """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + + +class TimelikeOps(DatetimeLikeArrayMixin): + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + if is_datetime64tz_dtype(self.dtype): + # operate on naive timestamps, then convert back to aware + self = cast("DatetimeArray", self) + naive = self.tz_localize(None) + result = naive._round(freq, mode, ambiguous, nonexistent) + return result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + + values = self.view("i8") + result = round_nsint64(values, mode, freq) + result = self._maybe_mask_results(result, fill_value=NaT) + return self._simple_new(result, dtype=self.dtype) + + @Appender((_round_doc + _round_example).format(op="round")) + def round(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + + # -------------------------------------------------------------- + # Frequency Methods + + def _maybe_clear_freq(self): + self._freq = None + + def _with_freq(self, freq): + """ + Helper to get a view on the same data, with a new freq. 
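+
+        Editorial note (not in the original diff): ``arr._with_freq(None)``
+        returns a *view*; the cached freq is replaced without copying the
+        underlying data.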
+ + Parameters + ---------- + freq : DateOffset, None, or "infer" + + Returns + ------- + Same type as self + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, BaseOffset): + # Always valid. In the TimedeltaArray case, we assume this + # is a Tick offset. + pass else: - values = self + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) - if not len(values): - # short-circuit for empty max / min - return NaT + arr = self.view() + arr._freq = freq + return arr - result = nanops.nanmean(values.view("i8"), skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + # -------------------------------------------------------------- + def factorize(self, na_sentinel=-1, sort: bool = False): + if self.freq is not None: + # We must be unique, so can short-circuit (and retain freq) + codes = np.arange(len(self), dtype=np.intp) + uniques = self.copy() # TODO: copy or view? + if sort and self.freq.n < 0: + codes = codes[::-1] + # TODO: overload __getitem__, a slice indexer returns same type as self + # error: Incompatible types in assignment (expression has type + # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable + # has type "TimelikeOps") [assignment] + uniques = uniques[::-1] # type: ignore[assignment] + return codes, uniques + # FIXME: shouldn't get here; we are ignoring sort + return super().factorize(na_sentinel=na_sentinel) -DatetimeLikeArrayMixin._add_comparison_ops() # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7058ed3682d59..f073fc2d70457 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1,12 +1,14 @@ from datetime import datetime, time, timedelta, tzinfo -from typing import Optional, Union +from typing import Optional, Union, cast import warnings import numpy as np from pandas._libs import lib, tslib from pandas._libs.tslibs import ( + BaseOffset, NaT, + NaTType, Resolution, Timestamp, conversion, @@ -36,6 +38,7 @@ is_float_dtype, is_object_dtype, is_period_dtype, + is_sparse, is_string_dtype, is_timedelta64_dtype, pandas_dtype, @@ -75,9 +78,7 @@ def tz_to_dtype(tz): def _field_accessor(name, field, docstring=None): def f(self): - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() + values = self._local_timestamps() if field in self._bool_ops: if field.endswith(("start", "end")): @@ -113,7 +114,7 @@ def f(self): return property(f) -class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): +class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. 
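The `factorize` fast path added above relies on a `freq`-carrying array being unique by construction: the codes are a plain `arange` and the uniques retain their frequency instead of having it re-inferred. A minimal sketch of the observable effect, assuming a pandas build that includes these changes (>= 1.2):

>>> import pandas as pd
>>> dti = pd.date_range("2020-01-01", periods=3, freq="D")
>>> codes, uniques = dti.factorize()
>>> codes
array([0, 1, 2])
>>> uniques.freq  # retained via the short-circuit, not re-inferred
<Day>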
@@ -154,6 +155,7 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps _scalar_type = Timestamp _recognized_scalars = (datetime, np.datetime64) _is_recognized_dtype = is_datetime64_any_dtype + _infer_matches = ("datetime", "datetime64", "date") # define my properties & methods for delegation _bool_ops = [ @@ -177,7 +179,9 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps "week", "weekday", "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "days_in_month", "daysinmonth", @@ -284,7 +288,9 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new(cls, values, freq=None, dtype=DT64NS_DTYPE): + def _simple_new( + cls, values, freq: Optional[BaseOffset] = None, dtype=DT64NS_DTYPE + ) -> "DatetimeArray": assert isinstance(values, np.ndarray) if values.dtype != DT64NS_DTYPE: assert values.dtype == "i8" @@ -297,7 +303,11 @@ def _simple_new(cls, values, freq=None, dtype=DT64NS_DTYPE): return result @classmethod - def _from_sequence( + def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): + return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_sequence_not_strict( cls, data, dtype=None, @@ -418,9 +428,9 @@ def _generate_range( # index is localized datetime64 array -> have to convert # start/end as well to compare if start is not None: - start = start.tz_localize(tz).asm8 + start = start.tz_localize(tz, ambiguous, nonexistent).asm8 if end is not None: - end = end.tz_localize(tz).asm8 + end = end.tz_localize(tz, ambiguous, nonexistent).asm8 else: # Create a linearly spaced date_range in local time # Nanosecond-granularity timestamps aren't always correctly @@ -436,9 +446,11 @@ def _generate_range( ) if not left_closed and len(index) and index[0] == start: - index = index[1:] + # TODO: overload DatetimeLikeArrayMixin.__getitem__ + index = cast(DatetimeArray, index[1:]) if not right_closed and len(index) and index[-1] == end: - index = index[:-1] + # TODO: overload DatetimeLikeArrayMixin.__getitem__ + index = cast(DatetimeArray, index[:-1]) dtype = tz_to_dtype(tz) return cls._simple_new(index.asi8, freq=freq, dtype=dtype) @@ -446,12 +458,13 @@ def _generate_range( # ----------------------------------------------------------------- # DatetimeLike Interface - def _unbox_scalar(self, value): + def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64: if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timestamp.") if not isna(value): - self._check_compatible_with(value) - return value.value + self._check_compatible_with(value, setitem=setitem) + return value.asm8 + return np.datetime64(value.value, "ns") def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) @@ -463,17 +476,13 @@ def _check_compatible_with(self, other, setitem: bool = False): if setitem: # Stricter check for setitem vs comparison methods if not timezones.tz_compare(self.tz, other.tz): - raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") - - def _maybe_clear_freq(self): - self._freq = None + raise ValueError(f"Timezones don't match. 
'{self.tz}' != '{other.tz}'") # ----------------------------------------------------------------- # Descriptive Properties - @property - def _box_func(self): - return lambda x: Timestamp(x, freq=self.freq, tz=self.tz) + def _box_func(self, x) -> Union[Timestamp, NaTType]: + return Timestamp(x, freq=self.freq, tz=self.tz) @property def dtype(self) -> Union[np.dtype, DatetimeTZDtype]: @@ -554,20 +563,22 @@ def __iter__(self): ------ tstamp : Timestamp """ - - # convert in chunks of 10k for efficiency - data = self.asi8 - length = len(self) - chunksize = 10000 - chunks = int(length / chunksize) + 1 - for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, length) - converted = ints_to_pydatetime( - data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" - ) - for v in converted: - yield v + if self.ndim > 1: + for i in range(len(self)): + yield self[i] + else: + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = 10000 + chunks = int(length / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pydatetime( + data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" + ) + yield from converted def astype(self, dtype, copy=True): # We handle @@ -602,9 +613,9 @@ def astype(self, dtype, copy=True): # Rendering Methods def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import _get_format_datetime64_from_values + from pandas.io.formats.format import get_format_datetime64_from_values - fmt = _get_format_datetime64_from_values(self, date_format) + fmt = get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( self.asi8.ravel(), tz=self.tz, format=fmt, na_rep=na_rep @@ -670,7 +681,7 @@ def _sub_datetime_arraylike(self, other): arr_mask = self._isnan | other._isnan new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask) if self._hasnans or other._hasnans: - new_values[arr_mask] = iNaT + np.putmask(new_values, arr_mask, iNaT) return new_values.view("timedelta64[ns]") def _add_offset(self, offset): @@ -683,7 +694,7 @@ def _add_offset(self, offset): values = self.tz_localize(None) else: values = self - result = offset.apply_index(values) + result = offset._apply_array(values) result = DatetimeArray._simple_new(result) result = result.tz_localize(self.tz) @@ -728,6 +739,8 @@ def _local_timestamps(self): This is used to calculate time-of-day information as if the timestamps were timezone-naive. """ + if self.tz is None or timezones.is_utc(self.tz): + return self.asi8 return tzconversion.tz_convert_from_utc(self.asi8, self.tz) def tz_convert(self, tz): @@ -1144,8 +1157,6 @@ def month_name(self, locale=None): """ Return the month names of the DateTimeIndex with specified locale. - .. versionadded:: 0.23.0 - Parameters ---------- locale : str, optional @@ -1166,10 +1177,7 @@ def month_name(self, locale=None): >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') """ - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - else: - values = self.asi8 + values = self._local_timestamps() result = fields.get_date_name_field(values, "month_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) @@ -1179,8 +1187,6 @@ def day_name(self, locale=None): """ Return the day names of the DateTimeIndex with specified locale. - .. 
versionadded:: 0.23.0 - Parameters ---------- locale : str, optional @@ -1201,10 +1207,7 @@ def day_name(self, locale=None): >>> idx.day_name() Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') """ - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - else: - values = self.asi8 + values = self._local_timestamps() result = fields.get_date_name_field(values, "day_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) @@ -1218,10 +1221,7 @@ def time(self): # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - if self.tz is not None and not timezones.is_utc(self.tz): - timestamps = self._local_timestamps() - else: - timestamps = self.asi8 + timestamps = self._local_timestamps() return ints_to_pydatetime(timestamps, box="time") @@ -1242,10 +1242,7 @@ def date(self): # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - if self.tz is not None and not timezones.is_utc(self.tz): - timestamps = self._local_timestamps() - else: - timestamps = self.asi8 + timestamps = self._local_timestamps() return ints_to_pydatetime(timestamps, box="date") @@ -1263,8 +1260,10 @@ def isocalendar(self): See Also -------- - Timestamp.isocalendar - datetime.date.isocalendar + Timestamp.isocalendar : Function return a 3-tuple containing ISO year, + week number, and weekday for the given Timestamp object. + datetime.date.isocalendar : Return a named tuple object with + three components: year, week and weekday. Examples -------- @@ -1284,10 +1283,7 @@ def isocalendar(self): """ from pandas import DataFrame - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - else: - values = self.asi8 + values = self._local_timestamps() sarray = fields.build_isocalendar_sarray(values) iso_calendar_df = DataFrame( sarray, columns=["year", "week", "day"], dtype="UInt32" @@ -1540,16 +1536,18 @@ def weekofyear(self): 2017-01-08 6 Freq: D, dtype: int64 """ - dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) - weekday = dayofweek + day_of_week = _field_accessor("day_of_week", "dow", _dayofweek_doc) + dayofweek = day_of_week + weekday = day_of_week - dayofyear = _field_accessor( + day_of_year = _field_accessor( "dayofyear", "doy", """ The ordinal day of the year. 
""", ) + dayofyear = day_of_year quarter = _field_accessor( "quarter", "q", @@ -1863,6 +1861,28 @@ def to_julian_date(self): / 24.0 ) + # ----------------------------------------------------------------- + # Reductions + + def std( + self, + axis=None, + dtype=None, + out=None, + ddof: int = 1, + keepdims: bool = False, + skipna: bool = True, + ): + # Because std is translation-invariant, we can get self.std + # by calculating (self - Timestamp(0)).std, and we can do it + # without creating a copy by using a view on self._ndarray + from pandas.core.arrays import TimedeltaArray + + tda = TimedeltaArray(self._ndarray.view("i8")) + return tda.std( + axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims, skipna=skipna + ) + # ------------------------------------------------------------------- # Constructor Helpers @@ -1937,7 +1957,11 @@ def sequence_to_dt64ns( data, copy = maybe_convert_dtype(data, copy) data_dtype = getattr(data, "dtype", None) - if is_object_dtype(data_dtype) or is_string_dtype(data_dtype): + if ( + is_object_dtype(data_dtype) + or is_string_dtype(data_dtype) + or is_sparse(data_dtype) + ): # TODO: We do not have tests specific to string-dtypes, # also complex or categorical or other extension copy = False @@ -1949,7 +1973,13 @@ def sequence_to_dt64ns( data, inferred_tz = objects_to_datetime64ns( data, dayfirst=dayfirst, yearfirst=yearfirst ) - tz = _maybe_infer_tz(tz, inferred_tz) + if tz and inferred_tz: + # two timezones: convert to intended from base UTC repr + data = tzconversion.tz_convert_from_utc(data.view("i8"), tz) + data = data.view(DT64NS_DTYPE) + elif inferred_tz: + tz = inferred_tz + data_dtype = data.dtype # `data` may have originally been a Categorical[datetime64[ns, tz]], @@ -2019,6 +2049,7 @@ def objects_to_datetime64ns( utc : bool, default False Whether to convert timezone-aware timestamps to UTC. errors : {'raise', 'ignore', 'coerce'} + require_iso8601 : bool, default False allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py new file mode 100644 index 0000000000000..1077538f6a21d --- /dev/null +++ b/pandas/core/arrays/floating.py @@ -0,0 +1,515 @@ +import numbers +from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +import warnings + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas._typing import ArrayLike, DtypeObj +from pandas.compat.numpy import function as nv +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_datetime64_dtype, + is_float_dtype, + is_integer_dtype, + is_list_like, + is_object_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.core import ops +from pandas.core.ops import invalid_comparison +from pandas.core.tools.numeric import to_numeric + +from .masked import BaseMaskedDtype +from .numeric import NumericArray + +if TYPE_CHECKING: + import pyarrow + + +class FloatingDtype(BaseMaskedDtype): + """ + An ExtensionDtype to hold a single size of floating dtype. + + These specific implementations are subclasses of the non-public + FloatingDtype. For example we have Float32Dtype to represent float32. + + The attributes name & type are set when these subclasses are created. 
+ """ + + def __repr__(self) -> str: + return f"{self.name}Dtype()" + + @property + def _is_numeric(self) -> bool: + return True + + @classmethod + def construct_array_type(cls) -> Type["FloatingArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatingArray + + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + # for now only handle other floating types + if not all(isinstance(t, FloatingDtype) for t in dtypes): + return None + np_dtype = np.find_common_type( + [t.numpy_dtype for t in dtypes], [] # type: ignore[union-attr] + ) + if np.issubdtype(np_dtype, np.floating): + return FLOAT_STR_TO_DTYPE[str(np_dtype)] + return None + + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "FloatingArray": + """ + Construct FloatingArray from pyarrow Array/ChunkedArray. + """ + import pyarrow + + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + pyarrow_type = pyarrow.from_numpy_dtype(self.type) + if not array.type.equals(pyarrow_type): + array = array.cast(pyarrow_type) + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) + float_arr = FloatingArray(data.copy(), ~mask, copy=False) + results.append(float_arr) + + return FloatingArray._concat_same_type(results) + + +def coerce_to_array( + values, dtype=None, mask=None, copy: bool = False +) -> Tuple[np.ndarray, np.ndarray]: + """ + Coerce the input values array to numpy arrays with a mask. + + Parameters + ---------- + values : 1D list-like + dtype : float dtype + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + # if values is floating numpy array, preserve its dtype + if dtype is None and hasattr(values, "dtype"): + if is_float_dtype(values.dtype): + dtype = values.dtype + + if dtype is not None: + if isinstance(dtype, str) and dtype.startswith("Float"): + # Avoid DeprecationWarning from NumPy about np.dtype("Float64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not issubclass(type(dtype), FloatingDtype): + try: + dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + + if isinstance(values, FloatingArray): + values, mask = values._data, values._mask + if dtype is not None: + values = values.astype(dtype.numpy_dtype, copy=False) + + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + values = np.array(values, copy=copy) + if is_object_dtype(values): + inferred_type = lib.infer_dtype(values, skipna=True) + if inferred_type == "empty": + values = np.empty(len(values)) + values.fill(np.nan) + elif inferred_type not in [ + "floating", + "integer", + "mixed-integer", + "integer-na", + "mixed-integer-float", + ]: + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + + elif is_bool_dtype(values) and is_float_dtype(dtype): + values = np.array(values, dtype=float, copy=copy) + + elif not (is_integer_dtype(values) or is_float_dtype(values)): + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + + if mask is None: + mask = isna(values) + else: + assert len(mask) == len(values) + + if not values.ndim == 1: + raise TypeError("values must be a 1D list-like") + 
+    if not mask.ndim == 1:
+        raise TypeError("mask must be a 1D list-like")
+
+    # infer dtype if needed
+    if dtype is None:
+        dtype = np.dtype("float64")
+    else:
+        dtype = dtype.type
+
+    # if we are float, let's make sure that we can
+    # safely cast
+
+    # we copy as we need to coerce here
+    # TODO should this be a safe cast?
+    if mask.any():
+        values = values.copy()
+        values[mask] = np.nan
+        values = values.astype(dtype, copy=False)  # , casting="safe")
+    else:
+        values = values.astype(dtype, copy=False)  # , casting="safe")
+
+    return values, mask
+
+
+class FloatingArray(NumericArray):
+    """
+    Array of floating (optional missing) values.
+
+    .. versionadded:: 1.2.0
+
+    .. warning::
+
+       FloatingArray is currently experimental, and its API or internal
+       implementation may change without warning. Especially the behaviour
+       regarding NaN (distinct from NA missing values) is subject to change.
+
+    We represent a FloatingArray with 2 numpy arrays:
+
+    - data: contains a numpy float array of the appropriate dtype
+    - mask: a boolean array holding a mask on the data, True is missing
+
+    To construct a FloatingArray from generic array-like input, use
+    :func:`pandas.array` with one of the float dtypes (see examples).
+
+    See :ref:`integer_na` for more.
+
+    Parameters
+    ----------
+    values : numpy.ndarray
+        A 1-d float-dtype array.
+    mask : numpy.ndarray
+        A 1-d boolean-dtype array indicating missing values.
+    copy : bool, default False
+        Whether to copy the `values` and `mask`.
+
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
+    Returns
+    -------
+    FloatingArray
+
+    Examples
+    --------
+    Create a FloatingArray with :func:`pandas.array`:
+
+    >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
+    <FloatingArray>
+    [0.1, <NA>, 0.3]
+    Length: 3, dtype: Float32
+
+    String aliases for the dtypes are also available. They are capitalized.
+
+    >>> pd.array([0.1, None, 0.3], dtype="Float32")
+    <FloatingArray>
+    [0.1, <NA>, 0.3]
+    Length: 3, dtype: Float32
+    """
+
+    # The value used to fill '_data' to avoid upcasting
+    _internal_fill_value = 0.0
+
+    @cache_readonly
+    def dtype(self) -> FloatingDtype:
+        return FLOAT_STR_TO_DTYPE[str(self._data.dtype)]
+
+    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
+        if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"):
+            raise TypeError(
+                "values should be floating numpy array. Use "
+                "the 'pd.array' function instead"
+            )
+        super().__init__(values, mask, copy=copy)
+
+    @classmethod
+    def _from_sequence(
+        cls, scalars, *, dtype=None, copy: bool = False
+    ) -> "FloatingArray":
+        values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
+        return FloatingArray(values, mask)
+
+    @classmethod
+    def _from_sequence_of_strings(
+        cls, strings, *, dtype=None, copy: bool = False
+    ) -> "FloatingArray":
+        scalars = to_numeric(strings, errors="raise")
+        return cls._from_sequence(scalars, dtype=dtype, copy=copy)
+
+    _HANDLED_TYPES = (np.ndarray, numbers.Number)
+
+    def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
+        # For FloatingArray inputs, we apply the ufunc to ._data
+        # and mask the result.
+        if method == "reduce":
+            # Not clear how to handle missing values in reductions. Raise.
+            raise NotImplementedError("The 'reduce' method is not supported.")
+        out = kwargs.get("out", ())
+
+        for x in inputs + out:
+            if not isinstance(x, self._HANDLED_TYPES + (FloatingArray,)):
+                return NotImplemented
+
+        # for binary ops, use our custom dunder methods
+        result = ops.maybe_dispatch_ufunc_to_dunder_op(
+            self, ufunc, method, *inputs, **kwargs
+        )
+        if result is not NotImplemented:
+            return result
+
+        mask = np.zeros(len(self), dtype=bool)
+        inputs2 = []
+        for x in inputs:
+            if isinstance(x, FloatingArray):
+                mask |= x._mask
+                inputs2.append(x._data)
+            else:
+                inputs2.append(x)
+
+        def reconstruct(x):
+            # we don't worry about scalar `x` here, since we
+            # raise for reduce up above.
+
+            # TODO
+            if is_float_dtype(x.dtype):
+                m = mask.copy()
+                return FloatingArray(x, m)
+            else:
+                x[mask] = np.nan
+            return x
+
+        result = getattr(ufunc, method)(*inputs2, **kwargs)
+        if isinstance(result, tuple):
+            return tuple(reconstruct(x) for x in result)
+        else:
+            return reconstruct(result)
+
+    def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
+        return coerce_to_array(value, dtype=self.dtype)
+
+    def astype(self, dtype, copy: bool = True) -> ArrayLike:
+        """
+        Cast to a NumPy array or ExtensionArray with 'dtype'.
+
+        Parameters
+        ----------
+        dtype : str or dtype
+            Typecode or data-type to which the array is cast.
+        copy : bool, default True
+            Whether to copy the data, even if not necessary. If False,
+            a copy is made only if the old dtype does not match the
+            new dtype.
+
+        Returns
+        -------
+        ndarray or ExtensionArray
+            NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with
+            'dtype' for its dtype.
+
+        Raises
+        ------
+        TypeError
+            if incompatible type with a FloatingDtype, equivalent of same_kind
+            casting
+        """
+        from pandas.core.arrays.string_ import StringArray, StringDtype
+
+        dtype = pandas_dtype(dtype)
+
+        # if the dtype is exactly the same, we can fastpath
+        if self.dtype == dtype:
+            # return the same object for copy=False
+            return self.copy() if copy else self
+        # if we are astyping to another nullable masked dtype, we can fastpath
+        if isinstance(dtype, BaseMaskedDtype):
+            # TODO deal with NaNs
+            data = self._data.astype(dtype.numpy_dtype, copy=copy)
+            # mask is copied depending on whether the data was copied, and
+            # not directly depending on the `copy` keyword
+            mask = self._mask if data is self._data else self._mask.copy()
+            return dtype.construct_array_type()(data, mask, copy=False)
+        elif isinstance(dtype, StringDtype):
+            return StringArray._from_sequence(self, copy=False)
+
+        # coerce
+        if is_float_dtype(dtype):
+            # In astype, we consider dtype=float to also mean na_value=np.nan
+            kwargs = {"na_value": np.nan}
+        elif is_datetime64_dtype(dtype):
+            kwargs = {"na_value": np.datetime64("NaT")}
+        else:
+            kwargs = {}
+
+        data = self.to_numpy(dtype=dtype, **kwargs)
+        return astype_nansafe(data, dtype, copy=False)
+
+    def _values_for_argsort(self) -> np.ndarray:
+        return self._data
+
+    def _cmp_method(self, other, op):
+        from pandas.arrays import BooleanArray, IntegerArray
+
+        mask = None
+
+        if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)):
+            other, mask = other._data, other._mask
+
+        elif is_list_like(other):
+            other = np.asarray(other)
+            if other.ndim > 1:
+                raise NotImplementedError("can only perform ops with 1-d structures")
+
+        if other is libmissing.NA:
+            # numpy does not handle pd.NA well as "other" scalar (it returns
+            # a scalar False instead of an array)
+            # This may be fixed by NA.__array_ufunc__.
Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op.__name__}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask + + return BooleanArray(result, mask) + + def sum(self, *, skipna=True, min_count=0, **kwargs): + nv.validate_sum((), kwargs) + return super()._reduce("sum", skipna=skipna, min_count=min_count) + + def prod(self, *, skipna=True, min_count=0, **kwargs): + nv.validate_prod((), kwargs) + return super()._reduce("prod", skipna=skipna, min_count=min_count) + + def min(self, *, skipna=True, **kwargs): + nv.validate_min((), kwargs) + return super()._reduce("min", skipna=skipna) + + def max(self, *, skipna=True, **kwargs): + nv.validate_max((), kwargs) + return super()._reduce("max", skipna=skipna) + + def _maybe_mask_result(self, result, mask, other, op_name: str): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # TODO are there cases we don't end up with float? + # if we have a float operand we are by-definition + # a float result + # or our op is a divide + # if (is_float_dtype(other) or is_float(other)) or ( + # op_name in ["rtruediv", "truediv"] + # ): + # result[mask] = np.nan + # return result + + return type(self)(result, mask, copy=False) + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} data. + +This dtype uses ``pd.NA`` as missing value indicator. 
+ +Attributes +---------- +None + +Methods +------- +None +""" + +# create the Dtype + + +@register_extension_dtype +class Float32Dtype(FloatingDtype): + type = np.float32 + name = "Float32" + __doc__ = _dtype_docstring.format(dtype="float32") + + +@register_extension_dtype +class Float64Dtype(FloatingDtype): + type = np.float64 + name = "Float64" + __doc__ = _dtype_docstring.format(dtype="float64") + + +FLOAT_STR_TO_DTYPE = { + "float32": Float32Dtype(), + "float64": Float64Dtype(), +} diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b0958af41158c..fa427e94fe08f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -4,9 +4,8 @@ import numpy as np -from pandas._libs import lib, missing as libmissing +from pandas._libs import iNaT, lib, missing as libmissing from pandas._typing import ArrayLike, DtypeObj -from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly @@ -16,7 +15,6 @@ is_datetime64_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_list_like, is_object_dtype, @@ -25,15 +23,14 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.array_algos import masked_reductions from pandas.core.ops import invalid_comparison -from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric from .masked import BaseMaskedArray, BaseMaskedDtype +from .numeric import NumericArray if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow class _IntegerDtype(BaseMaskedDtype): @@ -46,10 +43,6 @@ class _IntegerDtype(BaseMaskedDtype): The attributes name & type are set when these subclasses are created. """ - name: str - base = None - type: Type - def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" return f"{sign}Int{8 * self.itemsize}Dtype()" @@ -66,20 +59,6 @@ def is_unsigned_integer(self) -> bool: def _is_numeric(self) -> bool: return True - @cache_readonly - def numpy_dtype(self) -> np.dtype: - """ Return an instance of our numpy dtype """ - return np.dtype(self.type) - - @cache_readonly - def kind(self) -> str: - return self.numpy_dtype.kind - - @cache_readonly - def itemsize(self) -> int: - """ Return the number of bytes in this dtype """ - return self.numpy_dtype.itemsize - @classmethod def construct_array_type(cls) -> Type["IntegerArray"]: """ @@ -106,7 +85,11 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): - return _dtypes[str(np_dtype)] + return INT_STR_TO_DTYPE[str(np_dtype)] + elif np.issubdtype(np_dtype, np.floating): + from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE + + return FLOAT_STR_TO_DTYPE[str(np_dtype)] return None def __from_arrow__( @@ -115,7 +98,8 @@ def __from_arrow__( """ Construct IntegerArray from pyarrow Array/ChunkedArray. """ - import pyarrow # noqa: F811 + import pyarrow + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask pyarrow_type = pyarrow.from_numpy_dtype(self.type) @@ -137,7 +121,7 @@ def __from_arrow__( return IntegerArray._concat_same_type(results) -def integer_array(values, dtype=None, copy: bool = False,) -> "IntegerArray": +def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray": """ Infer and return an integer array of the values. 
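The `_get_common_dtype` change above means that mixing nullable integer data with the new nullable floats now resolves to a FloatingDtype instead of falling back to object. A rough sketch of the intended result, assuming pandas >= 1.2 where the "Float64" alias from this diff is available:

>>> import pandas as pd
>>> ints = pd.Series([1, None], dtype="Int64")
>>> floats = pd.Series([0.5], dtype="Float64")
>>> pd.concat([ints, floats]).dtype
Float64Dtype()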
@@ -181,7 +165,7 @@ def safe_cast(values, dtype, copy: bool): def coerce_to_array( - values, dtype, mask=None, copy: bool = False, + values, dtype, mask=None, copy: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask @@ -198,7 +182,7 @@ def coerce_to_array( ------- tuple of (values, mask) """ - # if values is integer numpy array, preserve it's dtype + # if values is integer numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype @@ -213,7 +197,7 @@ def coerce_to_array( if not issubclass(type(dtype), _IntegerDtype): try: - dtype = _dtypes[str(np.dtype(dtype))] + dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err @@ -278,7 +262,7 @@ def coerce_to_array( return values, mask -class IntegerArray(BaseMaskedArray): +class IntegerArray(NumericArray): """ Array of integer (optional missing) values. @@ -353,7 +337,7 @@ class IntegerArray(BaseMaskedArray): @cache_readonly def dtype(self) -> _IntegerDtype: - return _dtypes[str(self._data.dtype)] + return INT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): @@ -363,16 +347,27 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): ) super().__init__(values, mask, copy=copy) + def __neg__(self): + return type(self)(-self._data, self._mask) + + def __pos__(self): + return self + + def __abs__(self): + return type(self)(np.abs(self._data), self._mask) + @classmethod - def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "IntegerArray": + def _from_sequence( + cls, scalars, *, dtype=None, copy: bool = False + ) -> "IntegerArray": return integer_array(scalars, dtype=dtype, copy=copy) @classmethod def _from_sequence_of_strings( - cls, strings, dtype=None, copy: bool = False + cls, strings, *, dtype=None, copy: bool = False ) -> "IntegerArray": scalars = to_numeric(strings, errors="raise") - return cls._from_sequence(scalars, dtype, copy) + return cls._from_sequence(scalars, dtype=dtype, copy=copy) _HANDLED_TYPES = (np.ndarray, numbers.Number) @@ -417,7 +412,7 @@ def reconstruct(x): result = getattr(ufunc, method)(*inputs2, **kwargs) if isinstance(result, tuple): - tuple(reconstruct(x) for x in result) + return tuple(reconstruct(x) for x in result) else: return reconstruct(result) @@ -490,74 +485,73 @@ def _values_for_argsort(self) -> np.ndarray: See Also -------- - ExtensionArray.argsort + ExtensionArray.argsort : Return the indices that would sort this array. 
""" data = self._data.copy() if self._mask.any(): data[self._mask] = data.min() - 1 return data - @classmethod - def _create_comparison_method(cls, op): - op_name = op.__name__ - - @unpack_zerodim_and_defer(op.__name__) - def cmp_method(self, other): - from pandas.arrays import BooleanArray - - mask = None - - if isinstance(other, (BooleanArray, IntegerArray)): - other, mask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError( - "can only perform ops with 1-d structures" - ) - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - if other is libmissing.NA: - # numpy does not handle pd.NA well as "other" scalar (it returns - # a scalar False instead of an array) - # This may be fixed by NA.__array_ufunc__. Revisit this check - # once that's implemented. - result = np.zeros(self._data.shape, dtype="bool") - mask = np.ones(self._data.shape, dtype="bool") - else: - with warnings.catch_warnings(): - # numpy may show a FutureWarning: - # elementwise comparison failed; returning scalar instead, - # but in the future will perform elementwise comparison - # before returning NotImplemented. We fall back to the correct - # behavior today, so that should be fine to ignore. - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op_name}__") - result = method(other) - - if result is NotImplemented: - result = invalid_comparison(self._data, other, op) - - # nans propagate - if mask is None: - mask = self._mask.copy() - else: - mask = self._mask | mask + def _cmp_method(self, other, op): + from pandas.core.arrays import BooleanArray + + mask = None + + if isinstance(other, BaseMaskedArray): + other, mask = other._data, other._mask - return BooleanArray(result, mask) + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. 
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op.__name__}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask - name = f"__{op.__name__}__" - return set_function_name(cmp_method, name, cls) + return BooleanArray(result, mask) - def sum(self, skipna=True, min_count=0, **kwargs): + def sum(self, *, skipna=True, min_count=0, **kwargs): nv.validate_sum((), kwargs) - result = masked_reductions.sum( - values=self._data, mask=self._mask, skipna=skipna, min_count=min_count - ) - return result + return super()._reduce("sum", skipna=skipna, min_count=min_count) + + def prod(self, *, skipna=True, min_count=0, **kwargs): + nv.validate_prod((), kwargs) + return super()._reduce("prod", skipna=skipna, min_count=min_count) + + def min(self, *, skipna=True, **kwargs): + nv.validate_min((), kwargs) + return super()._reduce("min", skipna=skipna) + + def max(self, *, skipna=True, **kwargs): + nv.validate_max((), kwargs) + return super()._reduce("max", skipna=skipna) def _maybe_mask_result(self, result, mask, other, op_name: str): """ @@ -574,88 +568,17 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - result[mask] = np.nan - return result - - return type(self)(result, mask, copy=False) - - @classmethod - def _create_arithmetic_method(cls, op): - op_name = op.__name__ + from pandas.core.arrays import FloatingArray - @unpack_zerodim_and_defer(op.__name__) - def integer_arithmetic_method(self, other): - - omask = None - - if getattr(other, "ndim", 0) > 1: - raise NotImplementedError("can only perform ops with 1-d structures") + return FloatingArray(result, mask, copy=False) - if isinstance(other, IntegerArray): - other, omask = other._data, other._mask + if result.dtype == "timedelta64[ns]": + from pandas.core.arrays import TimedeltaArray - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError( - "can only perform ops with 1-d structures" - ) - if len(self) != len(other): - raise ValueError("Lengths must match") - if not (is_float_dtype(other) or is_integer_dtype(other)): - raise TypeError("can only perform ops with numeric values") + result[mask] = iNaT + return TimedeltaArray._simple_new(result) - else: - if not (is_float(other) or is_integer(other) or other is libmissing.NA): - raise TypeError("can only perform ops with numeric values") - - if omask is None: - mask = self._mask.copy() - if other is libmissing.NA: - mask |= True - else: - mask = self._mask | omask - - if op_name == "pow": - # 1 ** x is 1. - mask = np.where((self._data == 1) & ~self._mask, False, mask) - # x ** 0 is 1. - if omask is not None: - mask = np.where((other == 0) & ~omask, False, mask) - elif other is not libmissing.NA: - mask = np.where(other == 0, False, mask) - - elif op_name == "rpow": - # 1 ** x is 1. - if omask is not None: - mask = np.where((other == 1) & ~omask, False, mask) - elif other is not libmissing.NA: - mask = np.where(other == 1, False, mask) - # x ** 0 is 1. 
- mask = np.where((self._data == 0) & ~self._mask, False, mask) - - if other is libmissing.NA: - result = np.ones_like(self._data) - else: - with np.errstate(all="ignore"): - result = op(self._data, other) - - # divmod returns a tuple - if op_name == "divmod": - div, mod = result - return ( - self._maybe_mask_result(div, mask, other, "floordiv"), - self._maybe_mask_result(mod, mask, other, "mod"), - ) - - return self._maybe_mask_result(result, mask, other, op_name) - - name = f"__{op.__name__}__" - return set_function_name(integer_arithmetic_method, name, cls) - - -IntegerArray._add_arithmetic_ops() -IntegerArray._add_comparison_ops() + return type(self)(result, mask, copy=False) _dtype_docstring = """ @@ -734,7 +657,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -_dtypes: Dict[str, _IntegerDtype] = { +INT_STR_TO_DTYPE: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c861d25afd13f..53a98fc43becc 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1,11 +1,19 @@ +import operator from operator import le, lt import textwrap +from typing import Sequence, Type, TypeVar import numpy as np from pandas._config import get_option -from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds +from pandas._libs.interval import ( + VALID_CLOSED, + Interval, + IntervalMixin, + intervals_to_interval_bounds, +) +from pandas._libs.missing import NA from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -15,7 +23,6 @@ is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, - is_interval, is_interval_dtype, is_list_like, is_object_dtype, @@ -27,27 +34,34 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( ABCDatetimeIndex, - ABCIndexClass, ABCIntervalIndex, ABCPeriodIndex, ABCSeries, ) -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core.algorithms import take, value_counts from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com -from pandas.core.construction import array +from pandas.core.construction import ( + array, + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index +from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer + +IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") -_VALID_CLOSED = {"left", "right", "both", "neither"} _interval_shared_docs = {} -_shared_docs_kwargs = dict( - klass="IntervalArray", qualname="arrays.IntervalArray", name="" -) +_shared_docs_kwargs = { + "klass": "IntervalArray", + "qualname": "arrays.IntervalArray", + "name": "", +} _interval_shared_docs[ @@ -67,8 +81,6 @@ neither. dtype : dtype or None, default None If None, dtype will be inferred. - - .. versionadded:: 0.23.0 copy : bool, default False Copy the input data. 
%(name)s\ @@ -117,14 +129,14 @@ @Appender( _interval_shared_docs["class"] - % dict( - klass="IntervalArray", - summary="Pandas array for interval data that are closed on the same side.", - versionadded="0.24.0", - name="", - extra_attributes="", - extra_methods="", - examples=textwrap.dedent( + % { + "klass": "IntervalArray", + "summary": "Pandas array for interval data that are closed on the same side.", + "versionadded": "0.24.0", + "name": "", + "extra_attributes": "", + "extra_methods": "", + "examples": textwrap.dedent( """\ Examples -------- @@ -141,13 +153,16 @@ :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. """ ), - ) + } ) class IntervalArray(IntervalMixin, ExtensionArray): ndim = 1 can_hold_na = True _na_value = _fill_value = np.nan + # --------------------------------------------------------------------- + # Constructors + def __new__( cls, data, @@ -157,12 +172,14 @@ def __new__( verify_integrity: bool = True, ): - if isinstance(data, ABCSeries) and is_interval_dtype(data.dtype): - data = data._values + if isinstance(data, (ABCSeries, ABCIntervalIndex)) and is_interval_dtype( + data.dtype + ): + data = data._values # TODO: extract_array? - if isinstance(data, (cls, ABCIntervalIndex)): - left = data.left - right = data.right + if isinstance(data, cls): + left = data._left + right = data._right closed = closed or data.closed else: @@ -239,6 +256,18 @@ def _simple_new( ) raise ValueError(msg) + # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray + left = ensure_wrapped_if_datetimelike(left) + left = extract_array(left, extract_numpy=True) + right = ensure_wrapped_if_datetimelike(right) + right = extract_array(right, extract_numpy=True) + + lbase = getattr(left, "_ndarray", left).base + rbase = getattr(right, "_ndarray", right).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() + result._left = left result._right = right result._closed = closed @@ -247,7 +276,7 @@ def _simple_new( return result @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars, dtype=dtype, copy=copy) @classmethod @@ -261,42 +290,40 @@ def _from_factorized(cls, values, original): _interval_shared_docs["from_breaks"] = textwrap.dedent( """ - Construct an %(klass)s from an array of splits. + Construct an %(klass)s from an array of splits. - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - copy : bool, default False - Copy the data. - dtype : dtype or None, default None - If None, dtype will be inferred. - - .. versionadded:: 0.23.0 + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. + dtype : dtype or None, default None + If None, dtype will be inferred. - Returns - ------- - %(klass)s + Returns + ------- + %(klass)s - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct from a left and right array. - %(klass)s.from_tuples : Construct from a sequence of tuples. 
+ See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct from a left and right array. + %(klass)s.from_tuples : Construct from a sequence of tuples. - %(examples)s\ - """ + %(examples)s\ + """ ) @classmethod @Appender( _interval_shared_docs["from_breaks"] - % dict( - klass="IntervalArray", - examples=textwrap.dedent( + % { + "klass": "IntervalArray", + "examples": textwrap.dedent( """\ Examples -------- @@ -306,7 +333,7 @@ def _from_factorized(cls, values, original): Length: 3, closed: right, dtype: interval[int64] """ ), - ) + } ) def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): breaks = maybe_convert_platform_interval(breaks) @@ -331,8 +358,6 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): dtype : dtype, optional If None, dtype will be inferred. - .. versionadded:: 0.23.0 - Returns ------- %(klass)s @@ -367,9 +392,9 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): @classmethod @Appender( _interval_shared_docs["from_arrays"] - % dict( - klass="IntervalArray", - examples=textwrap.dedent( + % { + "klass": "IntervalArray", + "examples": textwrap.dedent( """\ >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) @@ -377,7 +402,7 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): Length: 3, closed: right, dtype: interval[int64] """ ), - ) + } ) def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) @@ -389,44 +414,42 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): _interval_shared_docs["from_tuples"] = textwrap.dedent( """ - Construct an %(klass)s from an array-like of tuples. + Construct an %(klass)s from an array-like of tuples. - Parameters - ---------- - data : array-like (1-dimensional) - Array of tuples. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - copy : bool, default False - By-default copy the data, this is compat only and ignored. - dtype : dtype or None, default None - If None, dtype will be inferred. - - .. versionadded:: 0.23.0 + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + By-default copy the data, this is compat only and ignored. + dtype : dtype or None, default None + If None, dtype will be inferred. - Returns - ------- - %(klass)s + Returns + ------- + %(klass)s - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array. - %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits. + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. 
- %(examples)s\ - """ + %(examples)s\ + """ ) @classmethod @Appender( _interval_shared_docs["from_tuples"] - % dict( - klass="IntervalArray", - examples=textwrap.dedent( + % { + "klass": "IntervalArray", + "examples": textwrap.dedent( """\ Examples -------- @@ -436,7 +459,7 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): Length: 2, closed: right, dtype: interval[int64] """ ), - ) + } ) def from_tuples(cls, data, closed="right", copy=False, dtype=None): if len(data): @@ -475,95 +498,85 @@ def _validate(self): * left and right have the same missing values * left is always below right """ - if self.closed not in _VALID_CLOSED: + if self.closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {self.closed}" raise ValueError(msg) - if len(self.left) != len(self.right): + if len(self._left) != len(self._right): msg = "left and right must have the same length" raise ValueError(msg) - left_mask = notna(self.left) - right_mask = notna(self.right) + left_mask = notna(self._left) + right_mask = notna(self._right) if not (left_mask == right_mask).all(): msg = ( "missing values must be missing in the same " "location both left and right sides" ) raise ValueError(msg) - if not (self.left[left_mask] <= self.right[left_mask]).all(): + if not (self._left[left_mask] <= self._right[left_mask]).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) - # --------- - # Interface - # --------- + def _shallow_copy(self, left, right): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : Index + Values to be used for the left-side of the intervals. + right : Index + Values to be used for the right-side of the intervals. + """ + return self._simple_new(left, right, closed=self.closed, verify_integrity=False) + + # --------------------------------------------------------------------- + # Descriptive + + @property + def dtype(self): + return IntervalDtype(self.left.dtype) + + @property + def nbytes(self) -> int: + return self.left.nbytes + self.right.nbytes + + @property + def size(self) -> int: + # Avoid materializing self.values + return self.left.size + + # --------------------------------------------------------------------- + # EA Interface + def __iter__(self): return iter(np.asarray(self)) def __len__(self) -> int: - return len(self.left) + return len(self._left) - def __getitem__(self, value): - value = check_array_indexer(self, value) - left = self.left[value] - right = self.right[value] + def __getitem__(self, key): + key = check_array_indexer(self, key) + left = self._left[key] + right = self._right[key] - # scalar - if not isinstance(left, ABCIndexClass): + if not isinstance(left, (np.ndarray, ExtensionArray)): + # scalar if is_scalar(left) and isna(left): return self._fill_value - if np.ndim(left) > 1: - # GH#30588 multi-dimensional indexer disallowed - raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) - + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return self._shallow_copy(left, right) def __setitem__(self, key, value): - # na value: need special casing to set directly on numpy arrays - needs_float_conversion = False - if is_scalar(value) and isna(value): - if is_integer_dtype(self.dtype.subtype): - # can't set NaN on a numpy integer array - needs_float_conversion = True - elif is_datetime64_any_dtype(self.dtype.subtype): - # need proper NaT to set 
directly on the numpy array
-                value = np.datetime64("NaT")
-            elif is_timedelta64_dtype(self.dtype.subtype):
-                # need proper NaT to set directly on the numpy array
-                value = np.timedelta64("NaT")
-            value_left, value_right = value, value
-
-        # scalar interval
-        elif is_interval_dtype(value) or isinstance(value, Interval):
-            self._check_closed_matches(value, name="value")
-            value_left, value_right = value.left, value.right
-
-        else:
-            # list-like of intervals
-            try:
-                array = IntervalArray(value)
-                value_left, value_right = array.left, array.right
-            except TypeError as err:
-                # wrong type: not interval or NA
-                msg = f"'value' should be an interval type, got {type(value)} instead."
-                raise TypeError(msg) from err
-
-        if needs_float_conversion:
-            raise ValueError("Cannot set float NaN to integer-backed IntervalArray")
-
+        value_left, value_right = self._validate_setitem_value(value)
         key = check_array_indexer(self, key)

-        # Need to ensure that left and right are updated atomically, so we're
-        # forced to copy, update the copy, and swap in the new values.
-        left = self.left.copy(deep=True)
-        left._values[key] = value_left
-        self._left = left
-
-        right = self.right.copy(deep=True)
-        right._values[key] = value_right
-        self._right = right
+        self._left[key] = value_left
+        self._right[key] = value_right

-    def __eq__(self, other):
+    def _cmp_method(self, other, op):
         # ensure pandas array for list-like and eliminate non-interval scalars
         if is_list_like(other):
             if len(self) != len(other):
@@ -571,7 +584,7 @@ def __eq__(self, other):
             other = array(other)
         elif not isinstance(other, Interval):
             # non-interval scalar -> no matches
-            return np.zeros(len(self), dtype=bool)
+            return invalid_comparison(self, other, op)

         # determine the dtype of the elements we want to compare
         if isinstance(other, Interval):
@@ -585,33 +598,97 @@ def __eq__(self, other):
         # extract intervals if we have interval categories with matching closed
         if is_interval_dtype(other_dtype):
             if self.closed != other.categories.closed:
-                return np.zeros(len(self), dtype=bool)
-            other = other.categories.take(other.codes)
+                return invalid_comparison(self, other, op)
+
+            other = other.categories.take(
+                other.codes, allow_fill=True, fill_value=other.categories._na_value
+            )

         # interval-like -> need same closed and matching endpoints
         if is_interval_dtype(other_dtype):
             if self.closed != other.closed:
-                return np.zeros(len(self), dtype=bool)
-            return (self.left == other.left) & (self.right == other.right)
+                return invalid_comparison(self, other, op)
+            elif not isinstance(other, Interval):
+                other = type(self)(other)
+
+            if op is operator.eq:
+                return (self._left == other.left) & (self._right == other.right)
+            elif op is operator.ne:
+                return (self._left != other.left) | (self._right != other.right)
+            elif op is operator.gt:
+                return (self._left > other.left) | (
+                    (self._left == other.left) & (self._right > other.right)
+                )
+            elif op is operator.ge:
+                return (self == other) | (self > other)
+            elif op is operator.lt:
+                return (self._left < other.left) | (
+                    (self._left == other.left) & (self._right < other.right)
+                )
+            else:
+                # operator.le
+                return (self == other) | (self < other)

         # non-interval/non-object dtype -> no matches
         if not is_object_dtype(other_dtype):
-            return np.zeros(len(self), dtype=bool)
+            return invalid_comparison(self, other, op)

         # object dtype -> iteratively check for intervals
         result = np.zeros(len(self), dtype=bool)
         for i, obj in enumerate(other):
-            # need object to be an Interval with same closed and endpoints
-            if (
-                isinstance(obj, Interval)
-                and self.closed == obj.closed
-                and self.left[i] == obj.left
-                and self.right[i] == obj.right
-            ):
-                result[i] = True
-
+            try:
+                result[i] = op(self[i], obj)
+            except TypeError:
+                if obj is NA:
+                    # comparison with np.nan returns NA
+                    # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092
+                    result[i] = op is operator.ne
+                else:
+                    raise
         return result

+    @unpack_zerodim_and_defer("__eq__")
+    def __eq__(self, other):
+        return self._cmp_method(other, operator.eq)
+
+    @unpack_zerodim_and_defer("__ne__")
+    def __ne__(self, other):
+        return self._cmp_method(other, operator.ne)
+
+    @unpack_zerodim_and_defer("__gt__")
+    def __gt__(self, other):
+        return self._cmp_method(other, operator.gt)
+
+    @unpack_zerodim_and_defer("__ge__")
+    def __ge__(self, other):
+        return self._cmp_method(other, operator.ge)
+
+    @unpack_zerodim_and_defer("__lt__")
+    def __lt__(self, other):
+        return self._cmp_method(other, operator.lt)
+
+    @unpack_zerodim_and_defer("__le__")
+    def __le__(self, other):
+        return self._cmp_method(other, operator.le)
+
+    def argsort(
+        self,
+        ascending: bool = True,
+        kind: str = "quicksort",
+        na_position: str = "last",
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
+
+        if ascending and kind == "quicksort" and na_position == "last":
+            return np.lexsort((self.right, self.left))
+
+        # TODO: other cases we can use lexsort for? much more performant.
+        return super().argsort(
+            ascending=ascending, kind=kind, na_position=na_position, **kwargs
+        )
+
     def fillna(self, value=None, method=None, limit=None):
         """
         Fill NA/NaN values using the specified method.
@@ -644,23 +721,12 @@ def fillna(self, value=None, method=None, limit=None):
         if limit is not None:
             raise TypeError("limit is not supported for IntervalArray.")

-        if not isinstance(value, Interval):
-            msg = (
-                "'IntervalArray.fillna' only supports filling with a "
-                f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead."
-            )
-            raise TypeError(msg)
-
-        self._check_closed_matches(value, name="value")
+        value_left, value_right = self._validate_fill_value(value)

-        left = self.left.fillna(value=value.left)
-        right = self.right.fillna(value=value.right)
+        left = self.left.fillna(value=value_left)
+        right = self.right.fillna(value=value_right)
         return self._shallow_copy(left, right)

-    @property
-    def dtype(self):
-        return IntervalDtype(self.left.dtype)
-
     def astype(self, dtype, copy=True):
         """
         Cast to an ExtensionArray or NumPy array with dtype 'dtype'.
@@ -680,6 +746,7 @@ def astype(self, dtype, copy=True):
         array : ExtensionArray or ndarray
             ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
""" + from pandas import Index from pandas.core.arrays.string_ import StringDtype if dtype is not None: @@ -691,8 +758,10 @@ def astype(self, dtype, copy=True): # need to cast to different subtype try: - new_left = self.left.astype(dtype.subtype) - new_right = self.right.astype(dtype.subtype) + # We need to use Index rules for astype to prevent casting + # np.nan entries to int subtypes + new_left = Index(self._left, copy=False).astype(dtype.subtype) + new_right = Index(self._right, copy=False).astype(dtype.subtype) except TypeError as err: msg = ( f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" @@ -700,7 +769,7 @@ def astype(self, dtype, copy=True): raise TypeError(msg) from err return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): - return Categorical(np.asarray(self)) + return Categorical(np.asarray(self), dtype=dtype) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -711,8 +780,20 @@ def astype(self, dtype, copy=True): msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" raise TypeError(msg) from err + def equals(self, other) -> bool: + if type(self) != type(other): + return False + + return bool( + self.closed == other.closed + and self.left.equals(other.left) + and self.right.equals(other.right) + ) + @classmethod - def _concat_same_type(cls, to_concat): + def _concat_same_type( + cls: Type[IntervalArrayT], to_concat: Sequence[IntervalArrayT] + ) -> IntervalArrayT: """ Concatenate multiple IntervalArray @@ -733,20 +814,7 @@ def _concat_same_type(cls, to_concat): right = np.concatenate([interval.right for interval in to_concat]) return cls._simple_new(left, right, closed=closed, copy=False) - def _shallow_copy(self, left, right): - """ - Return a new IntervalArray with the replacement attributes - - Parameters - ---------- - left : Index - Values to be used for the left-side of the intervals. - right : Index - Values to be used for the right-side of the intervals. - """ - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) - - def copy(self): + def copy(self: IntervalArrayT) -> IntervalArrayT: """ Return a copy of the array. @@ -754,23 +822,14 @@ def copy(self): ------- IntervalArray """ - left = self.left.copy(deep=True) - right = self.right.copy(deep=True) + left = self._left.copy() + right = self._right.copy() closed = self.closed # TODO: Could skip verify_integrity here. 
return type(self).from_arrays(left, right, closed=closed) - def isna(self): - return isna(self.left) - - @property - def nbytes(self) -> int: - return self.left.nbytes + self.right.nbytes - - @property - def size(self) -> int: - # Avoid materializing self.values - return self.left.size + def isna(self) -> np.ndarray: + return isna(self._left) def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if not len(self) or periods == 0: @@ -786,7 +845,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": empty_len = min(abs(periods), len(self)) if isna(fill_value): - fill_value = self.left._na_value + from pandas import Index + + fill_value = Index(self._left, copy=False)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: empty = self._from_sequence([fill_value] * empty_len) @@ -799,7 +860,7 @@ def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": b = empty return self._concat_same_type([a, b]) - def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): + def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. @@ -845,32 +906,77 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): When `indices` contains negative values other than ``-1`` and `allow_fill` is True. """ - nv.validate_take(tuple(), kwargs) + nv.validate_take((), kwargs) fill_left = fill_right = fill_value if allow_fill: - if fill_value is None: - fill_left = fill_right = self.left._na_value - elif is_interval(fill_value): - self._check_closed_matches(fill_value, name="fill_value") - fill_left, fill_right = fill_value.left, fill_value.right - elif not is_scalar(fill_value) and notna(fill_value): - msg = ( - "'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. " - f"Got a '{type(fill_value).__name__}' instead." - ) - raise ValueError(msg) + fill_left, fill_right = self._validate_fill_value(fill_value) left_take = take( - self.left, indices, allow_fill=allow_fill, fill_value=fill_left + self._left, indices, allow_fill=allow_fill, fill_value=fill_left ) right_take = take( - self.right, indices, allow_fill=allow_fill, fill_value=fill_right + self._right, indices, allow_fill=allow_fill, fill_value=fill_right ) return self._shallow_copy(left_take, right_take) + def _validate_listlike(self, value): + # list-like of intervals + try: + array = IntervalArray(value) + # TODO: self._check_closed_matches(array, name="value") + value_left, value_right = array.left, array.right + except TypeError as err: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." 
+            raise TypeError(msg) from err
+        return value_left, value_right
+
+    def _validate_scalar(self, value):
+        if isinstance(value, Interval):
+            self._check_closed_matches(value, name="value")
+            left, right = value.left, value.right
+        elif is_valid_nat_for_dtype(value, self.left.dtype):
+            # GH#18295
+            left = right = value
+        else:
+            raise TypeError(
+                "can only insert Interval objects and NA into an IntervalArray"
+            )
+        return left, right
+
+    def _validate_fill_value(self, value):
+        return self._validate_scalar(value)
+
+    def _validate_setitem_value(self, value):
+        needs_float_conversion = False
+
+        if is_valid_nat_for_dtype(value, self.left.dtype):
+            # na value: need special casing to set directly on numpy arrays
+            if is_integer_dtype(self.dtype.subtype):
+                # can't set NaN on a numpy integer array
+                needs_float_conversion = True
+            elif is_datetime64_any_dtype(self.dtype.subtype):
+                # need proper NaT to set directly on the numpy array
+                value = np.datetime64("NaT")
+            elif is_timedelta64_dtype(self.dtype.subtype):
+                # need proper NaT to set directly on the numpy array
+                value = np.timedelta64("NaT")
+            value_left, value_right = value, value
+
+        elif is_interval_dtype(value) or isinstance(value, Interval):
+            # scalar interval
+            self._check_closed_matches(value, name="value")
+            value_left, value_right = value.left, value.right
+
+        else:
+            return self._validate_listlike(value)
+
+        if needs_float_conversion:
+            raise ValueError("Cannot set float NaN to integer-backed IntervalArray")
+        return value_left, value_right
+
     def value_counts(self, dropna=True):
         """
         Returns a Series containing counts of each interval.
@@ -891,7 +997,8 @@ def value_counts(self, dropna=True):
         # TODO: implement this in a non-naive way!
         return value_counts(np.asarray(self), dropna=dropna)

-    # Formatting
+    # ---------------------------------------------------------------------
+    # Rendering Methods

     def _format_data(self):

@@ -945,13 +1052,18 @@ def _format_space(self):
         space = " " * (len(type(self).__name__) + 1)
         return f"\n{space}"

+    # ---------------------------------------------------------------------
+    # Vectorized Interval Properties/Attributes
+
     @property
     def left(self):
         """
         Return the left endpoints of each Interval in the IntervalArray as an
         Index.
         """
-        return self._left
+        from pandas import Index
+
+        return Index(self._left, copy=False)

     @property
     def right(self):
@@ -959,7 +1071,112 @@ def right(self):
         Return the right endpoints of each Interval in the IntervalArray as an
         Index.
         """
-        return self._right
+        from pandas import Index
+
+        return Index(self._right, copy=False)
+
+    @property
+    def length(self):
+        """
+        Return an Index with entries denoting the length of each Interval in
+        the IntervalArray.
+        """
+        try:
+            return self.right - self.left
+        except TypeError as err:
+            # length not defined for some types, e.g. string
+            msg = (
+                "IntervalArray contains Intervals without defined length, "
+                "e.g. Intervals with string endpoints"
+            )
+            raise TypeError(msg) from err
+
+    @property
+    def mid(self):
+        """
+        Return the midpoint of each Interval in the IntervalArray as an Index.
+        """
+        try:
+            return 0.5 * (self.left + self.right)
+        except TypeError:
+            # datetime safe version
+            return self.left + 0.5 * self.length
+
+    _interval_shared_docs["overlaps"] = textwrap.dedent(
+        """
+        Check elementwise if an Interval overlaps the values in the %(klass)s.
+
+        Two intervals overlap if they share a common point, including closed
+        endpoints. Intervals that only have an open endpoint in common do not
+        overlap.
+
+        .. versionadded:: 0.24.0
+
+        Parameters
+        ----------
+        other : %(klass)s
+            Interval to check against for an overlap.
+
+        Returns
+        -------
+        ndarray
+            Boolean array positionally indicating where an overlap occurs.
+
+        See Also
+        --------
+        Interval.overlaps : Check whether two Interval objects overlap.
+
+        Examples
+        --------
+        %(examples)s
+        >>> intervals.overlaps(pd.Interval(0.5, 1.5))
+        array([ True,  True, False])
+
+        Intervals that share closed endpoints overlap:
+
+        >>> intervals.overlaps(pd.Interval(1, 3, closed='left'))
+        array([ True,  True,  True])
+
+        Intervals that only have an open endpoint in common do not overlap:
+
+        >>> intervals.overlaps(pd.Interval(1, 2, closed='right'))
+        array([False,  True, False])
+        """
+    )
+
+    @Appender(
+        _interval_shared_docs["overlaps"]
+        % {
+            "klass": "IntervalArray",
+            "examples": textwrap.dedent(
+                """\
+        >>> data = [(0, 1), (1, 3), (2, 4)]
+        >>> intervals = pd.arrays.IntervalArray.from_tuples(data)
+        >>> intervals
+        <IntervalArray>
+        [(0, 1], (1, 3], (2, 4]]
+        Length: 3, closed: right, dtype: interval[int64]
+        """
+            ),
+        }
+    )
+    def overlaps(self, other):
+        if isinstance(other, (IntervalArray, ABCIntervalIndex)):
+            raise NotImplementedError
+        elif not isinstance(other, Interval):
+            msg = f"`other` must be Interval-like, got {type(other).__name__}"
+            raise TypeError(msg)
+
+        # equality is okay if both endpoints are closed (overlap at a point)
+        op1 = le if (self.closed_left and other.closed_right) else lt
+        op2 = le if (other.closed_left and self.closed_right) else lt
+
+        # overlaps is equivalent to the negation of the two intervals being disjoint:
+        # disjoint = (A.left > B.right) or (B.left > A.right)
+        # (simplifying the negation allows this to be done in fewer operations)
+        return op1(self.left, other.right) & op2(other.left, self.right)
+
+    # ---------------------------------------------------------------------

     @property
     def closed(self):
@@ -992,9 +1209,9 @@ def closed(self):

     @Appender(
         _interval_shared_docs["set_closed"]
-        % dict(
-            klass="IntervalArray",
-            examples=textwrap.dedent(
+        % {
+            "klass": "IntervalArray",
+            "examples": textwrap.dedent(
                 """\
         Examples
         --------
@@ -1009,44 +1226,17 @@ def closed(self):
         Length: 3, closed: both, dtype: interval[int64]
         """
             ),
-        )
+        }
     )
     def set_closed(self, closed):
-        if closed not in _VALID_CLOSED:
+        if closed not in VALID_CLOSED:
             msg = f"invalid option for 'closed': {closed}"
             raise ValueError(msg)

         return type(self)._simple_new(
-            left=self.left, right=self.right, closed=closed, verify_integrity=False
+            left=self._left, right=self._right, closed=closed, verify_integrity=False
         )

-    @property
-    def length(self):
-        """
-        Return an Index with entries denoting the length of each Interval in
-        the IntervalArray.
-        """
-        try:
-            return self.right - self.left
-        except TypeError as err:
-            # length not defined for some types, e.g. string
-            msg = (
-                "IntervalArray contains Intervals without defined length, "
-                "e.g. Intervals with string endpoints"
-            )
-            raise TypeError(msg) from err
-
-    @property
-    def mid(self):
-        """
-        Return the midpoint of each Interval in the IntervalArray as an Index.
-        """
-        try:
-            return 0.5 * (self.left + self.right)
-        except TypeError:
-            # datetime safe version
-            return self.left + 0.5 * self.length
-
     _interval_shared_docs[
         "is_non_overlapping_monotonic"
     ] = """
@@ -1057,7 +1247,7 @@ def mid(self):

     # https://github.com/python/mypy/issues/1362
     # Mypy does not support decorated properties
-    @property  # type: ignore
+    @property  # type: ignore[misc]
     @Appender(
         _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs
     )
@@ -1070,25 +1260,27 @@ def is_non_overlapping_monotonic(self):
         # at a point when both sides of intervals are included
         if self.closed == "both":
             return bool(
-                (self.right[:-1] < self.left[1:]).all()
-                or (self.left[:-1] > self.right[1:]).all()
+                (self._right[:-1] < self._left[1:]).all()
+                or (self._left[:-1] > self._right[1:]).all()
             )

         # non-strict inequality when closed != 'both'; at least one side is
         # not included in the intervals, so equality does not imply overlapping
         return bool(
-            (self.right[:-1] <= self.left[1:]).all()
-            or (self.left[:-1] >= self.right[1:]).all()
+            (self._right[:-1] <= self._left[1:]).all()
+            or (self._left[:-1] >= self._right[1:]).all()
        )

+    # ---------------------------------------------------------------------
     # Conversion
+
     def __array__(self, dtype=None) -> np.ndarray:
         """
         Return the IntervalArray's data as a numpy array of Interval objects
         (with dtype='object')
         """
-        left = self.left
-        right = self.right
+        left = self._left
+        right = self._right
         mask = self.isna()
         closed = self._closed

@@ -1105,6 +1297,7 @@ def __arrow_array__(self, type=None):
         Convert myself into a pyarrow Array.
         """
         import pyarrow
+
         from pandas.core.arrays._arrow_utils import ArrowIntervalType

         try:
@@ -1117,8 +1310,8 @@ def __arrow_array__(self, type=None):
         interval_type = ArrowIntervalType(subtype, self.closed)
         storage_array = pyarrow.StructArray.from_arrays(
             [
-                pyarrow.array(self.left, type=subtype, from_pandas=True),
-                pyarrow.array(self.right, type=subtype, from_pandas=True),
+                pyarrow.array(self._left, type=subtype, from_pandas=True),
+                pyarrow.array(self._right, type=subtype, from_pandas=True),
             ],
             names=["left", "right"],
         )
@@ -1162,8 +1355,6 @@ def __arrow_array__(self, type=None):
         Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA
         value itself if False, ``nan``.

-        .. versionadded:: 0.23.0
-
         Returns
         -------
         tuples: %(return_type)s
@@ -1171,18 +1362,20 @@
     """

     @Appender(
-        _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="")
+        _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""}
     )
     def to_tuples(self, na_tuple=True):
-        tuples = com.asarray_tuplesafe(zip(self.left, self.right))
+        tuples = com.asarray_tuplesafe(zip(self._left, self._right))
         if not na_tuple:
             # GH 18756
             tuples = np.where(~self.isna(), tuples, np.nan)
         return tuples

+    # ---------------------------------------------------------------------
+
     @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs)
     def repeat(self, repeats, axis=None):
-        nv.validate_repeat(tuple(), dict(axis=axis))
+        nv.validate_repeat((), {"axis": axis})
         left_repeat = self.left.repeat(repeats)
         right_repeat = self.right.repeat(repeats)
         return self._shallow_copy(left=left_repeat, right=right_repeat)
@@ -1221,9 +1414,9 @@ def repeat(self, repeats, axis=None):

     @Appender(
         _interval_shared_docs["contains"]
-        % dict(
-            klass="IntervalArray",
-            examples=textwrap.dedent(
+        % {
+            "klass": "IntervalArray",
+            "examples": textwrap.dedent(
                 """\
         >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])
         >>> intervals
@@ -1232,90 +1425,16 @@ def repeat(self, repeats, axis=None):
         Length: 3, closed: right, dtype: interval[int64]
         """
             ),
-        )
+        }
     )
     def contains(self, other):
         if isinstance(other, Interval):
             raise NotImplementedError("contains not implemented for two intervals")

-        return (self.left < other if self.open_left else self.left <= other) & (
-            other < self.right if self.open_right else other <= self.right
+        return (self._left < other if self.open_left else self._left <= other) & (
+            other < self._right if self.open_right else other <= self._right
         )

-    _interval_shared_docs["overlaps"] = textwrap.dedent(
-        """
-        Check elementwise if an Interval overlaps the values in the %(klass)s.
-
-        Two intervals overlap if they share a common point, including closed
-        endpoints. Intervals that only have an open endpoint in common do not
-        overlap.
-
-        .. versionadded:: 0.24.0
-
-        Parameters
-        ----------
-        other : %(klass)s
-            Interval to check against for an overlap.
-
-        Returns
-        -------
-        ndarray
-            Boolean array positionally indicating where an overlap occurs.
-
-        See Also
-        --------
-        Interval.overlaps : Check whether two Interval objects overlap.
-
-        Examples
-        --------
-        %(examples)s
-        >>> intervals.overlaps(pd.Interval(0.5, 1.5))
-        array([ True,  True, False])
-
-        Intervals that share closed endpoints overlap:
-
-        >>> intervals.overlaps(pd.Interval(1, 3, closed='left'))
-        array([ True,  True,  True])
-
-        Intervals that only have an open endpoint in common do not overlap:
-
-        >>> intervals.overlaps(pd.Interval(1, 2, closed='right'))
-        array([False,  True, False])
-        """
-    )
-
-    @Appender(
-        _interval_shared_docs["overlaps"]
-        % dict(
-            klass="IntervalArray",
-            examples=textwrap.dedent(
-                """\
-        >>> data = [(0, 1), (1, 3), (2, 4)]
-        >>> intervals = pd.arrays.IntervalArray.from_tuples(data)
-        >>> intervals
-        <IntervalArray>
-        [(0, 1], (1, 3], (2, 4]]
-        Length: 3, closed: right, dtype: interval[int64]
-        """
-            ),
-        )
-    )
-    def overlaps(self, other):
-        if isinstance(other, (IntervalArray, ABCIntervalIndex)):
-            raise NotImplementedError
-        elif not isinstance(other, Interval):
-            msg = f"`other` must be Interval-like, got {type(other).__name__}"
-            raise TypeError(msg)
-
-        # equality is okay if both endpoints are closed (overlap at a point)
-        op1 = le if (self.closed_left and other.closed_right) else lt
-        op2 = le if (other.closed_left and self.closed_right) else lt
-
-        # overlaps is equivalent to the negation of the two intervals being disjoint:
-        # disjoint = (A.left > B.right) or (B.left > A.right)
-        # (simplifying the negation allows this to be done in fewer operations)
-        return op1(self.left, other.right) & op2(other.left, self.right)
-

 def maybe_convert_platform_interval(values):
     """
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 235840d6d201e..caed932cd7857 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -1,11 +1,13 @@
-from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union

 import numpy as np

 from pandas._libs import lib, missing as libmissing
 from pandas._typing import Scalar
 from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc
+from pandas.util._decorators import cache_readonly, doc

 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import (
@@ -17,9 +19,10 @@
 from pandas.core.dtypes.missing import isna, notna

 from pandas.core import nanops
-from pandas.core.algorithms import _factorize_array, take
+from pandas.core.algorithms import factorize_array, take
 from pandas.core.array_algos import masked_reductions
-from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays import ExtensionArray
 from pandas.core.indexers import check_array_indexer

 if TYPE_CHECKING:
@@ -34,14 +37,28 @@ class BaseMaskedDtype(ExtensionDtype):
     Base class for dtypes for BaseMaskedArray subclasses.
     """

+    name: str
+    base = None
+    type: Type
+
     na_value = libmissing.NA

-    @property
+    @cache_readonly
     def numpy_dtype(self) -> np.dtype:
-        raise AbstractMethodError
+        """ Return an instance of our numpy dtype """
+        return np.dtype(self.type)
+
+    @cache_readonly
+    def kind(self) -> str:
+        return self.numpy_dtype.kind
+
+    @cache_readonly
+    def itemsize(self) -> int:
+        """ Return the number of bytes in this dtype """
+        return self.numpy_dtype.itemsize

     @classmethod
-    def construct_array_type(cls) -> Type["BaseMaskedArray"]:
+    def construct_array_type(cls) -> Type[BaseMaskedArray]:
         """
         Return the array type associated with this dtype.
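Illustrative aside, not part of the patch: with the `cache_readonly` properties added above, a concrete masked dtype now only declares its `type` attribute, and `numpy_dtype`, `kind`, and `itemsize` are all derived from it. A minimal sketch of the resulting behavior, assuming a pandas build that includes this change; `Int64Dtype` is one such concrete `BaseMaskedDtype` subclass:

import numpy as np
import pandas as pd

dtype = pd.Int64Dtype()                        # a concrete BaseMaskedDtype subclass
assert dtype.numpy_dtype == np.dtype("int64")  # np.dtype(dtype.type), cached
assert dtype.kind == "i"                       # delegated to numpy_dtype.kind
assert dtype.itemsize == 8                     # numpy_dtype.itemsize, in bytes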
@@ -52,7 +69,7 @@ def construct_array_type(cls) -> Type["BaseMaskedArray"]: raise NotImplementedError -class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): +class BaseMaskedArray(OpsMixin, ExtensionArray): """ Base class for masked arrays (which use _data and _mask to store the data). @@ -69,9 +86,9 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): "mask should be boolean numpy array. Use " "the 'pd.array' function instead" ) - if not values.ndim == 1: + if values.ndim != 1: raise ValueError("values must be a 1D array") - if not mask.ndim == 1: + if mask.ndim != 1: raise ValueError("mask must be a 1D array") if copy: @@ -85,7 +102,9 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): def dtype(self) -> BaseMaskedDtype: raise AbstractMethodError(self) - def __getitem__(self, item): + def __getitem__( + self, item: Union[int, slice, np.ndarray] + ) -> Union[BaseMaskedArray, Any]: if is_integer(item): if self._mask[item]: return self.dtype.na_value @@ -126,7 +145,7 @@ def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: return type(self)(~self._data, self._mask) def to_numpy( - self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default, + self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default ) -> np.ndarray: """ Convert to a NumPy Array. @@ -194,7 +213,8 @@ def to_numpy( dtype = object if self._hasna: if ( - not (is_object_dtype(dtype) or is_string_dtype(dtype)) + not is_object_dtype(dtype) + and not is_string_dtype(dtype) and na_value is libmissing.NA ): raise ValueError( @@ -245,7 +265,9 @@ def nbytes(self) -> int: return self._data.nbytes + self._mask.nbytes @classmethod - def _concat_same_type(cls: Type[BaseMaskedArrayT], to_concat) -> BaseMaskedArrayT: + def _concat_same_type( + cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT] + ) -> BaseMaskedArrayT: data = np.concatenate([x._data for x in to_concat]) mask = np.concatenate([x._mask for x in to_concat]) return cls(data, mask) @@ -253,6 +275,7 @@ def _concat_same_type(cls: Type[BaseMaskedArrayT], to_concat) -> BaseMaskedArray def take( self: BaseMaskedArrayT, indexer, + *, allow_fill: bool = False, fill_value: Optional[Scalar] = None, ) -> BaseMaskedArrayT: @@ -287,7 +310,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) @@ -341,7 +364,7 @@ def value_counts(self, dropna: bool = True) -> "Series": return Series(counts, index=index) - def _reduce(self, name: str, skipna: bool = True, **kwargs): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): data = self._data mask = self._mask diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py new file mode 100644 index 0000000000000..5447a84c86ac1 --- /dev/null +++ b/pandas/core/arrays/numeric.py @@ -0,0 +1,92 @@ +import datetime + +import numpy as np + +from pandas._libs import Timedelta, missing as libmissing +from pandas.errors import AbstractMethodError + +from pandas.core.dtypes.common import ( + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, +) + +from .masked import BaseMaskedArray + + +class NumericArray(BaseMaskedArray): + """ + Base class for IntegerArray and 
FloatingArray. + """ + + def _maybe_mask_result(self, result, mask, other, op_name: str): + raise AbstractMethodError(self) + + def _arith_method(self, other, op): + op_name = op.__name__ + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, NumericArray): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + elif isinstance(other, (datetime.timedelta, np.timedelta64)): + other = Timedelta(other) + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. + mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index f6dfb1f0f1e62..50d12703c3a30 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -1,5 +1,5 @@ import numbers -from typing import Optional, Tuple, Type, Union +from typing import Tuple, Type, Union import numpy as np from numpy.lib.mixins import NDArrayOperatorsMixin @@ -7,23 +7,14 @@ from pandas._libs import lib from pandas._typing import Scalar from pandas.compat.numpy import function as nv -from pandas.util._decorators import doc -from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna -from pandas import compat -from pandas.core import nanops -from pandas.core.algorithms import searchsorted -from pandas.core.array_algos import masked_reductions +from pandas.core import nanops, ops +from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin -from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer -from pandas.core.missing import backfill_1d, pad_1d +from pandas.core.strings.object_array import ObjectStringArrayMixin class PandasDtype(ExtensionDtype): @@ -122,7 
+113,10 @@ def itemsize(self) -> int: class PandasArray( - NDArrayBackedExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin + OpsMixin, + NDArrayBackedExtensionArray, + NDArrayOperatorsMixin, + ObjectStringArrayMixin, ): """ A pandas ExtensionArray for NumPy data. @@ -150,7 +144,7 @@ class PandasArray( # If you're wondering why pd.Series(cls) doesn't put the array in an # ExtensionBlock, search for `ABCPandasArray`. We check for - # that _typ to ensure that that users don't unnecessarily use EAs inside + # that _typ to ensure that users don't unnecessarily use EAs inside # pandas internals, which turns off things like block consolidation. _typ = "npy_extension" __array_priority__ = 1000 @@ -177,7 +171,9 @@ def __init__(self, values: Union[np.ndarray, "PandasArray"], copy: bool = False) self._dtype = PandasDtype(values.dtype) @classmethod - def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "PandasArray": + def _from_sequence( + cls, scalars, *, dtype=None, copy: bool = False + ) -> "PandasArray": if isinstance(dtype, PandasDtype): dtype = dtype._dtype @@ -190,10 +186,6 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "PandasArray def _from_factorized(cls, values, original) -> "PandasArray": return cls(values) - @classmethod - def _concat_same_type(cls, to_concat) -> "PandasArray": - return cls(np.concatenate(to_concat)) - def _from_backing_data(self, arr: np.ndarray) -> "PandasArray": return type(self)(arr) @@ -226,6 +218,16 @@ def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)): return NotImplemented + if ufunc not in [np.logical_or, np.bitwise_or, np.bitwise_xor]: + # For binary ops, use our custom dunder methods + # We haven't implemented logical dunder funcs, so exclude these + # to avoid RecursionError + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + # Defer to the implementation of the ufunc on unwrapped values. inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs) if out: @@ -255,158 +257,113 @@ def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): # ------------------------------------------------------------------------ # Pandas ExtensionArray Interface - def __getitem__(self, item): - if isinstance(item, type(self)): - item = item._ndarray - - item = check_array_indexer(self, item) - - result = self._ndarray[item] - if not lib.is_scalar(item): - result = type(self)(result) - return result - - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - - key = check_array_indexer(self, key) - scalar_value = lib.is_scalar(value) - - if not scalar_value: - value = np.asarray(value, dtype=self._ndarray.dtype) - - self._ndarray[key] = value - def isna(self) -> np.ndarray: return isna(self._ndarray) - def fillna( - self, value=None, method: Optional[str] = None, limit: Optional[int] = None, - ) -> "PandasArray": - # TODO(_values_for_fillna): remove this - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. 
Got ({len(value)}) " - f" expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - func = pad_1d if method == "pad" else backfill_1d - new_values = func(self._ndarray, limit=limit, mask=mask) - new_values = self._from_sequence(new_values, dtype=self.dtype) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - def _validate_fill_value(self, fill_value): if fill_value is None: # Primarily for subclasses fill_value = self.dtype.na_value return fill_value - def _values_for_argsort(self) -> np.ndarray: - return self._ndarray - def _values_for_factorize(self) -> Tuple[np.ndarray, int]: return self._ndarray, -1 # ------------------------------------------------------------------------ # Reductions - def _reduce(self, name, skipna=True, **kwargs): - meth = getattr(self, name, None) - if meth: - return meth(skipna=skipna, **kwargs) - else: - msg = f"'{type(self).__name__}' does not implement reduction '{name}'" - raise TypeError(msg) - - def any(self, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_any((), dict(out=out, keepdims=keepdims)) - return nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + def any(self, *, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_any((), {"out": out, "keepdims": keepdims}) + result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def all(self, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_all((), dict(out=out, keepdims=keepdims)) - return nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + def all(self, *, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_all((), {"out": out, "keepdims": keepdims}) + result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def min(self, skipna: bool = True, **kwargs) -> Scalar: + def min(self, *, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) - result = masked_reductions.min( - values=self.to_numpy(), mask=self.isna(), skipna=skipna + result = nanops.nanmin( + values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna ) - return result + return self._wrap_reduction_result(axis, result) - def max(self, skipna: bool = True, **kwargs) -> Scalar: + def max(self, *, axis=None, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) - result = masked_reductions.max( - values=self.to_numpy(), mask=self.isna(), skipna=skipna + result = nanops.nanmax( + values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna ) - return result + return self._wrap_reduction_result(axis, result) - def sum(self, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: + def sum(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_sum((), kwargs) - return nanops.nansum( + result = nanops.nansum( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) + return self._wrap_reduction_result(axis, result) - def prod(self, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: + def prod(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_prod((), kwargs) - return nanops.nanprod( + result = nanops.nanprod( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) + return self._wrap_reduction_result(axis, result) - def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): - nv.validate_mean((), 
dict(dtype=dtype, out=out, keepdims=keepdims)) - return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims}) + result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) def median( - self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True + self, *, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True ): nv.validate_median( - (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) + (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims} ) - return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + def std( + self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True + ): nv.validate_stat_ddof_func( - (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" ) - return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) - def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + def var( + self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True + ): nv.validate_stat_ddof_func( - (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var" ) - return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) - def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): + def sem( + self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True + ): nv.validate_stat_ddof_func( - (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem" ) - return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + return self._wrap_reduction_result(axis, result) - def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + def kurt(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt" ) - return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) + return self._wrap_reduction_result(axis, result) - def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + def skew(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew" ) - return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) + 
return self._wrap_reduction_result(axis, result) # ------------------------------------------------------------------------ # Additional Methods @@ -424,38 +381,44 @@ def to_numpy( return result - @doc(ExtensionArray.searchsorted) - def searchsorted(self, value, side="left", sorter=None): - return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) - # ------------------------------------------------------------------------ # Ops def __invert__(self): return type(self)(~self._ndarray) - @classmethod - def _create_arithmetic_method(cls, op): - def arithmetic_method(self, other): - if isinstance(other, (ABCIndexClass, ABCSeries)): - return NotImplemented - - elif isinstance(other, cls): - other = other._ndarray - - with np.errstate(all="ignore"): - result = op(self._ndarray, other) - - if op is divmod: - a, b = result - return cls(a), cls(b) - - return cls(result) + def _cmp_method(self, other, op): + if isinstance(other, PandasArray): + other = other._ndarray + + pd_op = ops.get_array_op(op) + result = pd_op(self._ndarray, other) + + if op is divmod or op is ops.rdivmod: + a, b = result + if isinstance(a, np.ndarray): + # for e.g. op vs TimedeltaArray, we may already + # have an ExtensionArray, in which case we do not wrap + return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b) + return a, b + + if isinstance(result, np.ndarray): + # for e.g. multiplication vs TimedeltaArray, we may already + # have an ExtensionArray, in which case we do not wrap + return self._wrap_ndarray_result(result) + return result - return compat.set_function_name(arithmetic_method, f"__{op.__name__}__", cls) + _arith_method = _cmp_method - _create_comparison_method = _create_arithmetic_method + def _wrap_ndarray_result(self, result: np.ndarray): + # If we have timedelta64[ns] result, return a TimedeltaArray instead + # of a PandasArray + if result.dtype == "timedelta64[ns]": + from pandas.core.arrays import TimedeltaArray + return TimedeltaArray._simple_new(result) + return type(self)(result) -PandasArray._add_arithmetic_ops() -PandasArray._add_comparison_ops() + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8d5cb12d60e4d..7b0e4ce5b0748 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -33,6 +33,7 @@ TD64NS_DTYPE, ensure_object, is_datetime64_dtype, + is_dtype_equal, is_float_dtype, is_period_dtype, pandas_dtype, @@ -62,11 +63,13 @@ def f(self): return property(f) -class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): +class PeriodArray(PeriodMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. - Users should use :func:`period_array` to create new instances. + Users should use :func:`period_range` to create new instances. + Alternatively, :func:`array` can be used to create new instances + from a sequence of Period scalars. Parameters ---------- @@ -75,14 +78,14 @@ class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): converted to ordinals without inference or copy (PeriodArray, ndarray[int64]), or a box around such an array (Series[period], PeriodIndex). + dtype : PeriodDtype, optional + A PeriodDtype instance from which to extract a `freq`. If both + `freq` and `dtype` are specified, then the frequencies must match. freq : str or DateOffset The `freq` to use for the array. 
Mostly applicable when `values` is an ndarray of integers, when `freq` is required. When `values` is a PeriodArray (or box around), it's checked that ``values.freq`` matches `freq`. - dtype : PeriodDtype, optional - A PeriodDtype instance from which to extract a `freq`. If both - `freq` and `dtype` are specified, then the frequencies must match. copy : bool, default False Whether to copy the ordinals before storing. @@ -96,8 +99,10 @@ class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): See Also -------- - period_array : Create a new PeriodArray. + Period: Represents a period of time. PeriodIndex : Immutable Index for period data. + period_range: Create a fixed-frequency PeriodArray. + array: Construct a pandas array. Notes ----- @@ -119,6 +124,7 @@ class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): _scalar_type = Period _recognized_scalars = (Period,) _is_recognized_dtype = is_period_dtype + _infer_matches = ("period",) # Names others delegate to us _other_ops: List[str] = [] @@ -135,7 +141,9 @@ class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): "weekday", "week", "dayofweek", + "day_of_week", "dayofyear", + "day_of_year", "quarter", "qyear", "days_in_month", @@ -147,7 +155,7 @@ class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): # -------------------------------------------------------------------- # Constructors - def __init__(self, values, freq=None, dtype=None, copy=False): + def __init__(self, values, dtype=None, freq=None, copy=False): freq = validate_dtype_freq(dtype, freq) if freq is not None: @@ -173,16 +181,19 @@ def __init__(self, values, freq=None, dtype=None, copy=False): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values: np.ndarray, freq=None, **kwargs) -> "PeriodArray": + def _simple_new( + cls, values: np.ndarray, freq: Optional[BaseOffset] = None, dtype=None + ) -> "PeriodArray": # alias for PeriodArray.__init__ assertion_msg = "Should be numpy array of type i8" assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg - return cls(values, freq=freq, **kwargs) + return cls(values, freq=freq, dtype=dtype) @classmethod def _from_sequence( cls: Type["PeriodArray"], scalars: Union[Sequence[Optional[Period]], AnyArrayLike], + *, dtype: Optional[PeriodDtype] = None, copy: bool = False, ) -> "PeriodArray": @@ -198,8 +209,6 @@ def _from_sequence( return scalars periods = np.asarray(scalars, dtype=object) - if copy: - periods = periods.copy() freq = freq or libperiod.extract_freq(periods) ordinals = libperiod.extract_ordinals(periods, freq) @@ -207,9 +216,9 @@ def _from_sequence( @classmethod def _from_sequence_of_strings( - cls, strings, dtype=None, copy=False + cls, strings, *, dtype=None, copy=False ) -> "PeriodArray": - return cls._from_sequence(strings, dtype, copy) + return cls._from_sequence(strings, dtype=dtype, copy=copy) @classmethod def _from_datetime64(cls, data, freq, tz=None) -> "PeriodArray": @@ -253,12 +262,14 @@ def _generate_range(cls, start, end, periods, freq, fields): # ----------------------------------------------------------------- # DatetimeLike Interface - def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: + def _unbox_scalar( + self, value: Union[Period, NaTType], setitem: bool = False + ) -> int: if value is NaT: - return value.value + return np.int64(value.value) elif isinstance(value, self._scalar_type): - self._check_compatible_with(value) - return value.ordinal + 
self._check_compatible_with(value, setitem=setitem) + return np.int64(value.ordinal) else: raise ValueError(f"'value' should be a Period. Got '{value}' instead.") @@ -278,8 +289,8 @@ def _check_compatible_with(self, other, setitem: bool = False): def dtype(self) -> PeriodDtype: return self._dtype - # error: Read-only property cannot override read-write property [misc] - @property # type: ignore + # error: Read-only property cannot override read-write property + @property # type: ignore[misc] def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. @@ -300,6 +311,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType if type is not None: @@ -367,12 +379,13 @@ def __arrow_array__(self, type=None): """, ) week = weekofyear - dayofweek = _field_accessor( - "weekday", + day_of_week = _field_accessor( + "day_of_week", """ The day of the week with Monday=0, Sunday=6. """, ) + dayofweek = day_of_week weekday = dayofweek dayofyear = day_of_year = _field_accessor( "day_of_year", @@ -479,9 +492,8 @@ def _time_shift(self, periods, freq=None): values[self._isnan] = iNaT return type(self)(values, freq=self.freq) - @property - def _box_func(self): - return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) + def _box_func(self, x) -> Union[Period, NaTType]: + return Period._from_ordinal(ordinal=x, freq=self.freq) def asfreq(self, freq=None, how: str = "E") -> "PeriodArray": """ @@ -576,11 +588,22 @@ def astype(self, dtype, copy: bool = True): # We handle Period[T] -> Period[U] # Our parent handles everything else. dtype = pandas_dtype(dtype) - + if is_dtype_equal(dtype, self._dtype): + if not copy: + return self + else: + return self.copy() if is_period_dtype(dtype): return self.asfreq(dtype.freq) return super().astype(dtype, copy=copy) + def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: + value = self._validate_searchsorted_value(value).view("M8[ns]") + + # Cast to M8 to get datetime-like NaT placement + m8arr = self._ndarray.view("M8[ns]") + return m8arr.searchsorted(value, side=side, sorter=sorter) + # ------------------------------------------------------------------ # Arithmetic Methods @@ -628,12 +651,12 @@ def _sub_period_array(self, other): new_values = np.array([self.freq.base * x for x in new_values]) if self._hasnans or other._hasnans: - mask = (self._isnan) | (other._isnan) + mask = self._isnan | other._isnan new_values[mask] = NaT return new_values def _addsub_int_array( - self, other: np.ndarray, op: Callable[[Any, Any], Any], + self, other: np.ndarray, op: Callable[[Any, Any], Any] ) -> "PeriodArray": """ Add or subtract array of integers; equivalent to applying @@ -653,7 +676,7 @@ def _addsub_int_array( other = -other res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) res_values = res_values.view("i8") - res_values[self._isnan] = iNaT + np.putmask(res_values, self._isnan, iNaT) return type(self)(res_values, freq=self.freq) def _add_offset(self, other: BaseOffset): @@ -862,7 +885,7 @@ def period_array( if is_datetime64_dtype(data_dtype): return PeriodArray._from_datetime64(data, freq) if is_period_dtype(data_dtype): - return PeriodArray(data, freq) + return PeriodArray(data, freq=freq) # other iterable of some kind if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): @@ -1059,11 +1082,9 @@ def _make_field_arrays(*fields): elif length is None: length = len(x) - arrays = [ + return [ 
np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) else np.repeat(x, length) for x in fields ] - - return arrays diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index e928db499a771..e9ff4b7d4ffc2 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -5,6 +5,6 @@ BlockIndex, IntIndex, SparseArray, - _make_index, + make_sparse_index, ) from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8a30d2b954b55..ec4b0fd89860c 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -87,10 +87,10 @@ def from_coo(cls, A, dense_index=False): 1 0 3.0 dtype: Sparse[float64, nan] """ - from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series + from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series - result = _coo_to_sparse_series(A, dense_index=dense_index) + result = coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) return result @@ -168,9 +168,9 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): >>> columns [('a', 0), ('a', 1), ('b', 0), ('b', 1)] """ - from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo + from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo - A, rows, columns = _sparse_series_to_coo( + A, rows, columns = sparse_series_to_coo( self._parent, row_levels, column_levels, sort_labels=sort_labels ) return A, rows, columns @@ -253,9 +253,10 @@ def from_spmatrix(cls, data, index=None, columns=None): 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import DataFrame from pandas._libs.sparse import IntIndex + from pandas import DataFrame + data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape @@ -354,8 +355,8 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): - import pandas.core.indexes.base as ibase from pandas.core.indexes.api import ensure_index + import pandas.core.indexes.base as ibase N, K = data.shape if index is None: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b18a58da3950f..b8375af797b3a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -4,7 +4,7 @@ from collections import abc import numbers import operator -from typing import Any, Callable, Union +from typing import Any, Callable, Sequence, Type, TypeVar, Union import warnings import numpy as np @@ -14,7 +14,6 @@ from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex from pandas._libs.tslibs import NaT from pandas._typing import Scalar -import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning @@ -23,6 +22,7 @@ construct_1d_arraylike_from_scalar, find_common_type, infer_dtype_from_scalar, + maybe_box_datetimelike, ) from pandas.core.dtypes.common import ( is_array_like, @@ -40,7 +40,8 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com 
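Illustrative aside, not part of the patch: the helpers renamed above (`coo_to_sparse_series`, `sparse_series_to_coo`) remain the private backends of the public `Series.sparse.from_coo` / `Series.sparse.to_coo` API, whose behavior is unchanged. A minimal usage sketch, assuming scipy is installed; the expected output is the one shown in the accessor's own docstring:

import pandas as pd
from scipy import sparse

A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
ss = pd.Series.sparse.from_coo(A)  # internally calls coo_to_sparse_series
print(ss)
# 0  2    1.0
#    3    2.0
# 1  0    3.0
# dtype: Sparse[float64, nan]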
@@ -49,15 +50,15 @@ from pandas.core.missing import interpolate_2d from pandas.core.nanops import check_below_min_count import pandas.core.ops as ops -from pandas.core.ops.common import unpack_zerodim_and_defer import pandas.io.formats.printing as printing # ---------------------------------------------------------------------------- # Array +SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray") -_sparray_doc_kwargs = dict(klass="SparseArray") +_sparray_doc_kwargs = {"klass": "SparseArray"} def _get_fill(arr: "SparseArray") -> np.ndarray: @@ -195,7 +196,7 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): ) -class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): +class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ An ExtensionArray for storing sparse data. @@ -234,7 +235,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'int', 'block'}, default 'int' + kind : {'integer', 'block'}, default 'integer' The type of storage for sparse locations. * 'block': Stores a `block` and `block_length` for each @@ -271,7 +272,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): """ _subtyp = "sparse_array" # register ABCSparseArray - _deprecations = PandasObject._deprecations | frozenset(["get_values"]) + _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"]) _sparse_index: SparseIndex def __init__( @@ -317,9 +318,8 @@ def __init__( raise Exception("must only pass scalars with an index") if is_scalar(data): - if index is not None: - if data is None: - data = np.nan + if index is not None and data is None: + data = np.nan if index is not None: npoints = len(index) @@ -398,8 +398,11 @@ def __init__( @classmethod def _simple_new( - cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype - ) -> "SparseArray": + cls: Type[SparseArrayT], + sparse_array: np.ndarray, + sparse_index: SparseIndex, + dtype: SparseDtype, + ) -> SparseArrayT: new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array @@ -452,7 +455,7 @@ def from_spmatrix(cls, data): return cls._simple_new(arr, index, dtype) - def __array__(self, dtype=None, copy=True) -> np.ndarray: + def __array__(self, dtype=None) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -485,7 +488,7 @@ def __setitem__(self, key, value): raise TypeError(msg) @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): return cls(scalars, dtype=dtype) @classmethod @@ -576,8 +579,7 @@ def density(self): >>> s.density 0.6 """ - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return float(self.sp_index.npoints) / float(self.sp_index.length) @property def npoints(self) -> int: @@ -735,35 +737,25 @@ def value_counts(self, dropna=True): """ from pandas import Index, Series - keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps - if fcounts > 0: - if self._null_fill_value and dropna: - pass + if fcounts > 0 and (not self._null_fill_value or not dropna): + mask = isna(keys) if self._null_fill_value else keys == self.fill_value + if mask.any(): + counts[mask] += fcounts else: - if self._null_fill_value: - mask = isna(keys) - else: - mask 
= keys == self.fill_value - - if mask.any(): - counts[mask] += fcounts - else: - keys = np.insert(keys, 0, self.fill_value) - counts = np.insert(counts, 0, fcounts) + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) if not isinstance(keys, ABCIndexClass): keys = Index(keys) - result = Series(counts, index=keys) - return result + return Series(counts, index=keys) # -------- # Indexing # -------- def __getitem__(self, key): - # avoid mypy issues when importing at the top-level - from pandas.core.indexing import check_bool_indexer if isinstance(key, tuple): if len(key) > 1: @@ -796,7 +788,6 @@ def __getitem__(self, key): key = check_array_indexer(self, key) if com.is_bool_indexer(key): - key = check_bool_indexer(self, key) return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): @@ -819,10 +810,10 @@ def _get_val_at(self, loc): return self.fill_value else: val = self.sp_values[sp_loc] - val = com.maybe_box_datetimelike(val, self.sp_values.dtype) + val = maybe_box_datetimelike(val, self.sp_values.dtype) return val - def take(self, indices, allow_fill=False, fill_value=None) -> "SparseArray": + def take(self, indices, *, allow_fill=False, fill_value=None) -> "SparseArray": if is_scalar(indices): raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.") indices = np.asarray(indices, dtype=np.int32) @@ -862,21 +853,26 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: else: raise IndexError("cannot do a non-empty take from an empty axes.") + # sp_indexer may be -1 for two reasons + # 1.) we took for an index of -1 (new) + # 2.) we took a value that was self.fill_value (old) sp_indexer = self.sp_index.lookup_array(indices) + new_fill_indices = indices == -1 + old_fill_indices = (sp_indexer == -1) & ~new_fill_indices - if self.sp_index.npoints == 0: + if self.sp_index.npoints == 0 and old_fill_indices.all(): + # We've looked up all valid points on an all-sparse array. + taken = np.full( + sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype + ) + + elif self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values _dtype = np.result_type(self.dtype.subtype, type(fill_value)) taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) else: taken = self.sp_values.take(sp_indexer) - # sp_indexer may be -1 for two reasons - # 1.) we took for an index of -1 (new) - # 2.) we took a value that was self.fill_value (old) - new_fill_indices = indices == -1 - old_fill_indices = (sp_indexer == -1) & ~new_fill_indices - # Fill in two steps. # Old fill values # New fill values @@ -945,12 +941,14 @@ def searchsorted(self, v, side="left", sorter=None): v = np.asarray(v) return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) - def copy(self): + def copy(self: SparseArrayT) -> SparseArrayT: values = self.sp_values.copy() return self._simple_new(values, self.sp_index, self.dtype) @classmethod - def _concat_same_type(cls, to_concat): + def _concat_same_type( + cls: Type[SparseArrayT], to_concat: Sequence[SparseArrayT] + ) -> SparseArrayT: fill_value = to_concat[0].fill_value values = [] @@ -981,7 +979,7 @@ def _concat_same_type(cls, to_concat): # get an identical index as concating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting - # BlockIndex may have more blocs. + # BlockIndex may have more blocks. 
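+            # For example (an illustrative sketch, not part of this change):
+            # concatenating an array whose last block is (loc=3, length=1) with
+            # one whose first block starts at its own loc=0 yields adjacent
+            # result blocks (3, 1) and (4, 1) rather than one merged block (3, 2).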
blengths = [] blocs = [] @@ -1058,6 +1056,11 @@ def astype(self, dtype=None, copy=True): IntIndex Indices: array([2, 3], dtype=int32) """ + if is_dtype_equal(dtype, self._dtype): + if not copy: + return self + else: + return self.copy() dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str # TODO copy=False is broken for astype_nansafe with int -> float, so cannot @@ -1159,7 +1162,7 @@ def nonzero(self): # Reductions # ------------------------------------------------------------------------ - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, *, skipna: bool = True, **kwargs): method = getattr(self, name, None) if method is None: @@ -1304,19 +1307,6 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - def transpose(self, *axes) -> "SparseArray": - """ - Returns the SparseArray. - """ - return self - - @property - def T(self) -> "SparseArray": - """ - Returns the SparseArray. - """ - return self - # ------------------------------------------------------------------------ # Ufuncs # ------------------------------------------------------------------------ @@ -1380,110 +1370,82 @@ def __abs__(self): # Ops # ------------------------------------------------------------------------ - @classmethod - def _create_unary_method(cls, op) -> Callable[["SparseArray"], "SparseArray"]: - def sparse_unary_method(self) -> "SparseArray": - fill_value = op(np.array(self.fill_value)).item() - values = op(self.sp_values) - dtype = SparseDtype(values.dtype, fill_value) - return cls._simple_new(values, self.sp_index, dtype) - - name = f"__{op.__name__}__" - return compat.set_function_name(sparse_unary_method, name, cls) - - @classmethod - def _create_arithmetic_method(cls, op): + def _arith_method(self, other, op): op_name = op.__name__ - @unpack_zerodim_and_defer(op_name) - def sparse_arithmetic_method(self, other): - - if isinstance(other, SparseArray): - return _sparse_array_op(self, other, op, op_name) - - elif is_scalar(other): - with np.errstate(all="ignore"): - fill = op(_get_fill(self), np.asarray(other)) - result = op(self.sp_values, other) - - if op_name == "divmod": - left, right = result - lfill, rfill = fill - return ( - _wrap_result(op_name, left, self.sp_index, lfill), - _wrap_result(op_name, right, self.sp_index, rfill), - ) - - return _wrap_result(op_name, result, self.sp_index, fill) + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) - else: - other = np.asarray(other) - with np.errstate(all="ignore"): - # TODO: look into _wrap_result - if len(self) != len(other): - raise AssertionError( - (f"length mismatch: {len(self)} vs. 
{len(other)}") - ) - if not isinstance(other, SparseArray): - dtype = getattr(other, "dtype", None) - other = SparseArray( - other, fill_value=self.fill_value, dtype=dtype - ) - return _sparse_array_op(self, other, op, op_name) - - name = f"__{op.__name__}__" - return compat.set_function_name(sparse_arithmetic_method, name, cls) - - @classmethod - def _create_comparison_method(cls, op): - op_name = op.__name__ - if op_name in {"and_", "or_"}: - op_name = op_name[:-1] + elif is_scalar(other): + with np.errstate(all="ignore"): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) - @unpack_zerodim_and_defer(op_name) - def cmp_method(self, other): + if op_name == "divmod": + left, right = result + lfill, rfill = fill + return ( + _wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill), + ) - if not is_scalar(other) and not isinstance(other, type(self)): - # convert list-like to ndarray - other = np.asarray(other) + return _wrap_result(op_name, result, self.sp_index, fill) - if isinstance(other, np.ndarray): - # TODO: make this more flexible than just ndarray... + else: + other = np.asarray(other) + with np.errstate(all="ignore"): + # TODO: look into _wrap_result if len(self) != len(other): raise AssertionError( f"length mismatch: {len(self)} vs. {len(other)}" ) - other = SparseArray(other, fill_value=self.fill_value) - - if isinstance(other, SparseArray): + if not isinstance(other, SparseArray): + dtype = getattr(other, "dtype", None) + other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) return _sparse_array_op(self, other, op, op_name) - else: - with np.errstate(all="ignore"): - fill_value = op(self.fill_value, other) - result = op(self.sp_values, other) - - return type(self)( - result, - sparse_index=self.sp_index, - fill_value=fill_value, - dtype=np.bool_, - ) - name = f"__{op.__name__}__" - return compat.set_function_name(cmp_method, name, cls) + def _cmp_method(self, other, op) -> "SparseArray": + if not is_scalar(other) and not isinstance(other, type(self)): + # convert list-like to ndarray + other = np.asarray(other) - @classmethod - def _add_unary_ops(cls): - cls.__pos__ = cls._create_unary_method(operator.pos) - cls.__neg__ = cls._create_unary_method(operator.neg) - cls.__invert__ = cls._create_unary_method(operator.invert) + if isinstance(other, np.ndarray): + # TODO: make this more flexible than just ndarray... + if len(self) != len(other): + raise AssertionError(f"length mismatch: {len(self)} vs. 
{len(other)}") + other = SparseArray(other, fill_value=self.fill_value) - @classmethod - def _add_comparison_ops(cls): - cls.__and__ = cls._create_comparison_method(operator.and_) - cls.__or__ = cls._create_comparison_method(operator.or_) - cls.__xor__ = cls._create_arithmetic_method(operator.xor) - super()._add_comparison_ops() + if isinstance(other, SparseArray): + op_name = op.__name__.strip("_") + return _sparse_array_op(self, other, op, op_name) + else: + with np.errstate(all="ignore"): + fill_value = op(self.fill_value, other) + result = op(self.sp_values, other) + + return type(self)( + result, + sparse_index=self.sp_index, + fill_value=fill_value, + dtype=np.bool_, + ) + + _logical_method = _cmp_method + + def _unary_method(self, op) -> "SparseArray": + fill_value = op(np.array(self.fill_value)).item() + values = op(self.sp_values) + dtype = SparseDtype(values.dtype, fill_value) + return type(self)._simple_new(values, self.sp_index, dtype) + + def __pos__(self) -> "SparseArray": + return self._unary_method(operator.pos) + + def __neg__(self) -> "SparseArray": + return self._unary_method(operator.neg) + + def __invert__(self) -> "SparseArray": + return self._unary_method(operator.invert) # ---------- # Formatting @@ -1500,12 +1462,7 @@ def _formatter(self, boxed=False): return None -SparseArray._add_arithmetic_ops() -SparseArray._add_comparison_ops() -SparseArray._add_unary_ops() - - -def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy=False): +def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None): """ Convert ndarray to sparse format @@ -1551,7 +1508,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy else: indices = mask.nonzero()[0].astype(np.int32) - index = _make_index(length, indices, kind) + index = make_sparse_index(length, indices, kind) sparsified_values = arr[mask] if dtype is not None: sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) @@ -1559,7 +1516,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy return sparsified_values, index, fill_value -def _make_index(length, indices, kind): +def make_sparse_index(length, indices, kind): if kind == "block" or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index ccf2825162f51..c0662911d40da 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -22,7 +22,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype if TYPE_CHECKING: - from pandas.core.arrays.sparse.array import SparseArray # noqa: F401 + from pandas.core.arrays.sparse.array import SparseArray @register_extension_dtype @@ -180,7 +180,7 @@ def construct_array_type(cls) -> Type["SparseArray"]: ------- type """ - from pandas.core.arrays.sparse.array import SparseArray # noqa: F811 + from pandas.core.arrays.sparse.array import SparseArray return SparseArray diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index eafd782dc9b9c..56c678c88b9c7 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -85,7 +85,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): return values, i_coord, j_coord, i_labels, j_labels -def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): +def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), 
sort_labels=False): """ Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -113,7 +113,7 @@ def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=F return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index: bool = False): +def coo_to_sparse_series(A, dense_index: bool = False): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5104e3f12f5b4..cc2013deb5252 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,25 +1,32 @@ -import operator from typing import TYPE_CHECKING, Type, Union import numpy as np from pandas._libs import lib, missing as libmissing +from pandas._typing import Scalar +from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype -from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) -from pandas import compat from pandas.core import ops -from pandas.core.arrays import IntegerArray, PandasArray +from pandas.core.array_algos import masked_reductions +from pandas.core.arrays import FloatingArray, IntegerArray, PandasArray +from pandas.core.arrays.floating import FloatingDtype from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow @register_extension_dtype @@ -80,7 +87,7 @@ def __from_arrow__( """ Construct StringArray from pyarrow Array/ChunkedArray. """ - import pyarrow # noqa: F811 + import pyarrow if isinstance(array, pyarrow.Array): chunks = [array] @@ -178,11 +185,13 @@ class StringArray(PandasArray): def __init__(self, values, copy=False): values = extract_array(values) - skip_validation = isinstance(values, type(self)) super().__init__(values, copy=copy) - self._dtype = StringDtype() - if not skip_validation: + # pandas\core\arrays\string_.py:188: error: Incompatible types in + # assignment (expression has type "StringDtype", variable has type + # "PandasDtype") [assignment] + self._dtype = StringDtype() # type: ignore[assignment] + if not isinstance(values, type(self)): self._validate() def _validate(self): @@ -196,33 +205,35 @@ def _validate(self): ) @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): + def _from_sequence(cls, scalars, *, dtype=None, copy=False): if dtype: assert dtype == "string" - result = np.asarray(scalars, dtype="object") - if copy and result is scalars: - result = result.copy() - - # Standardize all missing-like values to NA - # TODO: it would be nice to do this in _validate / lib.is_string_array - # We are already doing a scan over the values there. 
-        na_values = isna(result)
-        has_nans = na_values.any()
-        if has_nans and result is scalars:
-            # force a copy now, if we haven't already
-            result = result.copy()
-
-        # convert to str, then to object to avoid dtype like '<U3'
...
+    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
+        nv.validate_min((), kwargs)
+        result = masked_reductions.min(
+            values=self.to_numpy(), mask=self.isna(), skipna=skipna
+        )
+        return self._wrap_reduction_result(axis, result)
+
+    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
+        nv.validate_max((), kwargs)
+        result = masked_reductions.max(
+            values=self.to_numpy(), mask=self.isna(), skipna=skipna
+        )
+        return self._wrap_reduction_result(axis, result)
+
     def value_counts(self, dropna=False):
         from pandas import value_counts

         return value_counts(self._ndarray, dropna=dropna).astype("Int64")

-    def memory_usage(self, deep=False):
+    def memory_usage(self, deep: bool = False) -> int:
         result = self._ndarray.nbytes
         if deep:
             return result + lib.memory_usage_of_objects(self._ndarray)
         return result

-    # Override parent because we have different return types.
-    @classmethod
-    def _create_arithmetic_method(cls, op):
-        # Note: this handles both arithmetic and comparison methods.
-        def method(self, other):
-            from pandas.arrays import BooleanArray
+    def _cmp_method(self, other, op):
+        from pandas.arrays import BooleanArray

-            assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS
+        if isinstance(other, StringArray):
+            other = other._ndarray

-            if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)):
-                return NotImplemented
+        mask = isna(self) | isna(other)
+        valid = ~mask

-            elif isinstance(other, cls):
-                other = other._ndarray
+        if not lib.is_scalar(other):
+            if len(other) != len(self):
+                # prevent improper broadcasting when other is 2D
+                raise ValueError(
+                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
+                )

-            mask = isna(self) | isna(other)
-            valid = ~mask
+            other = np.asarray(other)
+            other = other[valid]

-            if not lib.is_scalar(other):
-                if len(other) != len(self):
-                    # prevent improper broadcasting when other is 2D
-                    raise ValueError(
-                        f"Lengths of operands do not match: {len(self)} != {len(other)}"
-                    )
+        if op.__name__ in ops.ARITHMETIC_BINOPS:
+            result = np.empty_like(self._ndarray, dtype="object")
+            result[mask] = StringDtype.na_value
+            result[valid] = op(self._ndarray[valid], other)
+            return StringArray(result)
+        else:
+            # logical
+            result = np.zeros(len(self._ndarray), dtype="bool")
+            result[valid] = op(self._ndarray[valid], other)
+            return BooleanArray(result, mask)

-                other = np.asarray(other)
-                other = other[valid]
+    _arith_method = _cmp_method

-            if op.__name__ in ops.ARITHMETIC_BINOPS:
-                result = np.empty_like(self._ndarray, dtype="object")
-                result[mask] = StringDtype.na_value
-                result[valid] = op(self._ndarray[valid], other)
-                return StringArray(result)
-            else:
-                # logical
-                result = np.zeros(len(self._ndarray), dtype="bool")
-                result[valid] = op(self._ndarray[valid], other)
-                return BooleanArray(result, mask)
+    # ------------------------------------------------------------------------
+    # String methods interface
+    _str_na_value = StringDtype.na_value

-            return compat.set_function_name(method, f"__{op.__name__}__", cls)
+    def _str_map(self, f, na_value=None, dtype=None):
+        from pandas.arrays import BooleanArray, IntegerArray, StringArray
+        from pandas.core.arrays.string_ import StringDtype

-    @classmethod
-    def _add_arithmetic_ops(cls):
-        cls.__add__ = cls._create_arithmetic_method(operator.add)
-        cls.__radd__ = cls._create_arithmetic_method(ops.radd)
+        if dtype is
None: + dtype = StringDtype() + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) - cls.__mul__ = cls._create_arithmetic_method(operator.mul) - cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + dtype=np.dtype(dtype), + ) - _create_comparison_method = _create_arithmetic_method + if not na_value_is_na: + mask[:] = False + return constructor(result, mask) -StringArray._add_arithmetic_ops() -StringArray._add_comparison_ops() + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py new file mode 100644 index 0000000000000..184fbc050036b --- /dev/null +++ b/pandas/core/arrays/string_arrow.py @@ -0,0 +1,625 @@ +from __future__ import annotations + +from distutils.version import LooseVersion +from typing import TYPE_CHECKING, Any, Sequence, Type, Union + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas.util._validators import validate_fillna_kwargs + +from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.api.types import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer, validate_indices +from pandas.core.missing import get_fill_func + +try: + import pyarrow as pa +except ImportError: + pa = None +else: + # our min supported version of pyarrow, 0.15.1, does not have a compute + # module + try: + import pyarrow.compute as pc + except ImportError: + pass + else: + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } + + +if TYPE_CHECKING: + from pandas import Series + + +@register_extension_dtype +class ArrowStringDtype(ExtensionDtype): + """ + Extension dtype for string data in a ``pyarrow.ChunkedArray``. + + .. versionadded:: 1.2.0 + + .. warning:: + + ArrowStringDtype is considered experimental. The implementation and + parts of the API may change without warning. 
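+
+    Use ``pd.array(..., dtype="arrow_string")`` to construct arrays with this
+    dtype; see the ``ArrowStringArray`` examples further below.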
+
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
+    Examples
+    --------
+    >>> from pandas.core.arrays.string_arrow import ArrowStringDtype
+    >>> ArrowStringDtype()
+    ArrowStringDtype
+    """
+
+    name = "arrow_string"
+
+    #: StringDtype.na_value uses pandas.NA
+    na_value = libmissing.NA
+
+    @property
+    def type(self) -> Type[str]:
+        return str
+
+    @classmethod
+    def construct_array_type(cls) -> Type["ArrowStringArray"]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return ArrowStringArray
+
+    def __hash__(self) -> int:
+        return hash("ArrowStringDtype")
+
+    def __repr__(self) -> str:
+        return "ArrowStringDtype"
+
+    def __from_arrow__(
+        self, array: Union["pa.Array", "pa.ChunkedArray"]
+    ) -> "ArrowStringArray":
+        """
+        Construct ArrowStringArray from pyarrow Array/ChunkedArray.
+        """
+        return ArrowStringArray(array)
+
+    def __eq__(self, other) -> bool:
+        """Check whether 'other' is equal to self.
+
+        By default, 'other' is considered equal if
+        * it's a string matching 'self.name'.
+        * it's an instance of this type.
+
+        Parameters
+        ----------
+        other : Any
+
+        Returns
+        -------
+        bool
+        """
+        if isinstance(other, ArrowStringDtype):
+            return True
+        elif isinstance(other, str) and other == "arrow_string":
+            return True
+        else:
+            return False
+
+
+class ArrowStringArray(OpsMixin, ExtensionArray):
+    """
+    Extension array for string data in a ``pyarrow.ChunkedArray``.
+
+    .. versionadded:: 1.2.0
+
+    .. warning::
+
+       ArrowStringArray is considered experimental. The implementation and
+       parts of the API may change without warning.
+
+    Parameters
+    ----------
+    values : pyarrow.Array or pyarrow.ChunkedArray
+        The array of data.
+
+    Attributes
+    ----------
+    None
+
+    Methods
+    -------
+    None
+
+    See Also
+    --------
+    array
+        The recommended function for creating an ArrowStringArray.
+    Series.str
+        The string methods are available on Series backed by
+        an ArrowStringArray.
+
+    Notes
+    -----
+    ArrowStringArray returns a BooleanArray for comparison methods.
+
+    Examples
+    --------
+    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string")
+    <ArrowStringArray>
+    ['This is', 'some text', <NA>, 'data.']
+    Length: 4, dtype: arrow_string
+    """
+
+    _dtype = ArrowStringDtype()
+
+    def __init__(self, values):
+        self._chk_pyarrow_available()
+        if isinstance(values, pa.Array):
+            self._data = pa.chunked_array([values])
+        elif isinstance(values, pa.ChunkedArray):
+            self._data = values
+        else:
+            raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")
+
+        if not pa.types.is_string(self._data.type):
+            raise ValueError(
+                "ArrowStringArray requires a PyArrow (chunked) array of string type"
+            )
+
+    @classmethod
+    def _chk_pyarrow_available(cls) -> None:
+        # TODO: maybe update import_optional_dependency to allow a minimum
+        # version to be specified rather than use the global minimum
+        if pa is None or LooseVersion(pa.__version__) < "1.0.0":
+            msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray."
+ raise ImportError(msg) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + cls._chk_pyarrow_available() + # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value + scalars = lib.ensure_string_array(scalars, copy=False) + return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) + + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @property + def dtype(self) -> ArrowStringDtype: + """ + An instance of 'ArrowStringDtype'. + """ + return self._dtype + + def __array__(self, dtype=None) -> np.ndarray: + """Correctly construct numpy arrays when passed to `np.asarray()`.""" + return self.to_numpy(dtype=dtype) + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow Array or ChunkedArray.""" + return self._data + + def to_numpy( + self, dtype=None, copy: bool = False, na_value=lib.no_default + ) -> np.ndarray: + """ + Convert to a NumPy ndarray. + """ + # TODO: copy argument is ignored + + if na_value is lib.no_default: + na_value = self._dtype.na_value + result = self._data.__array__(dtype=dtype) + result[isna(result)] = na_value + return result + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self._data) + + @classmethod + def _from_factorized(cls, values, original): + return cls._from_sequence(values) + + @classmethod + def _concat_same_type(cls, to_concat) -> ArrowStringArray: + """ + Concatenate multiple ArrowStringArray. + + Parameters + ---------- + to_concat : sequence of ArrowStringArray + + Returns + ------- + ArrowStringArray + """ + return cls( + pa.chunked_array( + [array for ea in to_concat for array in ea._data.iterchunks()] + ) + ) + + def __getitem__(self, item: Any) -> Any: + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=pa.string())) + elif is_integer_dtype(item.dtype): + return self.take(item) + elif is_bool_dtype(item.dtype): + return type(self)(self._data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + value = self._data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + return self._as_pandas_scalar(value) + + def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): + scalar = arrow_scalar.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar + + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. 
+ + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap. + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + ExtensionArray + With NA/NaN filled. + """ + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f"expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = get_fill_func(method) + new_values = func(self.to_numpy(object), limit=limit, mask=mask) + new_values = self._from_sequence(new_values) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + + def _reduce(self, name, skipna=True, **kwargs): + if name in ["min", "max"]: + return getattr(self, name)(skipna=skipna) + + raise TypeError(f"Cannot perform reduction '{name}' with string dtype") + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self._data.nbytes + + def isna(self) -> np.ndarray: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + # TODO: Implement .to_numpy for ChunkedArray + return self._data.is_null().to_pandas().values + + def copy(self) -> ArrowStringArray: + """ + Return a shallow copy of the array. + + Returns + ------- + ArrowStringArray + """ + return type(self)(self._data) + + def _cmp_method(self, other, op): + from pandas.arrays import BooleanArray + + pc_func = ARROW_CMP_FUNCS[op.__name__] + if isinstance(other, ArrowStringArray): + result = pc_func(self._data, other._data) + elif isinstance(other, np.ndarray): + result = pc_func(self._data, other) + elif is_scalar(other): + try: + result = pc_func(self._data, pa.scalar(other)) + except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): + mask = isna(self) | isna(other) + valid = ~mask + result = np.zeros(len(self), dtype="bool") + result[valid] = op(np.array(self)[valid], other) + return BooleanArray(result, mask) + else: + return NotImplemented + + # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray + return BooleanArray._from_sequence(result.to_pandas().values) + + def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. 
+ + Returns + ------- + None + """ + key = check_array_indexer(self, key) + + if is_integer(key): + if not is_scalar(value): + raise ValueError("Must pass scalars with scalar indexer") + elif isna(value): + value = None + elif not isinstance(value, str): + raise ValueError("Scalar must be NA or str") + + # Slice data and insert inbetween + new_data = [ + *self._data[0:key].chunks, + pa.array([value], type=pa.string()), + *self._data[(key + 1) :].chunks, + ] + self._data = pa.chunked_array(new_data) + else: + # Convert to integer indices and iteratively assign. + # TODO: Make a faster variant of this in Arrow upstream. + # This is probably extremely slow. + + # Convert all possible input key types to an array of integers + if is_bool_dtype(key): + # TODO(ARROW-9430): Directly support setitem(booleans) + key_array = np.argwhere(key).flatten() + elif isinstance(key, slice): + key_array = np.array(range(len(self))[key]) + else: + # TODO(ARROW-9431): Directly support setitem(integers) + key_array = np.asanyarray(key) + + if is_scalar(value): + value = np.broadcast_to(value, len(key_array)) + else: + value = np.asarray(value) + + if len(key_array) != len(value): + raise ValueError("Length of indexer and values mismatch") + + for k, v in zip(key_array, value): + self[k] = v + + def take( + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. 
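+
+        Examples
+        --------
+        A minimal sketch (assumes pyarrow is installed; output is indicative):
+
+        >>> arr = pd.array(["a", "b", None], dtype="arrow_string")
+        >>> arr.take([0, -1], allow_fill=True)
+        <ArrowStringArray>
+        ['a', <NA>]
+        Length: 2, dtype: arrow_string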
+ """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + indices_array = indices + + if len(self._data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._data)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._data.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. + if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + + vc = self._data.value_counts() + + # Index cannot hold ExtensionArrays yet + index = Index(type(self)(vc.field(0)).astype(object)) + # No missings, so we can adhere to the interface and return a numpy array. 
+ counts = np.array(vc.field(1)) + + if dropna and self._data.null_count > 0: + raise NotImplementedError("yo") + + return Series(counts, index=index).astype("Int64") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a378423df788b..c51882afc4871 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1,18 +1,33 @@ from datetime import timedelta -from typing import List +from typing import List, Optional, Union import numpy as np from pandas._libs import lib, tslibs -from pandas._libs.tslibs import NaT, Period, Tick, Timedelta, Timestamp, iNaT, to_offset +from pandas._libs.tslibs import ( + BaseOffset, + NaT, + NaTType, + Period, + Tick, + Timedelta, + Timestamp, + iNaT, + to_offset, +) from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.fields import get_timedelta_field -from pandas._libs.tslibs.timedeltas import array_to_timedelta64, parse_timedelta_unit +from pandas._libs.tslibs.timedeltas import ( + array_to_timedelta64, + ints_to_pytimedelta, + parse_timedelta_unit, +) from pandas.compat.numpy import function as nv from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, + is_categorical_dtype, is_dtype_equal, is_float_dtype, is_integer_dtype, @@ -29,15 +44,15 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr -from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays import IntegerArray, datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.ops.common import unpack_zerodim_and_defer -def _field_accessor(name, alias, docstring=None): - def f(self): +def _field_accessor(name: str, alias: str, docstring: str): + def f(self) -> np.ndarray: values = self.asi8 result = get_timedelta_field(values, alias) if self._hasnans: @@ -52,7 +67,7 @@ def f(self): return property(f) -class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): +class TimedeltaArray(dtl.TimelikeOps): """ Pandas ExtensionArray for timedelta data. @@ -89,6 +104,7 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _scalar_type = Timedelta _recognized_scalars = (timedelta, np.timedelta64, Tick) _is_recognized_dtype = is_timedelta64_dtype + _infer_matches = ("timedelta", "timedelta64") __array_priority__ = 1000 # define my properties & methods for delegation @@ -105,15 +121,14 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): "ceil", ] - # Note: ndim must be defined to ensure NaT.__richcmp(TimedeltaArray) + # Note: ndim must be defined to ensure NaT.__richcmp__(TimedeltaArray) # operates pointwise. - @property - def _box_func(self): - return lambda x: Timedelta(x, unit="ns") + def _box_func(self, x) -> Union[Timedelta, NaTType]: + return Timedelta(x, unit="ns") @property - def dtype(self): + def dtype(self) -> np.dtype: """ The dtype for the TimedeltaArray. 
@@ -188,7 +203,9 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new(cls, values, freq=None, dtype=TD64NS_DTYPE): + def _simple_new( + cls, values, freq: Optional[BaseOffset] = None, dtype=TD64NS_DTYPE + ) -> "TimedeltaArray": assert dtype == TD64NS_DTYPE, dtype assert isinstance(values, np.ndarray), type(values) if values.dtype != TD64NS_DTYPE: @@ -203,8 +220,25 @@ def _simple_new(cls, values, freq=None, dtype=TD64NS_DTYPE): @classmethod def _from_sequence( - cls, data, dtype=TD64NS_DTYPE, copy=False, freq=lib.no_default, unit=None - ): + cls, data, *, dtype=TD64NS_DTYPE, copy: bool = False + ) -> "TimedeltaArray": + if dtype: + _validate_td64_dtype(dtype) + + data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) + freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) + + return cls._simple_new(data, freq=freq) + + @classmethod + def _from_sequence_not_strict( + cls, + data, + dtype=TD64NS_DTYPE, + copy: bool = False, + freq=lib.no_default, + unit=None, + ) -> "TimedeltaArray": if dtype: _validate_td64_dtype(dtype) @@ -256,10 +290,6 @@ def _generate_range(cls, start, end, periods, freq, closed=None): index = generate_regular_range(start, end, periods, freq) else: index = np.linspace(start.value, end.value, periods).astype("i8") - if len(index) >= 2: - # Infer a frequency - td = Timedelta(index[1] - index[0]) - freq = to_offset(td) if not left_closed: index = index[1:] @@ -271,11 +301,11 @@ def _generate_range(cls, start, end, periods, freq, closed=None): # ---------------------------------------------------------------- # DatetimeLike Interface - def _unbox_scalar(self, value): + def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64: if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timedelta.") - self._check_compatible_with(value) - return value.value + self._check_compatible_with(value, setitem=setitem) + return np.timedelta64(value.value, "ns") def _scalar_from_string(self, value): return Timedelta(value) @@ -284,13 +314,10 @@ def _check_compatible_with(self, other, setitem: bool = False): # we don't have anything to validate. 
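        # (DatetimeArray, by contrast, uses this hook to validate timezone
        # compatibility; TimedeltaArray has no analogous state to check.)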
pass - def _maybe_clear_freq(self): - self._freq = None - # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): # We handle # --> timedelta64[ns] # --> timedelta64 @@ -304,10 +331,9 @@ def astype(self, dtype, copy=True): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - values = self._maybe_mask_results( + return self._maybe_mask_results( result, fill_value=None, convert="float64" ) - return values result = self._data.astype(dtype, copy=copy) return result.astype("i8") elif is_timedelta64_ns_dtype(dtype): @@ -316,11 +342,28 @@ def astype(self, dtype, copy=True): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + def __iter__(self): + if self.ndim > 1: + for i in range(len(self)): + yield self[i] + else: + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = 10000 + chunks = int(length / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pytimedelta(data[start_i:end_i], box=True) + yield from converted + # ---------------------------------------------------------------- # Reductions def sum( self, + *, axis=None, dtype=None, out=None, @@ -330,20 +373,17 @@ def sum( min_count: int = 0, ): nv.validate_sum( - (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) + (), {"dtype": dtype, "out": out, "keepdims": keepdims, "initial": initial} ) - if not len(self): - return NaT - if not skipna and self._hasnans: - return NaT result = nanops.nansum( - self._data, axis=axis, skipna=skipna, min_count=min_count + self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) - return Timedelta(result) + return self._wrap_reduction_result(axis, result) def std( self, + *, axis=None, dtype=None, out=None, @@ -352,41 +392,26 @@ def std( skipna: bool = True, ): nv.validate_stat_ddof_func( - (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" + (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" ) - if not len(self): - return NaT - if not skipna and self._hasnans: - return NaT - result = nanops.nanstd(self._data, axis=axis, skipna=skipna, ddof=ddof) - return Timedelta(result) - - def median( - self, - axis=None, - out=None, - overwrite_input: bool = False, - keepdims: bool = False, - skipna: bool = True, - ): - nv.validate_median( - (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) - ) - return nanops.nanmedian(self._data, axis=axis, skipna=skipna) + result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) # ---------------------------------------------------------------- # Rendering Methods def _formatter(self, boxed=False): - from pandas.io.formats.format import _get_format_timedelta64 + from pandas.io.formats.format import get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + return get_format_timedelta64(self, box=True) def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import _get_format_timedelta64 + from pandas.io.formats.format import get_format_timedelta64 - formatter = _get_format_timedelta64(self._data, na_rep) + formatter = get_format_timedelta64(self._data, na_rep) return np.array([formatter(x) for x in 
self._data.ravel()]).reshape(self.shape) # ---------------------------------------------------------------- @@ -453,7 +478,7 @@ def _addsub_object_array(self, other, op): ) from err @unpack_zerodim_and_defer("__mul__") - def __mul__(self, other): + def __mul__(self, other) -> "TimedeltaArray": if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._data * other @@ -488,7 +513,7 @@ def __mul__(self, other): def __truediv__(self, other): # timedelta / X is well-defined for timedelta-like or numeric X - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) if other is NaT: # specifically timedelta64-NaT @@ -545,7 +570,7 @@ def __truediv__(self, other): @unpack_zerodim_and_defer("__rtruediv__") def __rtruediv__(self, other): # X / timedelta is defined only for timedelta-like X - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) if other is NaT: # specifically timedelta64-NaT @@ -588,7 +613,7 @@ def __rtruediv__(self, other): def __floordiv__(self, other): if is_scalar(other): - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT @@ -603,7 +628,7 @@ def __floordiv__(self, other): # at this point we should only have numeric scalars; anything # else will raise result = self.asi8 // other - result[self._isnan] = iNaT + np.putmask(result, self._isnan, iNaT) freq = None if self.freq is not None: # Note: freq gets division, not floor-division @@ -628,8 +653,8 @@ def __floordiv__(self, other): result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.int64) - result[mask] = np.nan + result = result.astype(np.float64) + np.putmask(result, mask, np.nan) return result elif is_object_dtype(other.dtype): @@ -652,7 +677,7 @@ def __floordiv__(self, other): def __rfloordiv__(self, other): if is_scalar(other): - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT @@ -677,14 +702,13 @@ def __rfloordiv__(self, other): elif is_timedelta64_dtype(other.dtype): other = type(self)(other) - # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = other.asi8 // self.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.int64) - result[mask] = np.nan + result = result.astype(np.float64) + np.putmask(result, mask, np.nan) return result elif is_object_dtype(other.dtype): @@ -699,21 +723,21 @@ def __rfloordiv__(self, other): @unpack_zerodim_and_defer("__mod__") def __mod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) return self - (self // other) * other @unpack_zerodim_and_defer("__rmod__") def __rmod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) return other - (other // self) * self @unpack_zerodim_and_defer("__divmod__") def __divmod__(self, other): # Note: This is a naive implementation, can likely be 
optimized - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) res1 = self // other @@ -723,29 +747,29 @@ def __divmod__(self, other): @unpack_zerodim_and_defer("__rdivmod__") def __rdivmod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, (timedelta, np.timedelta64, Tick)): + if isinstance(other, self._recognized_scalars): other = Timedelta(other) res1 = other // self res2 = other - res1 * self return res1, res2 - def __neg__(self): + def __neg__(self) -> "TimedeltaArray": if self.freq is not None: return type(self)(-self._data, freq=-self.freq) return type(self)(-self._data) - def __pos__(self): + def __pos__(self) -> "TimedeltaArray": return type(self)(self._data, freq=self.freq) - def __abs__(self): + def __abs__(self) -> "TimedeltaArray": # Note: freq is not preserved return type(self)(np.abs(self._data)) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timedelta methods - def total_seconds(self): + def total_seconds(self) -> np.ndarray: """ Return total duration of each element expressed in seconds. @@ -921,6 +945,11 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data + elif isinstance(data, IntegerArray): + data = data.to_numpy("int64", na_value=tslibs.iNaT) + elif is_categorical_dtype(data.dtype): + data = data.categories.take(data.codes, fill_value=NaT)._values + copy = False # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): diff --git a/pandas/core/base.py b/pandas/core/base.py index b62ef668df5e1..f333ee0f71e46 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,42 +4,57 @@ import builtins import textwrap -from typing import Any, Dict, FrozenSet, List, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + FrozenSet, + Optional, + TypeVar, + Union, + cast, +) import numpy as np import pandas._libs.lib as lib +from pandas._typing import DtypeObj, IndexLabel from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc -from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_categorical_dtype, is_dict_like, is_extension_array_dtype, - is_list_like, is_object_dtype, is_scalar, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, remove_na_arraylike -from pandas.core import algorithms, common as com +from pandas.core import algorithms from pandas.core.accessor import DirNamesMixin from pandas.core.algorithms import duplicated, unique1d, value_counts +from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.construction import create_series_with_explicit_dtype import pandas.core.nanops as nanops -_shared_docs: Dict[str, str] = dict() -_indexops_doc_kwargs = dict( - klass="IndexOpsMixin", - inplace="", - unique="IndexOpsMixin", - duplicated="IndexOpsMixin", -) +if TYPE_CHECKING: + from pandas import Categorical + +_shared_docs: Dict[str, str] = {} +_indexops_doc_kwargs = { + "klass": "IndexOpsMixin", + "inplace": "", + "unique": 
"IndexOpsMixin", + "duplicated": "IndexOpsMixin", +} + +_T = TypeVar("_T", bound="IndexOpsMixin") class PandasObject(DirNamesMixin): @@ -80,7 +95,9 @@ def __sizeof__(self): either a value or Series of values """ if hasattr(self, "memory_usage"): - mem = self.memory_usage(deep=True) + # pandas\core\base.py:84: error: "PandasObject" has no attribute + # "memory_usage" [attr-defined] + mem = self.memory_usage(deep=True) # type: ignore[attr-defined] return int(mem if is_scalar(mem) else mem.sum()) # no memory_usage attribute, so fall back to object's 'sizeof' @@ -135,7 +152,7 @@ class SelectionMixin: object sub-classes need to define: obj, exclusions """ - _selection = None + _selection: Optional[IndexLabel] = None _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) @@ -189,10 +206,18 @@ def _selection_list(self): @cache_readonly def _selected_obj(self): - if self._selection is None or isinstance(self.obj, ABCSeries): - return self.obj + # pandas\core\base.py:195: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + if self._selection is None or isinstance( + self.obj, ABCSeries # type: ignore[attr-defined] + ): + # pandas\core\base.py:194: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + return self.obj # type: ignore[attr-defined] else: - return self.obj[self._selection] + # pandas\core\base.py:204: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + return self.obj[self._selection] # type: ignore[attr-defined] @cache_readonly def ndim(self) -> int: @@ -200,31 +225,58 @@ def ndim(self) -> int: @cache_readonly def _obj_with_exclusions(self): - if self._selection is not None and isinstance(self.obj, ABCDataFrame): - return self.obj.reindex(columns=self._selection_list) + # pandas\core\base.py:209: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + if self._selection is not None and isinstance( + self.obj, ABCDataFrame # type: ignore[attr-defined] + ): + # pandas\core\base.py:217: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + return self.obj.reindex( # type: ignore[attr-defined] + columns=self._selection_list + ) + + # pandas\core\base.py:207: error: "SelectionMixin" has no attribute + # "exclusions" [attr-defined] + if len(self.exclusions) > 0: # type: ignore[attr-defined] + # pandas\core\base.py:208: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] - if len(self.exclusions) > 0: - return self.obj.drop(self.exclusions, axis=1) + # pandas\core\base.py:208: error: "SelectionMixin" has no attribute + # "exclusions" [attr-defined] + return self.obj.drop(self.exclusions, axis=1) # type: ignore[attr-defined] else: - return self.obj + # pandas\core\base.py:210: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + return self.obj # type: ignore[attr-defined] def __getitem__(self, key): if self._selection is not None: raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): - if len(self.obj.columns.intersection(key)) != len(key): - bad_keys = list(set(key).difference(self.obj.columns)) + # pandas\core\base.py:217: error: "SelectionMixin" has no attribute + # "obj" [attr-defined] + if len( + self.obj.columns.intersection(key) # type: ignore[attr-defined] + ) != len(key): + # pandas\core\base.py:218: error: "SelectionMixin" has no + # attribute "obj" [attr-defined] + bad_keys = list( + set(key).difference(self.obj.columns) # type: ignore[attr-defined] + ) raise 
KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) elif not getattr(self, "as_index", False): - if key not in self.obj.columns: + # error: "SelectionMixin" has no attribute "obj" [attr-defined] + if key not in self.obj.columns: # type: ignore[attr-defined] raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=2) else: - if key not in self.obj: + # error: "SelectionMixin" has no attribute "obj" [attr-defined] + if key not in self.obj: # type: ignore[attr-defined] raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=1) @@ -236,7 +288,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : str / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -278,289 +330,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): f"'{arg}' is not a valid function for '{type(self).__name__}' object" ) - def _aggregate(self, arg, *args, **kwargs): - """ - provide an implementation for the aggregators - - Parameters - ---------- - arg : string, dict, function - *args : args to pass on to the function - **kwargs : kwargs to pass on to the function - - Returns - ------- - tuple of result, how - - Notes - ----- - how can be a string describe the required post-processing, or - None if not required - """ - is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) - - _axis = kwargs.pop("_axis", None) - if _axis is None: - _axis = getattr(self, "axis", 0) - - if isinstance(arg, str): - return self._try_aggregate_string_function(arg, *args, **kwargs), None - - if isinstance(arg, dict): - # aggregate based on the passed dict - if _axis != 0: # pragma: no cover - raise ValueError("Can only pass dict with axis=0") - - obj = self._selected_obj - - # if we have a dict of any non-scalars - # eg. 
{'A' : ['mean']}, normalize all to - # be list-likes - if any(is_aggregator(x) for x in arg.values()): - new_arg = {} - for k, v in arg.items(): - if not isinstance(v, (tuple, list, dict)): - new_arg[k] = [v] - else: - new_arg[k] = v - - # the keys must be in the columns - # for ndim=2, or renamers for ndim=1 - - # ok for now, but deprecated - # {'A': { 'ra': 'mean' }} - # {'A': { 'ra': ['mean'] }} - # {'ra': ['mean']} - - # not ok - # {'ra' : { 'A' : 'mean' }} - if isinstance(v, dict): - raise SpecificationError("nested renamer is not supported") - elif isinstance(obj, ABCSeries): - raise SpecificationError("nested renamer is not supported") - elif isinstance(obj, ABCDataFrame) and k not in obj.columns: - raise KeyError(f"Column '{k}' does not exist!") - - arg = new_arg - - else: - # deprecation of renaming keys - # GH 15931 - keys = list(arg.keys()) - if isinstance(obj, ABCDataFrame) and len( - obj.columns.intersection(keys) - ) != len(keys): - cols = sorted(set(keys) - set(obj.columns.intersection(keys))) - raise SpecificationError(f"Column(s) {cols} do not exist") - - from pandas.core.reshape.concat import concat - - def _agg_1dim(name, how, subset=None): - """ - aggregate a 1-dim with how - """ - colg = self._gotitem(name, ndim=1, subset=subset) - if colg.ndim != 1: - raise SpecificationError( - "nested dictionary is ambiguous in aggregation" - ) - return colg.aggregate(how) - - def _agg_2dim(how): - """ - aggregate a 2-dim with how - """ - colg = self._gotitem(self._selection, ndim=2, subset=obj) - return colg.aggregate(how) - - def _agg(arg, func): - """ - run the aggregations over the arg with func - return a dict - """ - result = {} - for fname, agg_how in arg.items(): - result[fname] = func(fname, agg_how) - return result - - # set the final keys - keys = list(arg.keys()) - result = {} - - if self._selection is not None: - - sl = set(self._selection_list) - - # we are a Series like object, - # but may have multiple aggregations - if len(sl) == 1: - - result = _agg( - arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how) - ) - - # we are selecting the same set as we are aggregating - elif not len(sl - set(keys)): - - result = _agg(arg, _agg_1dim) - - # we are a DataFrame, with possibly multiple aggregations - else: - - result = _agg(arg, _agg_2dim) - - # no selection - else: - - try: - result = _agg(arg, _agg_1dim) - except SpecificationError: - - # we are aggregating expecting all 1d-returns - # but we have 2d - result = _agg(arg, _agg_2dim) - - # combine results - - def is_any_series() -> bool: - # return a boolean if we have *any* nested series - return any(isinstance(r, ABCSeries) for r in result.values()) - - def is_any_frame() -> bool: - # return a boolean if we have *any* nested series - return any(isinstance(r, ABCDataFrame) for r in result.values()) - - if isinstance(result, list): - return concat(result, keys=keys, axis=1, sort=True), True - - elif is_any_frame(): - # we have a dict of DataFrames - # return a MI DataFrame - - keys_to_use = [k for k in keys if not result[k].empty] - # Have to check, if at least one DataFrame is not empty. 
- keys_to_use = keys_to_use if keys_to_use != [] else keys - return ( - concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1), - True, - ) - - elif isinstance(self, ABCSeries) and is_any_series(): - - # we have a dict of Series - # return a MI Series - try: - result = concat(result) - except TypeError as err: - # we want to give a nice error here if - # we have non-same sized objects, so - # we don't automatically broadcast - - raise ValueError( - "cannot perform both aggregation " - "and transformation operations " - "simultaneously" - ) from err - - return result, True - - # fall thru - from pandas import DataFrame, Series - - try: - result = DataFrame(result) - except ValueError: - - # we have a dict of scalars - result = Series(result, name=getattr(self, "name", None)) - - return result, True - elif is_list_like(arg): - # we require a list, but not an 'str' - return self._aggregate_multiple_funcs(arg, _axis=_axis), None - else: - result = None - - f = self._get_cython_func(arg) - if f and not args and not kwargs: - return getattr(self, f)(), None - - # caller can react - return result, True - - def _aggregate_multiple_funcs(self, arg, _axis): - from pandas.core.reshape.concat import concat - - if _axis != 0: - raise NotImplementedError("axis other than 0 is not supported") - - if self._selected_obj.ndim == 1: - obj = self._selected_obj - else: - obj = self._obj_with_exclusions - - results = [] - keys = [] - - # degenerate case - if obj.ndim == 1: - for a in arg: - colg = self._gotitem(obj.name, ndim=1, subset=obj) - try: - new_res = colg.aggregate(a) - - except TypeError: - pass - else: - results.append(new_res) - - # make sure we find a good name - name = com.get_callable_name(a) or a - keys.append(name) - - # multiples - else: - for index, col in enumerate(obj): - colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) - try: - new_res = colg.aggregate(arg) - except (TypeError, DataError): - pass - except ValueError as err: - # cannot aggregate - if "Must produce aggregated value" in str(err): - # raised directly in _aggregate_named - pass - elif "no results" in str(err): - # raised directly in _aggregate_multiple_funcs - pass - else: - raise - else: - results.append(new_res) - keys.append(col) - - # if we are empty - if not len(results): - raise ValueError("no results") - - try: - return concat(results, keys=keys, axis=1, sort=False) - except TypeError as err: - - # we are concatting non-NDFrame objects, - # e.g. 
a list of scalars - - from pandas import Series - - result = Series(results, index=keys, name=self.name) - if is_nested_object(result): - raise ValueError( - "cannot combine transform and aggregation operations" - ) from err - return result - - def _get_cython_func(self, arg: str) -> Optional[str]: + def _get_cython_func(self, arg: Callable) -> Optional[str]: """ if we define an internal function for this argument, return it """ @@ -574,38 +344,28 @@ def _is_builtin_func(self, arg): return self._builtin_table.get(arg, arg) -class ShallowMixin: - _attributes: List[str] = [] - - def _shallow_copy(self, obj, **kwargs): - """ - return a new object with the replacement attributes - """ - if isinstance(obj, self._constructor): - obj = obj.obj - for attr in self._attributes: - if attr not in kwargs: - kwargs[attr] = getattr(self, attr) - return self._constructor(obj, **kwargs) - - -class IndexOpsMixin: +class IndexOpsMixin(OpsMixin): """ Common ops mixin to support a unified interface / docs for Series / Index """ # ndarray compatibility __array_priority__ = 1000 - _deprecations: FrozenSet[str] = frozenset( + _hidden_attrs: FrozenSet[str] = frozenset( ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) + @property + def dtype(self) -> DtypeObj: + # must be defined here as a property for mypy + raise AbstractMethodError(self) + @property def _values(self) -> Union[ExtensionArray, np.ndarray]: # must be defined here as a property for mypy raise AbstractMethodError(self) - def transpose(self, *args, **kwargs): + def transpose(self: _T, *args, **kwargs) -> _T: """ Return the transpose, which is by definition self. @@ -643,7 +403,7 @@ def ndim(self) -> int: def item(self): """ - Return the first element of the underlying data as a python scalar. + Return the first element of the underlying data as a Python scalar. Returns ------- @@ -836,7 +596,11 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): dtype='datetime64[ns]') """ if is_extension_array_dtype(self.dtype): - return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + # pandas\core\base.py:837: error: Too many arguments for "to_numpy" + # of "ExtensionArray" [call-arg] + return self.array.to_numpy( # type: ignore[call-arg] + dtype, copy=copy, na_value=na_value, **kwargs + ) elif kwargs: bad_keys = list(kwargs.keys())[0] raise TypeError( @@ -852,10 +616,10 @@ def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): return result @property - def empty(self): + def empty(self) -> bool: return not self.size - def max(self, axis=None, skipna=True, *args, **kwargs): + def max(self, axis=None, skipna: bool = True, *args, **kwargs): """ Return the maximum value of the Index. @@ -900,7 +664,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): return nanops.nanmax(self._values, skipna=skipna) @doc(op="max", oppose="min", value="largest") - def argmax(self, axis=None, skipna=True, *args, **kwargs): + def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: """ Return int position of the {value} value in the Series. @@ -955,7 +719,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): nv.validate_argmax_with_skipna(skipna, args, kwargs) return nanops.nanargmax(self._values, skipna=skipna) - def min(self, axis=None, skipna=True, *args, **kwargs): + def min(self, axis=None, skipna: bool = True, *args, **kwargs): """ Return the minimum value of the Index. 
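# --- [Editor's sketch, not part of the patch] -------------------------------
# The ``-> int`` annotation added to argmax above (and to argmin in the next
# hunk) reflects that both return an integer position; a minimal runnable
# example using only pandas and numpy:
import numpy as np
import pandas as pd

s = pd.Series([1, 3, np.nan, 2])
assert s.argmax() == 1  # position of the largest value; NaN skipped by default
assert s.argmin() == 0  # position of the smallest value
# -----------------------------------------------------------------------------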
@@ -1000,7 +764,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): return nanops.nanmin(self._values, skipna=skipna) @doc(argmax, op="min", oppose="max", value="smallest") - def argmin(self, axis=None, skipna=True, *args, **kwargs): + def argmin(self, axis=None, skipna=True, *args, **kwargs) -> int: nv.validate_minmax_axis(axis) nv.validate_argmax_with_skipna(skipna, args, kwargs) return nanops.nanargmin(self._values, skipna=skipna) @@ -1055,10 +819,14 @@ def hasnans(self): """ return bool(isna(self).any()) + def isna(self): + return isna(self._values) + def _reduce( self, op, name: str, + *, axis=0, skipna=True, numeric_only=None, @@ -1124,7 +892,15 @@ def _map_values(self, mapper, na_action=None): if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values - return self._values.map(mapper) + + # pandas\core\base.py:893: error: Incompatible types in + # assignment (expression has type "Categorical", variable has + # type "IndexOpsMixin") [assignment] + self = cast("Categorical", self) # type: ignore[assignment] + # pandas\core\base.py:894: error: Item "ExtensionArray" of + # "Union[ExtensionArray, Any]" has no attribute "map" + # [union-attr] + return self._values.map(mapper) # type: ignore[union-attr] values = self._values @@ -1141,12 +917,13 @@ def _map_values(self, mapper, na_action=None): raise NotImplementedError map_f = lambda values, f: values.map(f) else: - values = self.astype(object)._values + # pandas\core\base.py:1142: error: "IndexOpsMixin" has no attribute + # "astype" [attr-defined] + values = self.astype(object)._values # type: ignore[attr-defined] if na_action == "ignore": - - def map_f(values, f): - return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) - + map_f = lambda values, f: lib.map_infer_mask( + values, f, isna(values).view(np.uint8) + ) elif na_action is None: map_f = lib.map_infer else: @@ -1162,7 +939,12 @@ def map_f(values, f): return new_values def value_counts( - self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, ): """ Return a Series containing counts of unique values. @@ -1201,8 +983,8 @@ def value_counts( >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 - 4.0 1 2.0 1 + 4.0 1 1.0 1 dtype: int64 @@ -1212,8 +994,8 @@ def value_counts( >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 - 4.0 0.2 2.0 0.2 + 4.0 0.2 1.0 0.2 dtype: float64 @@ -1225,8 +1007,8 @@ def value_counts( number of half-open bins. >>> s.value_counts(bins=3) - (2.0, 3.0] 2 (0.996, 2.0] 2 + (2.0, 3.0] 2 (3.0, 4.0] 1 dtype: int64 @@ -1236,9 +1018,9 @@ def value_counts( >>> s.value_counts(dropna=False) 3.0 2 + 2.0 1 NaN 1 4.0 1 - 2.0 1 1.0 1 dtype: int64 """ @@ -1300,11 +1082,8 @@ def nunique(self, dropna: bool = True) -> int: >>> s.nunique() 4 """ - uniqs = self.unique() - n = len(uniqs) - if dropna and isna(uniqs).any(): - n -= 1 - return n + obj = remove_na_arraylike(self) if dropna else self + return len(obj.unique()) @property def is_unique(self) -> bool: @@ -1359,7 +1138,7 @@ def memory_usage(self, deep=False): Parameters ---------- - deep : bool + deep : bool, default False Introspect the data deeply, interrogate `object` dtypes for system-level memory consumption. 
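# --- [Editor's sketch, not part of the patch] -------------------------------
# The nunique rewrite above drops NA values first and then counts uniques,
# instead of subtracting one afterwards; the dropna flag behaves as before:
import numpy as np
import pandas as pd

s = pd.Series([1, 2, 2, np.nan])
assert s.nunique() == 2              # NaN excluded by default
assert s.nunique(dropna=False) == 3  # NaN counted as a distinct value
# -----------------------------------------------------------------------------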
@@ -1378,7 +1157,9 @@ def memory_usage(self, deep=False): are not components of the array if deep=False or if used on PyPy """ if hasattr(self.array, "memory_usage"): - return self.array.memory_usage(deep=deep) + # pandas\core\base.py:1379: error: "ExtensionArray" has no + # attribute "memory_usage" [attr-defined] + return self.array.memory_usage(deep=deep) # type: ignore[attr-defined] v = self.array.nbytes if deep and is_object_dtype(self) and not PYPY: @@ -1398,7 +1179,7 @@ def memory_usage(self, deep=False): """ ), ) - def factorize(self, sort=False, na_sentinel=-1): + def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs[ @@ -1469,6 +1250,16 @@ def factorize(self, sort=False, na_sentinel=-1): >>> ser.searchsorted([1, 3], side='right') array([1, 3]) + >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'])) + >>> ser + 0 2000-03-11 + 1 2000-03-12 + 2 2000-03-13 + dtype: datetime64[ns] + + >>> ser.searchsorted('3/14/2000') + 3 + >>> ser = pd.Categorical( ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True ... ) @@ -1501,20 +1292,11 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) def drop_duplicates(self, keep="first"): - if isinstance(self, ABCIndexClass): - if self.is_unique: - return self._shallow_copy() - duplicated = self.duplicated(keep=keep) - result = self[np.logical_not(duplicated)] + # pandas\core\base.py:1507: error: Value of type "IndexOpsMixin" is not + # indexable [index] + result = self[np.logical_not(duplicated)] # type: ignore[index] return result def duplicated(self, keep="first"): - if isinstance(self, ABCIndexClass): - if self.is_unique: - return np.zeros(len(self), dtype=bool) - return duplicated(self, keep=keep) - else: - return self._constructor( - duplicated(self, keep=keep), index=self.index - ).__finalize__(self, method="duplicated") + return duplicated(self._values, keep=keep) diff --git a/pandas/core/common.py b/pandas/core/common.py index e7260a9923ee0..cdcbc43055052 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -6,17 +6,16 @@ from collections import abc, defaultdict import contextlib -from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Collection, Iterable, Iterator, List, Union +from typing import Any, Collection, Iterable, Iterator, List, Union, cast import warnings import numpy as np -from pandas._libs import lib, tslibs +from pandas._libs import lib from pandas._typing import AnyArrayLike, Scalar, T -from pandas.compat.numpy import _np_version_under1p18 +from pandas.compat.numpy import np_version_under1p18 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -25,13 +24,8 @@ is_extension_array_dtype, is_integer, ) -from pandas.core.dtypes.generic import ( - ABCExtensionArray, - ABCIndex, - ABCIndexClass, - ABCSeries, -) -from pandas.core.dtypes.inference import _iterable_not_string +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -43,13 +37,13 @@ class SettingWithCopyWarning(Warning): pass -def flatten(l): +def flatten(line): """ Flatten an arbitrarily nested sequence. 
Parameters ---------- - l : sequence + line : sequence The non string sequence to flatten Notes @@ -60,12 +54,11 @@ def flatten(l): ------- flattened : generator """ - for el in l: - if _iterable_not_string(el): - for s in flatten(el): - yield s + for element in line: + if iterable_not_string(element): + yield from flatten(element) else: - yield el + yield element def consensus_name_attr(objs): @@ -79,21 +72,6 @@ def consensus_name_attr(objs): return name -def maybe_box_datetimelike(value, dtype=None): - # turn a datetime like into a Timestamp/timedelta as needed - if dtype == object: - # If we dont have datetime64/timedelta64 dtype, we dont want to - # box datetimelike scalars - return value - - if isinstance(value, (np.datetime64, datetime)): - value = tslibs.Timestamp(value) - elif isinstance(value, (np.timedelta64, timedelta)): - value = tslibs.Timedelta(value) - - return value - - def is_bool_indexer(key: Any) -> bool: """ Check whether `key` is a valid boolean indexer. @@ -122,7 +100,7 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( + if isinstance(key, (ABCSeries, np.ndarray, ABCIndexClass)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): if key.dtype == np.object_: @@ -130,7 +108,9 @@ def is_bool_indexer(key: Any) -> bool: if not lib.is_bool_array(key): na_msg = "Cannot mask with non-boolean array containing NA / NaN values" - if isna(key).any(): + if lib.infer_dtype(key) == "boolean" and isna(key).any(): + # Don't raise on e.g. ["A", "B", np.nan], see + # test_loc_getitem_list_of_labels_categoricalindex_with_na raise ValueError(na_msg) return False return True @@ -277,6 +257,11 @@ def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T """ if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized): return list(obj) + # error: Incompatible return value type (got + # "Union[pandas.core.common., + # pandas.core.common.1, T]", expected + # "Union[Collection[T], T]") [return-value] + obj = cast(Collection, obj) return obj @@ -292,20 +277,23 @@ def is_null_slice(obj) -> bool: ) -def is_true_slices(l): +def is_true_slices(line): """ - Find non-trivial slices in "l": return a list of booleans with same length. + Find non-trivial slices in "line": return a list of booleans with same length. """ - return [isinstance(k, slice) and not is_null_slice(k) for k in l] + return [isinstance(k, slice) and not is_null_slice(k) for k in line] # TODO: used only once in indexing; belongs elsewhere? -def is_full_slice(obj, l) -> bool: +def is_full_slice(obj, line) -> bool: """ We have a full length slice. """ return ( - isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None + isinstance(obj, slice) + and obj.start == 0 + and obj.stop == line + and obj.step is None ) @@ -343,23 +331,6 @@ def apply_if_callable(maybe_callable, obj, **kwargs): return maybe_callable -def dict_compat(d): - """ - Helper function to convert datetimelike-keyed dicts - to Timestamp-keyed dict. - - Parameters - ---------- - d: dict like object - - Returns - ------- - dict - - """ - return {maybe_box_datetimelike(key): value for key, value in d.items()} - - def standardize_mapping(into): """ Helper function to standardize a supplied mapping. 
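# --- [Editor's sketch, not part of the patch] -------------------------------
# A self-contained equivalent of the renamed flatten(line) generator above;
# iterable_not_string here stands in for the helper of the same name in
# pandas.core.dtypes.inference (strings are treated as atoms, not flattened):
from collections import abc

def iterable_not_string(obj) -> bool:
    return isinstance(obj, abc.Iterable) and not isinstance(obj, str)

def flatten(line):
    for element in line:
        if iterable_not_string(element):
            yield from flatten(element)
        else:
            yield element

assert list(flatten([1, [2, ("ab", [3])]])) == [1, 2, "ab", 3]
# -----------------------------------------------------------------------------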
@@ -420,7 +391,7 @@ def random_state(state=None): if ( is_integer(state) or is_array_like(state) - or (not _np_version_under1p18 and isinstance(state, np.random.BitGenerator)) + or (not np_version_under1p18 and isinstance(state, np.random.BitGenerator)) ): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): @@ -429,10 +400,8 @@ def random_state(state=None): return np.random else: raise ValueError( - ( - "random_state must be an integer, array-like, a BitGenerator, " - "a numpy RandomState, or None" - ) + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" ) @@ -497,8 +466,11 @@ def convert_to_list_like( Convert list-like or scalar input to list-like. List, numpy and pandas array-like inputs are returned unmodified whereas others are converted to list. """ - if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)): - return values + if isinstance( + values, (list, np.ndarray, ABCIndexClass, ABCSeries, ABCExtensionArray) + ): + # np.ndarray resolving as Any gives a false positive + return values # type: ignore[return-value] elif isinstance(values, abc.Iterable) and not isinstance(values, str): return list(values) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 82867cf9dcd29..5ad3e78a76866 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -1,9 +1,10 @@ """ Core eval alignment algorithms. """ +from __future__ import annotations from functools import partial, wraps -from typing import Dict, Optional, Sequence, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np @@ -17,13 +18,16 @@ import pandas.core.common as com from pandas.core.computation.common import result_type_many +if TYPE_CHECKING: + from pandas.core.indexes.api import Index + def _align_core_single_unary_op( term, -) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, Index]]]: typ: Union[partial, Type[FrameOrSeries]] - axes: Optional[Dict[str, int]] = None + axes: Optional[Dict[str, Index]] = None if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) @@ -36,10 +40,9 @@ def _align_core_single_unary_op( def _zip_axes_from_type( - typ: Type[FrameOrSeries], new_axes: Sequence[int] -) -> Dict[str, int]: - axes = {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} - return axes + typ: Type[FrameOrSeries], new_axes: Sequence[Index] +) -> Dict[str, Index]: + return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} def _any_pandas_objects(terms) -> bool: @@ -186,8 +189,11 @@ def reconstruct_object(typ, obj, axes, dtype): # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. 
np.array(0) and np.array([0]) - if len(obj.shape) == 1 and len(obj) == 1: - if not isinstance(ret_value, np.ndarray): - ret_value = np.array([ret_value]).astype(res_t) + if ( + len(obj.shape) == 1 + and len(obj) == 1 + and not isinstance(ret_value, np.ndarray) + ): + ret_value = np.array([ret_value]).astype(res_t) return ret_value diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py index 4d205909b9e2e..6c7261b3b33c9 100644 --- a/pandas/core/computation/check.py +++ b/pandas/core/computation/check.py @@ -1,10 +1,10 @@ from pandas.compat._optional import import_optional_dependency ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") -_NUMEXPR_INSTALLED = ne is not None -if _NUMEXPR_INSTALLED: - _NUMEXPR_VERSION = ne.__version__ +NUMEXPR_INSTALLED = ne is not None +if NUMEXPR_INSTALLED: + NUMEXPR_VERSION = ne.__version__ else: - _NUMEXPR_VERSION = None + NUMEXPR_VERSION = None -__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] +__all__ = ["NUMEXPR_INSTALLED", "NUMEXPR_VERSION"] diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 327ec21c3c11c..8a9583c465f50 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -5,7 +5,7 @@ from pandas._config import get_option -def _ensure_decoded(s): +def ensure_decoded(s): """ If we have bytes, decode them to unicode. """ diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 9c5388faae1bd..77a378369ca34 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -6,11 +6,11 @@ from typing import Dict, Type from pandas.core.computation.align import align_terms, reconstruct_object -from pandas.core.computation.ops import _mathops, _reductions +from pandas.core.computation.ops import MATHOPS, REDUCTIONS import pandas.io.formats.printing as printing -_ne_builtins = frozenset(_mathops + _reductions) +_ne_builtins = frozenset(MATHOPS + REDUCTIONS) class NumExprClobberingError(NameError): @@ -130,7 +130,7 @@ def _evaluate(self) -> None: pass -_engines: Dict[str, Type[AbstractEngine]] = { +ENGINES: Dict[str, Type[AbstractEngine]] = { "numexpr": NumExprEngine, "python": PythonEngine, } diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index b74f99fca21c7..12f16343362e2 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -9,8 +9,8 @@ from pandas._libs.lib import no_default from pandas.util._validators import validate_bool_kwarg -from pandas.core.computation.engines import _engines -from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.engines import ENGINES +from pandas.core.computation.expr import PARSERS, Expr from pandas.core.computation.parsing import tokenize_string from pandas.core.computation.scope import ensure_scope @@ -38,13 +38,13 @@ def _check_engine(engine: Optional[str]) -> str: str Engine name. 
""" - from pandas.core.computation.check import _NUMEXPR_INSTALLED + from pandas.core.computation.check import NUMEXPR_INSTALLED if engine is None: - engine = "numexpr" if _NUMEXPR_INSTALLED else "python" + engine = "numexpr" if NUMEXPR_INSTALLED else "python" - if engine not in _engines: - valid_engines = list(_engines.keys()) + if engine not in ENGINES: + valid_engines = list(ENGINES.keys()) raise KeyError( f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" ) @@ -52,12 +52,11 @@ def _check_engine(engine: Optional[str]) -> str: # TODO: validate this in a more general way (thinking of future engines # that won't necessarily be import-able) # Could potentially be done on engine instantiation - if engine == "numexpr": - if not _NUMEXPR_INSTALLED: - raise ImportError( - "'numexpr' is not installed or an unsupported version. Cannot use " - "engine='numexpr' for query/eval if 'numexpr' is not installed" - ) + if engine == "numexpr" and not NUMEXPR_INSTALLED: + raise ImportError( + "'numexpr' is not installed or an unsupported version. Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" + ) return engine @@ -75,9 +74,9 @@ def _check_parser(parser: str): KeyError * If an invalid parser is passed """ - if parser not in _parsers: + if parser not in PARSERS: raise KeyError( - f"Invalid parser '{parser}' passed, valid parsers are {_parsers.keys()}" + f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" ) @@ -212,7 +211,8 @@ def eval( truediv : bool, optional Whether to use true division, like in Python >= 3. - deprecated:: 1.0.0 + + .. deprecated:: 1.0.0 local_dict : dict or None, optional A dictionary of local variables, taken from locals() by default. @@ -241,7 +241,8 @@ def eval( Returns ------- - ndarray, numeric scalar, DataFrame, Series + ndarray, numeric scalar, DataFrame, Series, or None + The completion value of evaluating the given code or None if ``inplace=True``. Raises ------ @@ -341,7 +342,7 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) # construct the engine and evaluate the parsed expression - eng = _engines[engine] + eng = ENGINES[engine] eng_inst = eng(parsed_expr) ret = eng_inst.evaluate() diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index fcccc24ed7615..88a25ad9996a0 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -10,9 +10,17 @@ import numpy as np +from pandas.compat import PY39 + import pandas.core.common as com from pandas.core.computation.ops import ( - _LOCAL_TAG, + ARITH_OPS_SYMS, + BOOL_OPS_SYMS, + CMP_OPS_SYMS, + LOCAL_TAG, + MATHOPS, + REDUCTIONS, + UNARY_OPS_SYMS, BinOp, Constant, Div, @@ -21,12 +29,6 @@ Term, UnaryOp, UndefinedVariableError, - _arith_ops_syms, - _bool_ops_syms, - _cmp_ops_syms, - _mathops, - _reductions, - _unary_ops_syms, is_term, ) from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string @@ -101,7 +103,7 @@ def _replace_locals(tok: Tuple[int, str]) -> Tuple[int, str]: """ toknum, tokval = tok if toknum == tokenize.OP and tokval == "@": - return tokenize.OP, _LOCAL_TAG + return tokenize.OP, LOCAL_TAG return toknum, tokval @@ -151,7 +153,7 @@ def _preparse( the ``tokenize`` module and ``tokval`` is a string. 
""" assert callable(f), "f must be callable" - return tokenize.untokenize((f(x) for x in tokenize_string(source))) + return tokenize.untokenize(f(x) for x in tokenize_string(source)) def _is_type(t): @@ -167,10 +169,9 @@ def _is_type(t): # partition all AST nodes _all_nodes = frozenset( - filter( - lambda x: isinstance(x, type) and issubclass(x, ast.AST), - (getattr(ast, node) for node in dir(ast)), - ) + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) ) @@ -187,7 +188,6 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _stmt_nodes = _filter_nodes(ast.stmt) _expr_nodes = _filter_nodes(ast.expr) _expr_context_nodes = _filter_nodes(ast.expr_context) -_slice_nodes = _filter_nodes(ast.slice) _boolop_nodes = _filter_nodes(ast.boolop) _operator_nodes = _filter_nodes(ast.operator) _unary_op_nodes = _filter_nodes(ast.unaryop) @@ -198,6 +198,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) +if not PY39: + _slice_nodes = _filter_nodes(ast.slice) + # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) @@ -339,7 +342,7 @@ class BaseExprVisitor(ast.NodeVisitor): const_type: Type[Term] = Constant term_type = Term - binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS binary_op_nodes = ( "Gt", "Lt", @@ -363,9 +366,9 @@ class BaseExprVisitor(ast.NodeVisitor): ) binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) - unary_ops = _unary_ops_syms + unary_ops = UNARY_OPS_SYMS unary_op_nodes = "UAdd", "USub", "Invert", "Not" - unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + unary_op_nodes_map = {k: v for k, v in zip(unary_ops, unary_op_nodes)} rewrite_map = { ast.Eq: ast.In, @@ -493,15 +496,14 @@ def _maybe_evaluate_binop( f"'{lhs.type}' and '{rhs.type}'" ) - if self.engine != "pytables": - if ( - res.op in _cmp_ops_syms - and getattr(lhs, "is_datetime", False) - or getattr(rhs, "is_datetime", False) - ): - # all date ops must be done in python bc numexpr doesn't work - # well with NaT - return self._maybe_eval(res, self.binary_ops) + if self.engine != "pytables" and ( + res.op in CMP_OPS_SYMS + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python @@ -657,7 +659,11 @@ def visit_Call(self, node, side=None, **kwargs): raise if res is None: - raise ValueError(f"Invalid function call {node.func.id}") + # pandas\core\computation\expr.py:663: error: "expr" has no + # attribute "id" [attr-defined] + raise ValueError( + f"Invalid function call {node.func.id}" # type: ignore[attr-defined] + ) if hasattr(res, "value"): res = res.value @@ -678,7 +684,12 @@ def visit_Call(self, node, side=None, **kwargs): for key in node.keywords: if not isinstance(key, ast.keyword): - raise ValueError(f"keyword error in function call '{node.func.id}'") + # pandas\core\computation\expr.py:684: error: "expr" has no + # attribute "id" [attr-defined] + raise ValueError( + "keyword error in function call " # type: ignore[attr-defined] + f"'{node.func.id}'" + ) if key.arg: kwargs[key.arg] = self.visit(key.value).value @@ -727,7 +738,7 @@ def visitor(x, y): _python_not_supported = 
frozenset(["Dict", "BoolOp", "In", "NotIn"]) -_numexpr_supported_calls = frozenset(_reductions + _mathops) +_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS) @disallow( @@ -783,7 +794,7 @@ def __init__( self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser - self._visitor = _parsers[parser](self.env, self.engine, self.parser) + self._visitor = PARSERS[parser](self.env, self.engine, self.parser) self.terms = self.parse() @property @@ -815,4 +826,4 @@ def names(self): return frozenset(term.name for term in com.flatten(self.terms)) -_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} +PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 0e9077e6d557e..e5ede3cd885be 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -6,6 +6,7 @@ """ import operator +from typing import List, Set import warnings import numpy as np @@ -14,15 +15,15 @@ from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.computation.check import _NUMEXPR_INSTALLED +from pandas.core.computation.check import NUMEXPR_INSTALLED from pandas.core.ops import roperator -if _NUMEXPR_INSTALLED: +if NUMEXPR_INSTALLED: import numexpr as ne _TEST_MODE = None -_TEST_RESULT = None -_USE_NUMEXPR = _NUMEXPR_INSTALLED +_TEST_RESULT: List[bool] = [] +USE_NUMEXPR = NUMEXPR_INSTALLED _evaluate = None _where = None @@ -38,21 +39,21 @@ def set_use_numexpr(v=True): # set/unset to use numexpr - global _USE_NUMEXPR - if _NUMEXPR_INSTALLED: - _USE_NUMEXPR = v + global USE_NUMEXPR + if NUMEXPR_INSTALLED: + USE_NUMEXPR = v # choose what we are going to do global _evaluate, _where - _evaluate = _evaluate_numexpr if _USE_NUMEXPR else _evaluate_standard - _where = _where_numexpr if _USE_NUMEXPR else _where_standard + _evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard + _where = _where_numexpr if USE_NUMEXPR else _where_standard def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if NUMEXPR_INSTALLED and USE_NUMEXPR: if n is None: n = ne.detect_number_of_cores() ne.set_num_threads(n) @@ -75,7 +76,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatibility - dtypes = set() + dtypes: Set[str] = set() for o in [a, b]: # Series implements dtypes, check for dimension count as well if hasattr(o, "dtypes") and o.ndim > 1: @@ -132,7 +133,10 @@ def _evaluate_numexpr(op, op_str, a, b): roperator.rtruediv: "/", operator.floordiv: "//", roperator.rfloordiv: "//", - operator.mod: "%", + # we require Python semantics for mod of negative for backwards compatibility + # see https://github.com/pydata/numexpr/issues/365 + # so sticking with unaccelerated for now + operator.mod: None, roperator.rmod: "%", operator.pow: "**", roperator.rpow: "**", @@ -227,7 +231,8 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b) # type: ignore + # error: "None" not callable + return _evaluate(op, op_str, a, b) # type: ignore[misc] return _evaluate_standard(op, op_str, a, b) @@ -243,28 +248,32 @@ def where(cond, a, b, use_numexpr=True): use_numexpr : bool, default True Whether to try to use 
numexpr. """ + assert _where is not None return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) -def set_test_mode(v=True): +def set_test_mode(v: bool = True) -> None: """ - Keeps track of whether numexpr was used. Stores an additional ``True`` - for every successful use of evaluate with numexpr since the last - ``get_test_result`` + Keeps track of whether numexpr was used. + + Stores an additional ``True`` for every successful use of evaluate with + numexpr since the last ``get_test_result``. """ global _TEST_MODE, _TEST_RESULT _TEST_MODE = v _TEST_RESULT = [] -def _store_test_result(used_numexpr): +def _store_test_result(used_numexpr: bool) -> None: global _TEST_RESULT if used_numexpr: _TEST_RESULT.append(used_numexpr) -def get_test_result(): - """get test result and reset test_results""" +def get_test_result() -> List[bool]: + """ + Get test result and reset test_results. + """ global _TEST_RESULT res = _TEST_RESULT _TEST_RESULT = [] diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index bc9ff7c44b689..74bee80c6c8a6 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -15,12 +15,12 @@ from pandas.core.dtypes.common import is_list_like, is_scalar import pandas.core.common as com -from pandas.core.computation.common import _ensure_decoded, result_type_many -from pandas.core.computation.scope import _DEFAULT_GLOBALS +from pandas.core.computation.common import ensure_decoded, result_type_many +from pandas.core.computation.scope import DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -_reductions = ("sum", "prod") +REDUCTIONS = ("sum", "prod") _unary_math_ops = ( "sin", @@ -46,10 +46,10 @@ ) _binary_math_ops = ("arctan2",) -_mathops = _unary_math_ops + _binary_math_ops +MATHOPS = _unary_math_ops + _binary_math_ops -_LOCAL_TAG = "__pd_eval_local_" +LOCAL_TAG = "__pd_eval_local_" class UndefinedVariableError(NameError): @@ -69,7 +69,9 @@ def __init__(self, name: str, is_local: Optional[bool] = None): class Term: def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls - supr_new = super(Term, klass).__new__ + # pandas\core\computation\ops.py:72: error: Argument 2 for "super" not + # an instance of argument 1 [misc] + supr_new = super(Term, klass).__new__ # type: ignore[misc] return supr_new(klass) is_local: bool @@ -80,13 +82,13 @@ def __init__(self, name, env, side=None, encoding=None): self.env = env self.side = side tname = str(name) - self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS + self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS self._value = self._resolve_name() self.encoding = encoding @property def local_name(self) -> str: - return self.name.replace(_LOCAL_TAG, "") + return self.name.replace(LOCAL_TAG, "") def __repr__(self) -> str: return pprint_thing(self.name) @@ -220,7 +222,7 @@ def __repr__(self) -> str: @property def return_type(self): # clobber types to bool if the op is a boolean operator - if self.op in (_cmp_ops_syms + _bool_ops_syms): + if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): return np.bool_ return result_type_many(*(term.type for term in com.flatten(self))) @@ -280,7 +282,7 @@ def _not_in(x, y): return x not in y -_cmp_ops_syms = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") _cmp_ops_funcs = ( operator.gt, operator.lt, @@ -291,13 +293,13 @@ def _not_in(x, y): _in, _not_in, ) 
-_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) +_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) -_bool_ops_syms = ("&", "|", "and", "or") +BOOL_OPS_SYMS = ("&", "|", "and", "or") _bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) -_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) +_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) -_arith_ops_syms = ("+", "-", "*", "/", "**", "//", "%") +ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") _arith_ops_funcs = ( operator.add, operator.sub, @@ -307,12 +309,12 @@ def _not_in(x, y): operator.floordiv, operator.mod, ) -_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) +_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) -_special_case_arith_ops_syms = ("**", "//", "%") +SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") _special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) _special_case_arith_ops_dict = dict( - zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs) + zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) ) _binary_ops_dict = {} @@ -466,7 +468,7 @@ def stringify(value): v = rhs.value if isinstance(v, (int, float)): v = stringify(v) - v = Timestamp(_ensure_decoded(v)) + v = Timestamp(ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.rhs.update(v) @@ -475,19 +477,27 @@ def stringify(value): v = lhs.value if isinstance(v, (int, float)): v = stringify(v) - v = Timestamp(_ensure_decoded(v)) + v = Timestamp(ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) if ( - (self.lhs.is_scalar or self.rhs.is_scalar) + (lhs.is_scalar or rhs.is_scalar) and self.op in _bool_ops_dict and ( not ( - issubclass(self.rhs.return_type, (bool, np.bool_)) - and issubclass(self.lhs.return_type, (bool, np.bool_)) + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) ) ) ): @@ -522,9 +532,9 @@ def __init__(self, lhs, rhs): _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) -_unary_ops_syms = ("+", "-", "~", "not") +UNARY_OPS_SYMS = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) -_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) +_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) class UnaryOp(Op): @@ -553,7 +563,7 @@ def __init__(self, op: str, operand): except KeyError as err: raise ValueError( f"Invalid unary operator {repr(op)}, " - f"valid operators are {_unary_ops_syms}" + f"valid operators are {UNARY_OPS_SYMS}" ) from err def __call__(self, env): @@ -581,7 +591,8 @@ def __init__(self, func, args): self.func = func def __call__(self, env): - operands = [op(env) for op in self.operands] + # pandas\core\computation\ops.py:592: error: "Op" not callable [operator] + operands = [op(env) for op in self.operands] # type: ignore[operator] with np.errstate(all="ignore"): return self.func.func(*operands) @@ -592,11 +603,11 @@ def __repr__(self) -> str: class FuncNode: def __init__(self, name: str): - from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION + from pandas.core.computation.check import NUMEXPR_INSTALLED, NUMEXPR_VERSION - if name not in _mathops or 
( - _NUMEXPR_INSTALLED - and _NUMEXPR_VERSION < LooseVersion("2.6.9") + if name not in MATHOPS or ( + NUMEXPR_INSTALLED + and NUMEXPR_VERSION < LooseVersion("2.6.9") and name in ("floor", "ceil") ): raise ValueError(f'"{name}" is not a supported function') diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index c7c7103654a65..a1bebc92046ae 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -8,6 +8,8 @@ import tokenize from typing import Iterator, Tuple +from pandas._typing import Label + # A token value Python's tokenizer probably will never use. BACKTICK_QUOTED_STRING = 100 @@ -37,7 +39,9 @@ def create_valid_python_identifier(name: str) -> str: special_characters_replacements = { char: f"_{token.tok_name[tokval]}_" # The ignore here is because of a bug in mypy that is resolved in 0.740 - for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + for char, tokval in ( + tokenize.EXACT_TOKEN_TYPES.items() # type: ignore[attr-defined] + ) } special_characters_replacements.update( { @@ -89,7 +93,7 @@ def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: return toknum, tokval -def clean_column_name(name: str) -> str: +def clean_column_name(name: "Label") -> "Label": """ Function to emulate the cleaning of a backtick quoted name. @@ -100,12 +104,12 @@ def clean_column_name(name: str) -> str: Parameters ---------- - name : str + name : hashable Name to be cleaned. Returns ------- - name : str + name : hashable Returns the name after tokenizing and cleaning. Notes diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 001eb1789007f..b819886687817 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -14,7 +14,7 @@ import pandas as pd import pandas.core.common as com from pandas.core.computation import expr, ops, scope as _scope -from pandas.core.computation.common import _ensure_decoded +from pandas.core.computation.common import ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term from pandas.core.construction import extract_array @@ -35,14 +35,17 @@ def __init__( queryables: Optional[Dict[str, Any]] = None, ): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) - self.queryables = queryables or dict() + self.queryables = queryables or {} class Term(ops.Term): env: PyTablesScope def __new__(cls, name, env, side=None, encoding=None): - klass = Constant if not isinstance(name, str) else cls + if isinstance(name, str): + klass = cls + else: + klass = Constant return object.__new__(klass) def __init__(self, name, env: PyTablesScope, side=None, encoding=None): @@ -63,7 +66,7 @@ def _resolve_name(self): return self.name # read-only property overwriting read/write property - @property # type: ignore + @property # type: ignore[misc] def value(self): return self._value @@ -83,6 +86,7 @@ class BinOp(ops.BinOp): op: str queryables: Dict[str, Any] + condition: Optional[str] def __init__(self, op: str, lhs, rhs, queryables: Dict[str, Any], encoding): super().__init__(op, lhs, rhs) @@ -184,17 +188,15 @@ def convert_value(self, v) -> "TermValue": def stringify(value): if self.encoding is not None: - encoder = partial(pprint_thing_encoded, encoding=self.encoding) - else: - encoder = pprint_thing - return encoder(value) + return pprint_thing_encoded(value, encoding=self.encoding) + return pprint_thing(value) - kind = 
_ensure_decoded(self.kind) - meta = _ensure_decoded(self.meta) + kind = ensure_decoded(self.kind) + meta = ensure_decoded(self.meta) if kind == "datetime64" or kind == "datetime": if isinstance(v, (int, float)): v = stringify(v) - v = _ensure_decoded(v) + v = ensure_decoded(v) v = Timestamp(v) if v.tz is not None: v = v.tz_convert("UTC") @@ -257,9 +259,11 @@ def __repr__(self) -> str: def invert(self): """ invert the filter """ if self.filter is not None: - f = list(self.filter) - f[1] = self.generate_filter_op(invert=True) - self.filter = tuple(f) + self.filter = ( + self.filter[0], + self.generate_filter_op(invert=True), + self.filter[2], + ) return self def format(self): @@ -374,14 +378,14 @@ def prune(self, klass): operand = self.operand operand = operand.prune(klass) - if operand is not None: - if issubclass(klass, ConditionBinOp): - if operand.condition is not None: - return operand.invert() - elif issubclass(klass, FilterBinOp): - if operand.filter is not None: - return operand.invert() - + if operand is not None and ( + issubclass(klass, ConditionBinOp) + and operand.condition is not None + or not issubclass(klass, ConditionBinOp) + and issubclass(klass, FilterBinOp) + and operand.filter is not None + ): + return operand.invert() return None @@ -426,6 +430,10 @@ def visit_Subscript(self, node, **kwargs): except AttributeError: pass + if isinstance(slobj, Term): + # In py39 np.ndarray lookups with Term containing int raise + slobj = slobj.value + try: return self.const_type(value[slobj], self.env) except TypeError as err: @@ -554,7 +562,7 @@ def __init__( else: w = _validate_where(w) where[idx] = w - _where = " & ".join((f"({w})" for w in com.flatten(where))) + _where = " & ".join(f"({w})" for w in com.flatten(where)) else: _where = where diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 83bf92ad737e4..d2708da04b7e9 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -53,7 +53,7 @@ def _raw_hex_id(obj) -> str: return "".join(_replacer(x) for x in packed) -_DEFAULT_GLOBALS = { +DEFAULT_GLOBALS = { "Timestamp": Timestamp, "datetime": datetime.datetime, "True": True, @@ -114,7 +114,7 @@ def __init__( # shallow copy because we don't want to keep filling this up with what # was there before if there are multiple calls to Scope/_ensure_scope - self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) + self.scope = DeepChainMap(DEFAULT_GLOBALS.copy()) self.target = target if isinstance(local_dict, Scope): @@ -129,23 +129,36 @@ def __init__( # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - self.scope = self.scope.new_child((global_dict or frame.f_globals).copy()) + + # pandas\core\computation\scope.py:132: error: Incompatible types + # in assignment (expression has type "ChainMap[str, Any]", variable + # has type "DeepChainMap[str, Any]") [assignment] + self.scope = self.scope.new_child( # type: ignore[assignment] + (global_dict or frame.f_globals).copy() + ) if not isinstance(local_dict, Scope): - self.scope = self.scope.new_child((local_dict or frame.f_locals).copy()) + # pandas\core\computation\scope.py:134: error: Incompatible + # types in assignment (expression has type "ChainMap[str, + # Any]", variable has type "DeepChainMap[str, Any]") + # [assignment] + self.scope = self.scope.new_child( # type: ignore[assignment] + (local_dict or frame.f_locals).copy() + ) finally: del frame # assumes that resolvers are 
going from outermost scope to inner
         if isinstance(local_dict, Scope):
-            resolvers += tuple(local_dict.resolvers.maps)
+            # pandas\core\computation\scope.py:140: error: Cannot determine
+            # type of 'resolvers' [has-type]
+            resolvers += tuple(local_dict.resolvers.maps)  # type: ignore[has-type]
         self.resolvers = DeepChainMap(*resolvers)
         self.temps = {}

     def __repr__(self) -> str:
         scope_keys = _get_pretty_string(list(self.scope.keys()))
         res_keys = _get_pretty_string(list(self.resolvers.keys()))
-        unicode_str = f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"
-        return unicode_str
+        return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"

     @property
     def has_resolvers(self) -> bool:
@@ -225,7 +238,9 @@ def swapkey(self, old_key: str, new_key: str, new_value=None):

         for mapping in maps:
             if old_key in mapping:
-                mapping[new_key] = new_value
+                # pandas\core\computation\scope.py:228: error: Unsupported
+                # target for indexed assignment ("Mapping[Any, Any]") [index]
+                mapping[new_key] = new_value  # type: ignore[index]
                 return

     def _get_vars(self, stack, scopes: List[str]):
@@ -244,7 +259,11 @@ def _get_vars(self, stack, scopes: List[str]):
         for scope, (frame, _, _, _, _, _) in variables:
             try:
                 d = getattr(frame, "f_" + scope)
-                self.scope = self.scope.new_child(d)
+                # pandas\core\computation\scope.py:247: error: Incompatible
+                # types in assignment (expression has type "ChainMap[str,
+                # Any]", variable has type "DeepChainMap[str, Any]")
+                # [assignment]
+                self.scope = self.scope.new_child(d)  # type: ignore[assignment]
             finally:
                 # won't remove it, but DECREF it
                 # in Py3 this probably isn't necessary since frame won't be
@@ -311,5 +330,16 @@ def full_scope(self):
         vars : DeepChainMap
             All variables in this scope.
         """
-        maps = [self.temps] + self.resolvers.maps + self.scope.maps
+        # pandas\core\computation\scope.py:314: error: Unsupported operand
+        # types for + ("List[Dict[Any, Any]]" and "List[Mapping[Any, Any]]")
+        # [operator]
+
+        # pandas\core\computation\scope.py:314: error: Unsupported operand
+        # types for + ("List[Dict[Any, Any]]" and "List[Mapping[str, Any]]")
+        # [operator]
+        maps = (
+            [self.temps]
+            + self.resolvers.maps  # type: ignore[operator]
+            + self.scope.maps  # type: ignore[operator]
+        )
         return DeepChainMap(*maps)
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 54d23fe8829e6..7d9664bd9f965 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -52,6 +52,20 @@ def use_numexpr_cb(key):
     expressions.set_use_numexpr(cf.get_option(key))


+use_numba_doc = """
+: bool
+    Use the numba engine option for select operations if it is installed,
+    the default is False.
+    Valid values: False, True
+"""
+
+
+def use_numba_cb(key):
+    from pandas.core.util import numba_
+
+    numba_.set_use_numba(cf.get_option(key))
+
+
 with cf.config_prefix("compute"):
     cf.register_option(
         "use_bottleneck",
@@ -63,13 +77,17 @@ def use_numexpr_cb(key):
     cf.register_option(
         "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
     )
+    cf.register_option(
+        "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb
+    )

 #
 # options from the "display" namespace

 pc_precision_doc = """
 : int
-    Floating point output precision (number of significant digits). This is
-    only a suggestion
+    Floating point output precision in terms of number of places after the
+    decimal, for regular formatting as well as scientific notation. Similar
+    to ``precision`` in :meth:`numpy.set_printoptions`.
""" pc_colspace_doc = """ @@ -232,7 +250,7 @@ def use_numexpr_cb(key): pc_max_seq_items = """ : int or None - when pretty-printing a long sequence, no more then `max_seq_items` + When pretty-printing a long sequence, no more then `max_seq_items` will be printed. If items are omitted, they will be denoted by the addition of "..." to the resulting string. @@ -297,9 +315,9 @@ def use_numexpr_cb(key): def table_schema_cb(key): - from pandas.io.formats.printing import _enable_data_resource_formatter + from pandas.io.formats.printing import enable_data_resource_formatter - _enable_data_resource_formatter(cf.get_option(key)) + enable_data_resource_formatter(cf.get_option(key)) def is_terminal() -> bool: @@ -310,7 +328,7 @@ def is_terminal() -> bool: """ try: # error: Name 'get_ipython' is not defined - ip = get_ipython() # type: ignore + ip = get_ipython() # type: ignore[name-defined] except NameError: # assume standard Python interpreter in a terminal return True else: @@ -563,6 +581,13 @@ def use_inf_as_na_cb(key): writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)), validator=str, ) +cf.deprecate_option( + "io.excel.xls.writer", + msg="As the xlwt package is no longer maintained, the xlwt engine will be " + "removed in a future version of pandas. This is the only engine in pandas that " + "supports writing in the xls format. Install openpyxl and write to an " + "xlsx file instead.", +) with cf.config_prefix("io.excel.xlsm"): cf.register_option( @@ -645,8 +670,10 @@ def register_plotting_backend_cb(key): def register_converter_cb(key): - from pandas.plotting import register_matplotlib_converters - from pandas.plotting import deregister_matplotlib_converters + from pandas.plotting import ( + deregister_matplotlib_converters, + register_matplotlib_converters, + ) if cf.get_option(key): register_matplotlib_converters() diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6c58698989e96..44224f9709699 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,6 +4,7 @@ These should not depend on core.internals. """ +from __future__ import annotations from collections import abc from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast @@ -35,6 +36,8 @@ is_iterator, is_list_like, is_object_dtype, + is_sparse, + is_string_dtype, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ( @@ -48,16 +51,14 @@ import pandas.core.common as com if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 - from pandas.core.indexes.api import Index # noqa: F401 - from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas import ExtensionArray, Index, Series def array( data: Union[Sequence[object], AnyArrayLike], dtype: Optional[Dtype] = None, copy: bool = True, -) -> "ExtensionArray": +) -> ExtensionArray: """ Create an array. @@ -101,6 +102,7 @@ def array( :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`float` :class:`pandas.arrays.FloatingArray` :class:`str` :class:`pandas.arrays.StringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== @@ -113,6 +115,11 @@ def array( string dtype for string data, and nullable-boolean dtype for boolean data. + .. 
versionchanged:: 1.2.0 + + Pandas now also infers nullable-floating dtype for float-like + input data + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -204,6 +211,11 @@ def array( [1, 2, ] Length: 3, dtype: Int64 + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: Float64 + >>> pd.array(["a", None, "c"]) ['a', , 'c'] @@ -230,10 +242,10 @@ def array( If pandas does not infer a dedicated extension type a :class:`arrays.PandasArray` is returned. - >>> pd.array([1.1, 2.2]) + >>> pd.array([1 + 1j, 3 + 2j]) - [1.1, 2.2] - Length: 2, dtype: float64 + [(1+1j), (3+2j)] + Length: 2, dtype: complex128 As mentioned in the "Notes" section, new extension types may be added in the future (by pandas or 3rd party libraries), causing the return @@ -255,14 +267,15 @@ def array( ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, BooleanArray, + DatetimeArray, + FloatingArray, IntegerArray, IntervalArray, PandasArray, - DatetimeArray, - TimedeltaArray, StringArray, + TimedeltaArray, + period_array, ) if lib.is_scalar(data): @@ -319,6 +332,9 @@ def array( elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) + elif inferred_dtype in ("floating", "mixed-integer-float"): + return FloatingArray._from_sequence(data, copy=copy) + elif inferred_dtype == "boolean": return BooleanArray._from_sequence(data, copy=copy) @@ -335,7 +351,7 @@ def array( return result -def extract_array(obj, extract_numpy: bool = False): +def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayLike]: """ Extract the ndarray or ExtensionArray from a Series or Index. @@ -386,9 +402,27 @@ def extract_array(obj, extract_numpy: bool = False): return obj +def ensure_wrapped_if_datetimelike(arr): + """ + Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. 
+ """ + if isinstance(arr, np.ndarray): + if arr.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(arr) + + elif arr.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(arr) + + return arr + + def sanitize_array( data, - index: Optional["Index"], + index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, @@ -436,7 +470,12 @@ def sanitize_array( subarr = subarr.copy() return subarr - elif isinstance(data, (list, tuple)) and len(data) > 0: + elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: + if isinstance(data, set): + # Raise only for unordered sets, e.g., not for dict_keys + raise TypeError("Set type is unordered") + data = list(data) + if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: @@ -448,8 +487,6 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) - elif isinstance(data, abc.Set): - raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): @@ -467,7 +504,7 @@ def sanitize_array( # figure out the dtype from the value (upcast if necessary) if dtype is None: - dtype, value = infer_dtype_from_scalar(value) + dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) @@ -483,13 +520,11 @@ def sanitize_array( # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: - subarr = construct_1d_arraylike_from_scalar( - subarr[0], len(index), subarr.dtype - ) + subarr = subarr.repeat(len(index)) elif subarr.ndim > 1: if isinstance(data, np.ndarray): - raise Exception("Data must be 1-dimensional") + raise ValueError("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) @@ -505,7 +540,8 @@ def sanitize_array( data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): + is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype) + if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) @@ -513,9 +549,7 @@ def sanitize_array( return subarr -def _try_cast( - arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool, -): +def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): """ Convert input to numpy ndarray and optionally cast to a given dtype. 
@@ -535,9 +569,10 @@ def _try_cast( if maybe_castable(arr) and not copy and dtype is None: return arr - if isinstance(dtype, ExtensionDtype) and dtype.kind != "M": + if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): # create an extension array from its dtype - # DatetimeTZ case needs to go through maybe_cast_to_datetime + # DatetimeTZ case needs to go through maybe_cast_to_datetime but + # SparseDtype does not array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) return subarr @@ -594,13 +629,13 @@ def is_empty_data(data: Any) -> bool: def create_series_with_explicit_dtype( data: Any = None, - index: Optional[Union[ArrayLike, "Index"]] = None, + index: Optional[Union[ArrayLike, Index]] = None, dtype: Optional[Dtype] = None, name: Optional[str] = None, copy: bool = False, fastpath: bool = False, dtype_if_empty: Dtype = object, -) -> "Series": +) -> Series: """ Helper to pass an explicit dtype when instantiating an empty Series. diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 07c73876954d0..c2be81cd46b3b 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,19 +12,18 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries if TYPE_CHECKING: - from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.arrays import ExtensionArray class ExtensionDtype: """ A custom data type, to be paired with an ExtensionArray. - .. versionadded:: 0.23.0 - See Also -------- - extensions.register_extension_dtype - extensions.ExtensionArray + extensions.register_extension_dtype: Register an ExtensionType + with pandas as class decorator. + extensions.ExtensionArray: Abstract base class for custom 1-D array types. Notes ----- @@ -100,9 +99,8 @@ def __eq__(self, other: Any) -> bool: By default, 'other' is considered equal if either * it's a string matching 'self.name'. - * it's an instance of this type and all of the - the attributes in ``self._metadata`` are equal between - `self` and `other`. + * it's an instance of this type and all of the attributes + in ``self._metadata`` are equal between `self` and `other`. Parameters ---------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6b84f0e81f48b..165e63e23d60e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2,8 +2,21 @@ Routines for casting. 
""" +from contextlib import suppress from datetime import date, datetime, timedelta -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Sequence, + Set, + Sized, + Tuple, + Type, + Union, +) import numpy as np @@ -14,17 +27,19 @@ Period, Timedelta, Timestamp, + conversion, iNaT, ints_to_pydatetime, + ints_to_pytimedelta, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike, Dtype, DtypeObj +from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - _POSSIBLY_CAST_DTYPES, DT64NS_DTYPE, INT64_DTYPE, + POSSIBLY_CAST_DTYPES, TD64NS_DTYPE, ensure_int8, ensure_int16, @@ -50,6 +65,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_sparse, is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -72,11 +88,17 @@ ABCSeries, ) from pandas.core.dtypes.inference import is_list_like -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import ( + is_valid_nat_for_dtype, + isna, + na_value_for_dtype, + notna, +) if TYPE_CHECKING: from pandas import Series - from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.arrays import ExtensionArray + from pandas.core.indexes.base import Index _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -112,7 +134,31 @@ def is_nested_object(obj) -> bool: return False -def maybe_downcast_to_dtype(result, dtype): +def maybe_box_datetimelike(value: Scalar, dtype: Optional[Dtype] = None) -> Scalar: + """ + Cast scalar to Timestamp or Timedelta if scalar is datetime-like + and dtype is not object. + + Parameters + ---------- + value : scalar + dtype : Dtype, optional + + Returns + ------- + scalar + """ + if dtype == object: + pass + elif isinstance(value, (np.datetime64, datetime)): + value = tslibs.Timestamp(value) + elif isinstance(value, (np.timedelta64, timedelta)): + value = tslibs.Timedelta(value) + + return value + + +def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): """ try to cast to the specified dtype (e.g. convert back to bool/int or could be an astype of float64->float32 @@ -148,12 +194,20 @@ def maybe_downcast_to_dtype(result, dtype): dtype = np.dtype(dtype) + elif dtype.type is Period: + from pandas.core.arrays import PeriodArray + + with suppress(TypeError): + # e.g. TypeError: int() argument must be a string, a + # bytes-like object or a number, not 'Period + return PeriodArray(result, freq=dtype.freq) + converted = maybe_downcast_numeric(result, dtype, do_round) if converted is not result: return converted # a datetimelike - # GH12821, iNaT is casted to float + # GH12821, iNaT is cast to float if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: if hasattr(dtype, "tz"): # not a numpy dtype @@ -166,21 +220,10 @@ def maybe_downcast_to_dtype(result, dtype): else: result = result.astype(dtype) - elif dtype.type is Period: - # TODO(DatetimeArray): merge with previous elif - from pandas.core.arrays import PeriodArray - - try: - return PeriodArray(result, freq=dtype.freq) - except TypeError: - # e.g. 
TypeError: int() argument must be a string, a - # bytes-like object or a number, not 'Period - pass - return result -def maybe_downcast_numeric(result, dtype, do_round: bool = False): +def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False): """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. @@ -198,10 +241,6 @@ def maybe_downcast_numeric(result, dtype, do_round: bool = False): # e.g. SparseDtype has no itemsize attr return result - if isinstance(result, list): - # reached via groupby.agg._ohlc; really this should be handled earlier - result = np.array(result) - def trans(x): if do_round: return x.round() @@ -253,7 +292,9 @@ def trans(x): return result -def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""): +def maybe_cast_result( + result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = "" +) -> ArrayLike: """ Try casting result to a different type if appropriate @@ -273,25 +314,23 @@ def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: st result : array-like result maybe casted to the dtype. """ - if obj.ndim > 1: - dtype = obj._values.dtype - else: - dtype = obj.dtype + dtype = obj.dtype dtype = maybe_cast_result_dtype(dtype, how) - if not is_scalar(result): - if ( - is_extension_array_dtype(dtype) - and not is_categorical_dtype(dtype) - and dtype.kind != "M" - ): - # We have to special case categorical so as not to upcast - # things like counts back to categorical - cls = dtype.construct_array_type() - result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + assert not is_scalar(result) - elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: - result = maybe_downcast_to_dtype(result, dtype) + if ( + is_extension_array_dtype(dtype) + and not is_categorical_dtype(dtype) + and dtype.kind != "M" + ): + # We have to special case categorical so as not to upcast + # things like counts back to categorical + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) return result @@ -314,16 +353,24 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: The desired dtype of the result. """ from pandas.core.arrays.boolean import BooleanDtype - from pandas.core.arrays.integer import Int64Dtype - - if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)): - return np.dtype(np.int64) - elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): - return Int64Dtype() + from pandas.core.arrays.floating import Float64Dtype + from pandas.core.arrays.integer import Int64Dtype, _IntegerDtype + + if how in ["add", "cumsum", "sum", "prod"]: + if dtype == np.dtype(bool): + return np.dtype(np.int64) + elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): + return Int64Dtype() + elif how in ["mean", "median", "var"] and isinstance( + dtype, (BooleanDtype, _IntegerDtype) + ): + return Float64Dtype() return dtype -def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): +def maybe_cast_to_extension_array( + cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None +) -> ArrayLike: """ Call to `_from_sequence` that returns the object unchanged on Exception. 
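The reworked `maybe_cast_result_dtype` above encodes a small mapping from (input dtype, aggregation name) to result dtype. A sketch of that mapping, assuming the nullable `Float64Dtype` added elsewhere in this release; the calls and comments are illustrative:

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import maybe_cast_result_dtype

    maybe_cast_result_dtype(np.dtype(bool), "sum")      # dtype('int64')
    maybe_cast_result_dtype(pd.BooleanDtype(), "prod")  # Int64Dtype()
    maybe_cast_result_dtype(pd.Int64Dtype(), "mean")    # Float64Dtype()
    maybe_cast_result_dtype(np.dtype("f8"), "sum")      # unchanged: float64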
@@ -339,13 +386,17 @@ def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray + from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg - # Everything can be be converted to StringArrays, but we may not want to convert - if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": + # Everything can be converted to StringArrays, but we may not want to convert + if ( + issubclass(cls, (StringArray, ArrowStringArray)) + and lib.infer_dtype(obj) != "string" + ): return obj try: @@ -356,7 +407,9 @@ def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): return result -def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): +def maybe_upcast_putmask( + result: np.ndarray, mask: np.ndarray, other: Scalar +) -> Tuple[np.ndarray, bool]: """ A safe version of putmask that potentially upcasts the result. @@ -400,12 +453,9 @@ def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): # NaN -> NaT # integer or integer array -> date-like array if result.dtype.kind in ["m", "M"]: - if is_scalar(other): - if isna(other): - other = result.dtype.type("nat") - elif is_integer(other): - other = np.array(other, dtype=result.dtype) - elif is_integer_dtype(other): + if isna(other): + other = result.dtype.type("nat") + elif is_integer(other): other = np.array(other, dtype=result.dtype) def changeit(): @@ -438,6 +488,53 @@ def changeit(): return result, False +def maybe_casted_values( + index: "Index", codes: Optional[np.ndarray] = None +) -> ArrayLike: + """ + Convert an index, given directly or as a pair (level, code), to a 1D array. + + Parameters + ---------- + index : Index + codes : np.ndarray[intp] or None, default None + + Returns + ------- + ExtensionArray or ndarray + If codes is `None`, the values of `index`. + If codes is passed, an array obtained by taking from `index` the indices + contained in `codes`. + """ + + values = index._values + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + + # if we have the codes, extract the values with a mask + if codes is not None: + mask: np.ndarray = codes == -1 + + if mask.size > 0 and mask.all(): + # we can have situations where the whole mask is -1, + # meaning there is nothing found in codes, so make all nan's + + dtype = index.dtype + fill_value = na_value_for_dtype(dtype) + values = construct_1d_arraylike_from_scalar(fill_value, len(mask), dtype) + + else: + values = values.take(codes) + + if mask.any(): + if isinstance(values, np.ndarray): + values, _ = maybe_upcast_putmask(values, mask, np.nan) + else: + values[mask] = np.nan + + return values + + def maybe_promote(dtype, fill_value=np.nan): """ Find the minimal dtype that can hold both the given dtype and fill_value. 
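A short sketch of the new `maybe_casted_values` helper documented above; the index values, codes, and printed result are hypothetical:

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import maybe_casted_values

    idx = pd.Index([10, 20, 30])
    maybe_casted_values(idx)  # no codes: just the underlying values

    # codes take positionally; -1 marks a missing entry, which forces
    # an upcast to float via maybe_upcast_putmask
    maybe_casted_values(idx, codes=np.array([2, 0, -1]))
    # roughly array([30., 10., nan])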
@@ -486,7 +583,7 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) else: try: - fill_value = tslibs.Timestamp(fill_value).to_datetime64() + fill_value = Timestamp(fill_value).to_datetime64() except (TypeError, ValueError): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.timedelta64): @@ -499,7 +596,7 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) else: try: - fv = tslibs.Timedelta(fill_value) + fv = Timedelta(fill_value) except ValueError: dtype = np.dtype(np.object_) else: @@ -594,7 +691,7 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _ensure_dtype_type(value, dtype): +def _ensure_dtype_type(value, dtype: DtypeObj): """ Ensure that the given value is an instance of the given dtype. @@ -650,7 +747,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, If False, scalar belongs to pandas extension types is inferred as object """ - dtype = np.dtype(object) + dtype: DtypeObj = np.dtype(object) # a 1-element ndarray if isinstance(val, np.ndarray): @@ -672,8 +769,8 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, dtype = np.dtype(object) elif isinstance(val, (np.datetime64, datetime)): - val = tslibs.Timestamp(val) - if val is tslibs.NaT or val.tz is None: + val = Timestamp(val) + if val is NaT or val.tz is None: dtype = np.dtype("M8[ns]") else: if pandas_dtype: @@ -684,7 +781,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, val = val.value elif isinstance(val, (np.timedelta64, timedelta)): - val = tslibs.Timedelta(val).value + val = Timedelta(val).value dtype = np.dtype("m8[ns]") elif is_bool(val): @@ -696,6 +793,11 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, else: dtype = np.dtype(np.int64) + try: + np.array(val, dtype=dtype) + except OverflowError: + dtype = np.array(val).dtype + elif is_float(val): if isinstance(val, np.floating): dtype = np.dtype(type(val)) @@ -708,7 +810,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, elif pandas_dtype: if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) - val = val.ordinal elif lib.is_interval(val): subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] dtype = IntervalDtype(subtype=subtype) @@ -716,8 +817,25 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, return dtype, val -# TODO: try to make the Any in the return annotation more specific -def infer_dtype_from_array(arr, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]: +def dict_compat(d: Dict[Scalar, Scalar]) -> Dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. + + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + +def infer_dtype_from_array( + arr, pandas_dtype: bool = False +) -> Tuple[DtypeObj, ArrayLike]: """ Infer the dtype from an array. @@ -805,7 +923,12 @@ def maybe_infer_dtype_type(element): return tipo -def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): +def maybe_upcast( + values: ArrayLike, + fill_value: Scalar = np.nan, + dtype: Dtype = None, + copy: bool = False, +) -> Tuple[ArrayLike, Scalar]: """ Provide explicit type promotion and coercion. 
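The new `dict_compat` helper above is simply `maybe_box_datetimelike` mapped over a dict's keys; a minimal usage sketch with made-up keys and values:

    import datetime
    import numpy as np
    from pandas.core.dtypes.cast import dict_compat

    d = {datetime.datetime(2020, 1, 1): "a", np.datetime64("2020-01-02"): "b"}
    dict_compat(d)  # both keys come back as pandas Timestamp objects
    # non-datetimelike keys would pass through unchanged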
@@ -817,6 +940,13 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): dtype : if None, then use the dtype of the values, else coerce to this type copy : bool, default True If True always make a copy even if no upcast is required. + + Returns + ------- + values: ndarray or ExtensionArray + the original array, possibly upcast + fill_value: + the fill value, possibly upcast """ if not is_scalar(fill_value) and not is_object_dtype(values.dtype): # We allow arbitrary fill values for object dtype @@ -837,7 +967,7 @@ def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): return values, fill_value -def invalidate_string_dtypes(dtype_set): +def invalidate_string_dtypes(dtype_set: Set[DtypeObj]): """ Change string like dtypes to object for ``DataFrame.select_dtypes()``. @@ -859,37 +989,9 @@ def coerce_indexer_dtype(indexer, categories): return ensure_int64(indexer) -def coerce_to_dtypes(result, dtypes): - """ - given a dtypes and a result set, coerce the result elements to the - dtypes - """ - if len(result) != len(dtypes): - raise AssertionError("_coerce_to_dtypes requires equal len arrays") - - def conv(r, dtype): - if np.any(isna(r)): - pass - elif dtype == DT64NS_DTYPE: - r = tslibs.Timestamp(r) - elif dtype == TD64NS_DTYPE: - r = tslibs.Timedelta(r) - elif dtype == np.bool_: - # messy. non 0/1 integers do not get converted. - if is_integer(r) and r not in [0, 1]: - return int(r) - r = bool(r) - elif dtype.kind == "f": - r = float(r) - elif dtype.kind == "i": - r = int(r) - - return r - - return [conv(r, dtype) for r, dtype in zip(result, dtypes)] - - -def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): +def astype_nansafe( + arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False +) -> ArrayLike: """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -916,7 +1018,9 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): dtype = pandas_dtype(dtype) if issubclass(dtype.type, str): - return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape) + return lib.ensure_string_array( + arr.ravel(), skipna=skipna, convert_na_value=False + ).reshape(arr.shape) elif is_datetime64_dtype(arr): if is_object_dtype(dtype): @@ -934,7 +1038,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): - return tslibs.ints_to_pytimedelta(arr.view(np.int64)) + return ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -991,99 +1095,37 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): return arr.view(dtype) -def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True): +def soft_convert_objects( + values: np.ndarray, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + copy: bool = True, +): """ - If we have an object dtype array, try to coerce dates and/or numbers. + Try to coerce datetime, timedelta, and numeric object-dtype columns + to inferred dtype. 
Parameters ---------- - values : ndarray - convert_numeric : bool, default True + values : np.ndarray[object] + datetime : bool, default True + numeric : bool, default True + timedelta : bool, default True + copy : bool, default True Returns ------- - ndarray or DatetimeIndex + np.ndarray """ - validate_bool_kwarg(convert_numeric, "convert_numeric") - - orig_values = values - - # convert dates - if is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, convert_datetime=True) - - # convert timedeltas - if is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, convert_timedelta=True) - - # convert to numeric - if is_object_dtype(values.dtype): - if convert_numeric: - try: - new_values = lib.maybe_convert_numeric( - values, set(), coerce_numeric=True - ) - except (ValueError, TypeError): - pass - else: - # if we are all nans then leave me alone - if not isna(new_values).all(): - values = new_values - - else: - # soft-conversion - values = lib.maybe_convert_objects(values) - - if values is orig_values: - values = values.copy() - - return values - - -def soft_convert_objects( - values: np.ndarray, - datetime: bool = True, - numeric: bool = True, - timedelta: bool = True, - coerce: bool = False, - copy: bool = True, -): - """ if we have an object dtype, try to coerce dates and/or numbers """ validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") - validate_bool_kwarg(coerce, "coerce") validate_bool_kwarg(copy, "copy") conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: raise ValueError("At least one of datetime, numeric or timedelta must be True.") - elif conversion_count > 1 and coerce: - raise ValueError( - "Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True." - ) - - if not is_object_dtype(values.dtype): - # If not object, do not attempt conversion - values = values.copy() if copy else values - return values - - # If 1 flag is coerce, ensure 2 others are False - if coerce: - # Immediate return if coerce - if datetime: - from pandas import to_datetime - - return to_datetime(values, errors="coerce").to_numpy() - elif timedelta: - from pandas import to_timedelta - - return to_timedelta(values, errors="coerce").to_numpy() - elif numeric: - from pandas import to_numeric - - return to_numeric(values, errors="coerce") # Soft conversions if datetime: @@ -1112,10 +1154,11 @@ def soft_convert_objects( def convert_dtypes( - input_array, + input_array: AnyArrayLike, convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, + convert_floating: bool = True, ) -> Dtype: """ Convert objects to best possible type, and optionally, @@ -1123,13 +1166,17 @@ def convert_dtypes( Parameters ---------- - input_array : ExtensionArray or PandasArray + input_array : ExtensionArray, Index, Series or np.ndarray convert_string : bool, default True Whether object dtypes should be converted to ``StringDtype()``. convert_integer : bool, default True Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. + convert_floating : bool, defaults True + Whether, if possible, conversion can be done to floating extension types. + If `convert_integer` is also True, preference will be given to integer + dtypes if the floats can be faithfully cast to integers. 
Returns ------- @@ -1137,7 +1184,9 @@ def convert_dtypes( new dtype """ is_extension = is_extension_array_dtype(input_array.dtype) - if (convert_string or convert_integer or convert_boolean) and not is_extension: + if ( + convert_string or convert_integer or convert_boolean or convert_floating + ) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1151,9 +1200,11 @@ def convert_dtypes( target_int_dtype = "Int64" if is_integer_dtype(input_array.dtype): - from pandas.core.arrays.integer import _dtypes + from pandas.core.arrays.integer import INT_STR_TO_DTYPE - inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) + inferred_dtype = INT_STR_TO_DTYPE.get( + input_array.dtype.name, target_int_dtype + ) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( input_array.dtype ): @@ -1163,6 +1214,29 @@ def convert_dtypes( if is_integer_dtype(inferred_dtype): inferred_dtype = input_array.dtype + if convert_floating: + if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( + input_array.dtype + ): + from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE + + inferred_float_dtype = FLOAT_STR_TO_DTYPE.get( + input_array.dtype.name, "Float64" + ) + # if we could also convert to integer, check if all floats + # are actually integers + if convert_integer: + arr = input_array[notna(input_array)] + if (arr.astype(int) == arr).all(): + inferred_dtype = "Int64" + else: + inferred_dtype = inferred_float_dtype + else: + inferred_dtype = inferred_float_dtype + else: + if is_float_dtype(inferred_dtype): + inferred_dtype = input_array.dtype + if convert_boolean: if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" @@ -1176,9 +1250,11 @@ def convert_dtypes( return inferred_dtype -def maybe_castable(arr) -> bool: +def maybe_castable(arr: np.ndarray) -> bool: # return False to force a non-fastpath + assert isinstance(arr, np.ndarray) # GH 37024 + # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce kind = arr.dtype.kind @@ -1187,10 +1263,12 @@ def maybe_castable(arr) -> bool: elif kind == "m": return is_timedelta64_ns_dtype(arr.dtype) - return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + return arr.dtype.name not in POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike(value, convert_dates: bool = False): +def maybe_infer_to_datetimelike( + value: Union[ArrayLike, Scalar], convert_dates: bool = False +): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1212,9 +1290,6 @@ def maybe_infer_to_datetimelike(value, convert_dates: bool = False): value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray) ): return value - elif isinstance(value, ABCSeries): - if isinstance(value._values, ABCDatetimeIndex): - return value._values v = value @@ -1227,7 +1302,7 @@ def maybe_infer_to_datetimelike(value, convert_dates: bool = False): return value shape = v.shape - if not v.ndim == 1: + if v.ndim != 1: v = v.ravel() if not len(v): @@ -1243,7 +1318,6 @@ def try_datetime(v): # we might have a sequence of the same-datetimes with tz's # if so coerce to a DatetimeIndex; if they are not the same, # then these stay as object dtype, xref GH19671 - from pandas._libs.tslibs import conversion from pandas import DatetimeIndex try: @@ -1298,18 +1372,15 @@ def try_timedelta(v): return value -def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): +def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): """ try to 
cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime + from pandas.core.tools.timedeltas import to_timedelta if dtype is not None: - if isinstance(dtype, str): - dtype = np.dtype(dtype) - is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) @@ -1322,16 +1393,21 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): f"Please pass in '{dtype.name}[ns]' instead." ) - if is_datetime64 and not is_dtype_equal(dtype, DT64NS_DTYPE): - - # pandas supports dtype whose granularity is less than [ns] - # e.g., [ps], [fs], [as] - if dtype <= np.dtype("M8[ns]"): - if dtype.name == "datetime64": - raise ValueError(msg) - dtype = DT64NS_DTYPE - else: - raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]") + if is_datetime64: + # unpack e.g. SparseDtype + dtype = getattr(dtype, "subtype", dtype) + if not is_dtype_equal(dtype, DT64NS_DTYPE): + + # pandas supports dtype whose granularity is less than [ns] + # e.g., [ps], [fs], [as] + if dtype <= np.dtype("M8[ns]"): + if dtype.name == "datetime64": + raise ValueError(msg) + dtype = DT64NS_DTYPE + else: + raise TypeError( + f"cannot convert datetimelike to dtype [{dtype}]" + ) elif is_datetime64tz: # our NaT doesn't support tz's @@ -1354,7 +1430,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if is_scalar(value): if value == iNaT or isna(value): value = iNaT - else: + elif not is_sparse(value): value = np.array(value, copy=False) # have a scalar array-like (e.g. NaT) @@ -1365,7 +1441,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype): try: if is_datetime64: - value = to_datetime(value, errors=errors) + value = to_datetime(value, errors="raise") # GH 25843: Remove tz information since the dtype # didn't specify one if value.tz is not None: @@ -1377,7 +1453,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): # datetime64tz is assumed to be naive which should # be localized to the timezone. 
is_dt_string = is_string_dtype(value.dtype) - value = to_datetime(value, errors=errors).array + value = to_datetime(value, errors="raise").array if is_dt_string: # Strings here are naive, so directly localize value = value.tz_localize(dtype.tz) @@ -1386,7 +1462,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): # so localize and convert value = value.tz_localize("UTC").tz_convert(dtype.tz) elif is_timedelta64: - value = to_timedelta(value, errors=errors)._values + value = to_timedelta(value, errors="raise")._values except OutOfBoundsDatetime: raise except (AttributeError, ValueError, TypeError): @@ -1415,10 +1491,10 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): dtype = value.dtype if dtype.kind == "M" and dtype != DT64NS_DTYPE: - value = tslibs.conversion.ensure_datetime64ns(value) + value = conversion.ensure_datetime64ns(value) elif dtype.kind == "m" and dtype != TD64NS_DTYPE: - value = to_timedelta(value) + value = conversion.ensure_timedelta64ns(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this @@ -1484,40 +1560,13 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: if has_bools: for t in types: if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): - return object + return np.dtype("object") return np.find_common_type(types, []) -def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.ndarray: - """ - Create np.ndarray of specified shape and dtype, filled with values. - - Parameters - ---------- - shape : tuple - value : scalar value - dtype : np.dtype, optional - dtype to coerce - - Returns - ------- - ndarray of shape, filled with value, of specified / inferred dtype - - """ - if dtype is None: - dtype, fill_value = infer_dtype_from_scalar(value) - else: - fill_value = value - - values = np.empty(shape, dtype=dtype) - values.fill(fill_value) - - return values - - def construct_1d_arraylike_from_scalar( - value, length: int, dtype: DtypeObj + value: Scalar, length: int, dtype: DtypeObj ) -> ArrayLike: """ create a np.ndarray / pandas type of specified shape and dtype @@ -1546,9 +1595,14 @@ def construct_1d_arraylike_from_scalar( elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): # we need to coerce to object dtype to avoid # to allow numpy to take our string as a scalar value - dtype = object + dtype = np.dtype("object") if not isna(value): value = ensure_str(value) + elif dtype.kind in ["M", "m"] and is_valid_nat_for_dtype(value, dtype): + # GH36541: can't fill array directly with pd.NaT + # > np.empty(10, dtype="datetime64[64]").fill(pd.NaT) + # ValueError: cannot convert float NaN to integer + value = dtype.type("NaT", "ns") subarr = np.empty(length, dtype=dtype) subarr.fill(value) @@ -1556,7 +1610,7 @@ def construct_1d_arraylike_from_scalar( return subarr -def construct_1d_object_array_from_listlike(values) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1582,7 +1636,7 @@ def construct_1d_object_array_from_listlike(values) -> np.ndarray: def construct_1d_ndarray_preserving_na( - values, dtype: Optional[DtypeObj] = None, copy: bool = False + values: Sequence, dtype: Optional[DtypeObj] = None, copy: bool = False ) -> np.ndarray: """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. 
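The GH36541 branch shown above works around `ndarray.fill` rejecting `pd.NaT`; a quick sketch of what it enables (illustrative call, not from the patch):

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import construct_1d_arraylike_from_scalar

    # pd.NaT is swapped for a dtype-specific NaT before the fill, so this
    # no longer raises "cannot convert float NaN to integer"
    construct_1d_arraylike_from_scalar(pd.NaT, 3, np.dtype("M8[ns]"))
    # array(['NaT', 'NaT', 'NaT'], dtype='datetime64[ns]')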
@@ -1607,24 +1661,16 @@ def construct_1d_ndarray_preserving_na( >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str')) array(['1.0', '2.0', None], dtype=object) """ - subarr = np.array(values, dtype=dtype, copy=copy) if dtype is not None and dtype.kind == "U": - # GH-21083 - # We can't just return np.array(subarr, dtype='str') since - # NumPy will convert the non-string objects into strings - # Including NA values. Se we have to go - # string -> object -> update NA, which requires an - # additional pass over the data. - na_values = isna(values) - subarr2 = subarr.astype(object) - subarr2[na_values] = np.asarray(values, dtype=object)[na_values] - subarr = subarr2 + subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + else: + subarr = np.array(values, dtype=dtype, copy=copy) return subarr -def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): +def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -1666,6 +1712,8 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): ... ValueError: Trying to coerce float values to integers """ + assert is_integer_dtype(dtype) + try: if not hasattr(arr, "astype"): casted = np.array(arr, dtype=dtype, copy=copy) @@ -1690,11 +1738,11 @@ def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): + if is_float_dtype(arr) or is_object_dtype(arr): raise ValueError("Trying to coerce float values to integers") -def convert_scalar_for_putitemlike(scalar, dtype: np.dtype): +def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: """ Convert datetimelike scalar if we are setting into a datetime64 or timedelta64 ndarray. @@ -1725,7 +1773,7 @@ def convert_scalar_for_putitemlike(scalar, dtype: np.dtype): return scalar -def validate_numeric_casting(dtype: np.dtype, value): +def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None: """ Check that we can losslessly insert the given value into an array with the given dtype. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a2ca4d84b2bf6..b4f6d587c6642 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,7 +9,7 @@ from pandas._libs import Interval, Period, algos from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, DtypeObj +from pandas._typing import ArrayLike, DtypeObj, Optional from pandas.core.dtypes.base import registry from pandas.core.dtypes.dtypes import ( @@ -43,7 +43,7 @@ is_sequence, ) -_POSSIBLY_CAST_DTYPES = { +POSSIBLY_CAST_DTYPES = { np.dtype(t).name for t in [ "O", @@ -83,7 +83,12 @@ def ensure_float(arr): float_arr : The original array cast to the float dtype if possible. Otherwise, the original array is returned. 
""" - if issubclass(arr.dtype.type, (np.integer, np.bool_)): + if is_extension_array_dtype(arr.dtype): + if is_float_dtype(arr.dtype): + arr = arr.to_numpy(dtype=arr.dtype.numpy_dtype, na_value=np.nan) + else: + arr = arr.to_numpy(dtype="float64", na_value=np.nan) + elif issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr @@ -108,7 +113,7 @@ def ensure_str(value: Union[bytes, Any]) -> str: return value -def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: +def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray: """ Ensure that an dtype array of some integer dtype has an int64 dtype if possible. @@ -136,11 +141,13 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: """ # TODO: GH27506 potential bug with ExtensionArrays try: - return arr.astype("int64", copy=copy, casting="safe") # type: ignore + # error: Unexpected keyword argument "casting" for "astype" + return arr.astype("int64", copy=copy, casting="safe") # type: ignore[call-arg] except TypeError: pass try: - return arr.astype("uint64", copy=copy, casting="safe") # type: ignore + # error: Unexpected keyword argument "casting" for "astype" + return arr.astype("uint64", copy=copy, casting="safe") # type: ignore[call-arg] except TypeError: if is_extension_array_dtype(arr.dtype): return arr.to_numpy(dtype="float64", na_value=np.nan) @@ -633,8 +640,8 @@ def is_dtype_equal(source, target) -> bool: False """ try: - source = _get_dtype(source) - target = _get_dtype(target) + source = get_dtype(source) + target = get_dtype(target) return source == target except (TypeError, AttributeError): @@ -982,10 +989,10 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - tipo = _get_dtype(arr_or_dtype) + tipo = get_dtype(arr_or_dtype) except TypeError: if is_datetime64tz_dtype(arr_or_dtype): - tipo = _get_dtype(arr_or_dtype.dtype) + tipo = get_dtype(arr_or_dtype.dtype) else: return False return tipo == DT64NS_DTYPE or getattr(tipo, "base", None) == DT64NS_DTYPE @@ -1213,6 +1220,10 @@ def needs_i8_conversion(arr_or_dtype) -> bool: """ if arr_or_dtype is None: return False + if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): + # fastpath + dtype = arr_or_dtype + return dtype.kind in ["m", "M"] or dtype.type is Period return ( is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) @@ -1370,7 +1381,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - dtype = _get_dtype(arr_or_dtype) + dtype = get_dtype(arr_or_dtype) except TypeError: return False @@ -1386,8 +1397,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: # guess this return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): - dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) - return dtype._is_boolean + return getattr(arr_or_dtype, "dtype", arr_or_dtype)._is_boolean return issubclass(dtype.type, np.bool_) @@ -1556,13 +1566,13 @@ def _is_dtype(arr_or_dtype, condition) -> bool: if arr_or_dtype is None: return False try: - dtype = _get_dtype(arr_or_dtype) + dtype = get_dtype(arr_or_dtype) except (TypeError, ValueError, UnicodeEncodeError): return False return condition(dtype) -def _get_dtype(arr_or_dtype) -> DtypeObj: +def get_dtype(arr_or_dtype) -> DtypeObj: """ Get the dtype instance associated with an array or dtype object. 
@@ -1693,7 +1703,7 @@ def infer_dtype_from_object(dtype): try: return infer_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): - # Handles cases like _get_dtype(int) i.e., + # Handles cases like get_dtype(int) i.e., # Python objects that are valid dtypes # (unlike user-defined types, in general) # @@ -1717,7 +1727,7 @@ def _validate_date_like_dtype(dtype) -> None: ------ TypeError : The dtype could not be casted to a date-like dtype. ValueError : The dtype is an illegal date-like dtype (e.g. the - the frequency provided is too specific) + frequency provided is too specific) """ try: typ = np.datetime_data(dtype)[0] @@ -1730,6 +1740,32 @@ def _validate_date_like_dtype(dtype) -> None: ) +def validate_all_hashable(*args, error_name: Optional[str] = None) -> None: + """ + Return None if all args are hashable, else raise a TypeError. + + Parameters + ---------- + *args + Arguments to validate. + error_name : str, optional + The name to use if error + + Raises + ------ + TypeError : If an argument is not hashable + + Returns + ------- + None + """ + if not all(is_hashable(arg) for arg in args): + if error_name: + raise TypeError(f"{error_name} must be a hashable type") + else: + raise TypeError("All elements must be hashable") + + def pandas_dtype(dtype) -> DtypeObj: """ Convert input into a pandas only dtype object or a numpy dtype object. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 4b7c818f487ac..a9355e30cd3c2 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,7 +1,7 @@ """ Utility functions related to concat. """ -from typing import cast +from typing import Set, cast import numpy as np @@ -9,59 +9,50 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_object_dtype, is_sparse, - is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray -from pandas.core.construction import array +from pandas.core.construction import array, ensure_wrapped_if_datetimelike -def get_dtype_kinds(l): +def _get_dtype_kinds(arrays) -> Set[str]: """ Parameters ---------- - l : list of arrays + arrays : list of arrays Returns ------- - a set of kinds that exist in this list of arrays + set[str] + A set of kinds that exist in this list of arrays. """ - typs = set() - for arr in l: + typs: Set[str] = set() + for arr in arrays: + # Note: we use dtype.kind checks because they are much more performant + # than is_foo_dtype dtype = arr.dtype - if is_categorical_dtype(dtype): - typ = "category" - elif is_sparse(dtype): - typ = "sparse" + if not isinstance(dtype, np.dtype): + # ExtensionDtype so we get + # e.g. 
"categorical", "datetime64[ns, US/Central]", "Sparse[itn64, 0]" + typ = str(dtype) elif isinstance(arr, ABCRangeIndex): typ = "range" - elif is_datetime64tz_dtype(dtype): - # if to_concat contains different tz, - # the result must be object dtype - typ = str(dtype) - elif is_datetime64_dtype(dtype): + elif dtype.kind == "M": typ = "datetime" - elif is_timedelta64_dtype(dtype): + elif dtype.kind == "m": typ = "timedelta" - elif is_object_dtype(dtype): - typ = "object" - elif is_bool_dtype(dtype): - typ = "bool" - elif is_extension_array_dtype(dtype): - typ = str(dtype) + elif dtype.kind in ["O", "b"]: + typ = str(dtype) # i.e. "object", "bool" else: typ = dtype.kind + typs.add(typ) return typs @@ -140,7 +131,7 @@ def is_nonempty(x) -> bool: if non_empties and axis == 0: to_concat = non_empties - typs = get_dtype_kinds(to_concat) + typs = _get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) all_empty = not len(non_empties) @@ -148,6 +139,8 @@ def is_nonempty(x) -> bool: any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea: + # we ignore axis here, as internally concatting with EAs is always + # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] @@ -159,13 +152,13 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: - return concat_datetime(to_concat, axis=axis, typs=typs) + return _concat_datetime(to_concat, axis=axis) elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) - typs = get_dtype_kinds(to_concat) + typs = _get_dtype_kinds(to_concat) if len(typs) != 1: if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): @@ -303,19 +296,13 @@ def _maybe_unwrap(x): raise TypeError("dtype of categories must be the same") ordered = False - if all(first.is_dtype_equal(other) for other in to_union[1:]): + if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered - if all(first.categories.equals(other.categories) for other in to_union[1:]): - new_codes = np.concatenate([c.codes for c in to_union]) - else: - codes = [first.codes] + [ - recode_for_categories(other.codes, other.categories, first.categories) - for other in to_union[1:] - ] - new_codes = np.concatenate(codes) + all_codes = [first._encode_with_my_categories(x)._codes for x in to_union] + new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with ordered Categoricals") @@ -359,7 +346,7 @@ def _concatenate_2d(to_concat, axis: int): return np.concatenate(to_concat, axis=axis) -def concat_datetime(to_concat, axis=0, typs=None): +def _concat_datetime(to_concat, axis=0): """ provide concatenation of an datetimelike array of arrays each of which is a single M8[ns], datetime64[ns, tz] or m8[ns] dtype @@ -368,21 +355,19 @@ def concat_datetime(to_concat, axis=0, typs=None): ---------- to_concat : array of arrays axis : axis to provide concatenation - typs : set of to_concat dtypes Returns ------- a single array, preserving the combined dtypes """ - if typs is None: - typs = get_dtype_kinds(to_concat) + to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] - to_concat 
= [_wrap_datetimelike(x) for x in to_concat] single_dtype = len({x.dtype for x in to_concat}) == 1 # multiple types, need to coerce to object if not single_dtype: - # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta + # ensure_wrapped_if_datetimelike ensures that astype(object) wraps + # in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: @@ -396,17 +381,3 @@ def concat_datetime(to_concat, axis=0, typs=None): assert result.shape[0] == 1 result = result[0] return result - - -def _wrap_datetimelike(arr): - """ - Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. - - DTA/TDA handle .astype(object) correctly. - """ - from pandas.core.construction import array as pd_array, extract_array - - arr = extract_array(arr, extract_numpy=True) - if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: - arr = pd_array(arr) - return arr diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 22480fbc47508..3c5421ae433b6 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -29,13 +29,10 @@ from pandas.core.dtypes.inference import is_bool, is_list_like if TYPE_CHECKING: - import pyarrow # noqa: F401 - from pandas.core.arrays import ( # noqa: F401 - IntervalArray, - PeriodArray, - DatetimeArray, - ) - from pandas import Categorical # noqa: F401 + import pyarrow + + from pandas import Categorical + from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray str_type = str @@ -50,13 +47,13 @@ class PandasExtensionDtype(ExtensionDtype): type: Any kind: Any # The Any type annotations above are here only because mypy seems to have a - # problem dealing with with multiple inheritance from PandasExtensionDtype + # problem dealing with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None str: str_type num = 100 - shape: Tuple[int, ...] = tuple() + shape: Tuple[int, ...] = () itemsize = 8 base = None isbuiltin = 0 @@ -374,29 +371,50 @@ def __eq__(self, other: Any) -> bool: # but same order is not necessary. There is no distinction between # ordered=False and ordered=None: CDT(., False) and CDT(., None) # will be equal if they have the same categories. - if ( - self.categories.dtype == other.categories.dtype - and self.categories.equals(other.categories) - ): + left = self.categories + right = other.categories + + # GH#36280 the ordering of checks here is for performance + if not left.dtype == right.dtype: + return False + + if len(left) != len(right): + return False + + if self.categories.equals(other.categories): # Check and see if they happen to be identical categories return True + + if left.dtype != object: + # Faster than calculating hash + indexer = left.get_indexer(right) + # Because left and right have the same length and are unique, + # `indexer` not having any -1s implies that there is a + # bijection between `left` and `right`. + return (indexer != -1).all() + + # With object-dtype we need a comparison that identifies + # e.g. 
int(2) as distinct from float(2) return hash(self) == hash(other) def __repr__(self) -> str_type: if self.categories is None: - data = "None, " + data = "None" else: data = self.categories._format_data(name=type(self).__name__) - return f"CategoricalDtype(categories={data}ordered={self.ordered})" + if data is None: + # self.categories is RangeIndex + data = str(self.categories._range) + data = data.rstrip(", ") + return f"CategoricalDtype(categories={data}, ordered={self.ordered})" @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: from pandas.core.util.hashing import ( + combine_hash_arrays, hash_array, - _combine_hash_arrays, hash_tuples, ) - from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE if len(categories) and isinstance(categories[0], tuple): # assumes if any individual category is a tuple, then all our. ATM @@ -414,9 +432,9 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: hashed = hash((tuple(categories), ordered)) return hashed - if is_datetime64tz_dtype(categories.dtype): + if DatetimeTZDtype.is_dtype(categories.dtype): # Avoid future warning. - categories = categories.astype(DT64NS_DTYPE) + categories = categories.astype("datetime64[ns]") cat_array = hash_array(np.asarray(categories), categorize=False) if ordered: @@ -425,7 +443,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: ) else: cat_array = [cat_array] - hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) + hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(hashed) @classmethod @@ -437,7 +455,7 @@ def construct_array_type(cls) -> Type["Categorical"]: ------- type """ - from pandas import Categorical # noqa: F811 + from pandas import Categorical return Categorical @@ -633,7 +651,8 @@ class DatetimeTZDtype(PandasExtensionDtype): def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None): if isinstance(unit, DatetimeTZDtype): - unit, tz = unit.unit, unit.tz # type: ignore + # error: "str" has no attribute "tz" + unit, tz = unit.unit, unit.tz # type: ignore[attr-defined] if unit != "ns": if isinstance(unit, str) and tz is None: @@ -685,7 +704,7 @@ def construct_array_type(cls) -> Type["DatetimeArray"]: ------- type """ - from pandas.core.arrays import DatetimeArray # noqa: F811 + from pandas.core.arrays import DatetimeArray return DatetimeArray @@ -892,6 +911,9 @@ def __eq__(self, other: Any) -> bool: return isinstance(other, PeriodDtype) and self.freq == other.freq + def __ne__(self, other: Any) -> bool: + return not self.__eq__(other) + def __setstate__(self, state): # for pickle compat. __getstate__ is defined in the # PandasExtensionDtype superclass and uses the public properties to @@ -938,7 +960,8 @@ def __from_arrow__( """ Construct PeriodArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow # noqa: F811 + import pyarrow + from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -993,11 +1016,7 @@ class IntervalDtype(PandasExtensionDtype): _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, subtype=None): - from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_string_dtype, - pandas_dtype, - ) + from pandas.core.dtypes.common import is_string_dtype, pandas_dtype if isinstance(subtype, IntervalDtype): return subtype @@ -1020,7 +1039,7 @@ def __new__(cls, subtype=None): except TypeError as err: raise TypeError("could not construct IntervalDtype") from err - if is_categorical_dtype(subtype) or is_string_dtype(subtype): + if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype): # GH 19016 msg = ( "category, object, and string subtypes are not supported " @@ -1135,7 +1154,8 @@ def __from_arrow__( """ Construct IntervalArray from pyarrow Array/ChunkedArray. """ - import pyarrow # noqa: F811 + import pyarrow + from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 36eff214fc314..dfbbaa9c1784a 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -1,4 +1,24 @@ """ define generic base classes for pandas objects """ +from __future__ import annotations + +from typing import TYPE_CHECKING, Type, cast + +if TYPE_CHECKING: + from pandas import ( + CategoricalIndex, + DataFrame, + DatetimeIndex, + Float64Index, + Int64Index, + IntervalIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + Series, + TimedeltaIndex, + UInt64Index, + ) + from pandas.core.generic import NDFrame # define abstract base classes to enable isinstance type checking on our @@ -7,33 +27,54 @@ def create_pandas_abc_type(name, attr, comp): # https://github.com/python/mypy/issues/1006 # error: 'classmethod' used with a non-method - @classmethod # type: ignore + @classmethod # type: ignore[misc] def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp - dct = dict(__instancecheck__=_check, __subclasscheck__=_check) + dct = {"__instancecheck__": _check, "__subclasscheck__": _check} meta = type("ABCBase", (type,), dct) - return meta(name, tuple(), dct) + return meta(name, (), dct) -ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) -ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) -ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) -ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) -ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) -ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) -ABCDatetimeIndex = create_pandas_abc_type( - "ABCDatetimeIndex", "_typ", ("datetimeindex",) +ABCInt64Index = cast( + "Type[Int64Index]", + create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)), +) +ABCUInt64Index = cast( + "Type[UInt64Index]", + create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)), +) +ABCRangeIndex = cast( + "Type[RangeIndex]", + create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)), +) +ABCFloat64Index = cast( + "Type[Float64Index]", + create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)), +) +ABCMultiIndex = cast( + "Type[MultiIndex]", + create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)), ) -ABCTimedeltaIndex = 
create_pandas_abc_type( - "ABCTimedeltaIndex", "_typ", ("timedeltaindex",) +ABCDatetimeIndex = cast( + "Type[DatetimeIndex]", + create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)), ) -ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) -ABCCategoricalIndex = create_pandas_abc_type( - "ABCCategoricalIndex", "_typ", ("categoricalindex",) +ABCTimedeltaIndex = cast( + "Type[TimedeltaIndex]", + create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)), ) -ABCIntervalIndex = create_pandas_abc_type( - "ABCIntervalIndex", "_typ", ("intervalindex",) +ABCPeriodIndex = cast( + "Type[PeriodIndex]", + create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)), +) +ABCCategoricalIndex = cast( + "Type[CategoricalIndex]", + create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)), +) +ABCIntervalIndex = cast( + "Type[IntervalIndex]", + create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)), ) ABCIndexClass = create_pandas_abc_type( "ABCIndexClass", @@ -53,8 +94,17 @@ def _check(cls, inst) -> bool: }, ) -ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) -ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +ABCNDFrame = cast( + "Type[NDFrame]", + create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")), +) +ABCSeries = cast( + "Type[Series]", + create_pandas_abc_type("ABCSeries", "_typ", ("series",)), +) +ABCDataFrame = cast( + "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) +) ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index d1607b5ede6c3..329c4445b05bc 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -68,7 +68,7 @@ def is_number(obj) -> bool: return isinstance(obj, (Number, np.number)) -def _iterable_not_string(obj) -> bool: +def iterable_not_string(obj) -> bool: """ Check if the object is an iterable but not a string. 
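The `generic.py` hunks keep the runtime duck-type dispatch of `create_pandas_abc_type` but wrap each ABC in `typing.cast` so static checkers treat, for example, `ABCDataFrame` as `Type[DataFrame]`. A self-contained sketch of the pattern (the `Duck` names are illustrative, not part of pandas):

    from typing import Type, cast

    def create_abc_type(name: str, attr: str, comp: tuple) -> type:
        # isinstance()/issubclass() dispatch on a sentinel attribute rather
        # than on real inheritance, mirroring create_pandas_abc_type above.
        @classmethod  # type: ignore[misc]
        def _check(cls, inst) -> bool:
            return getattr(inst, attr, "_typ") in comp

        dct = {"__instancecheck__": _check, "__subclasscheck__": _check}
        meta = type("ABCBase", (type,), dct)
        return meta(name, (), dct)

    class Duck:
        _typ = "duck"

    # cast() is a no-op at runtime; it only narrows the type for mypy.
    ABCDuck = cast("Type[Duck]", create_abc_type("ABCDuck", "_typ", ("duck",)))
    assert isinstance(Duck(), ABCDuck)
    assert not isinstance(object(), ABCDuck)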
@@ -83,11 +83,11 @@ def _iterable_not_string(obj) -> bool: Examples -------- - >>> _iterable_not_string([1, 2, 3]) + >>> iterable_not_string([1, 2, 3]) True - >>> _iterable_not_string("foo") + >>> iterable_not_string("foo") False - >>> _iterable_not_string(1) + >>> iterable_not_string(1) False """ return isinstance(obj, abc.Iterable) and not isinstance(obj, str) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 75188ad5b00eb..0b4aab0ac9d88 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,8 +9,8 @@ from pandas._libs import lib import pandas._libs.missing as libmissing -from pandas._libs.tslibs import NaT, iNaT -from pandas._typing import DtypeObj +from pandas._libs.tslibs import NaT, Period, iNaT +from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -43,6 +43,9 @@ isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar +nan_checker = np.isnan +INF_AS_NA = False + def isna(obj): """ @@ -188,6 +191,12 @@ def _use_inf_as_na(key): """ inf_as_na = get_option(key) globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na) + if inf_as_na: + globals()["nan_checker"] = lambda x: ~np.isfinite(x) + globals()["INF_AS_NA"] = True + else: + globals()["nan_checker"] = np.isnan + globals()["INF_AS_NA"] = False def _isna_ndarraylike(obj, inf_as_na: bool = False): @@ -338,7 +347,7 @@ def notna(obj): notnull = notna -def _isna_compat(arr, fill_value=np.nan) -> bool: +def isna_compat(arr, fill_value=np.nan) -> bool: """ Parameters ---------- @@ -355,7 +364,9 @@ def _isna_compat(arr, fill_value=np.nan) -> bool: return True -def array_equivalent(left, right, strict_nan: bool = False) -> bool: +def array_equivalent( + left, right, strict_nan: bool = False, dtype_equal: bool = False +) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and @@ -368,6 +379,12 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. + dtype_equal : bool, default False + Whether `left` and `right` are known to have the same dtype + according to `is_dtype_equal`. Some methods like `BlockManager.equals`. + require that the dtypes match. Setting this to ``True`` can improve + performance, but will give different results for arrays that are + equal but different dtypes. Returns ------- @@ -391,43 +408,28 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: if left.shape != right.shape: return False + if dtype_equal: + # fastpath when we require that the dtypes match (Block.equals) + if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): + return _array_equivalent_float(left, right) + elif is_datetimelike_v_numeric(left.dtype, right.dtype): + return False + elif needs_i8_conversion(left.dtype): + return _array_equivalent_datetimelike(left, right) + elif is_string_dtype(left.dtype): + # TODO: fastpath for pandas' StringDtype + return _array_equivalent_object(left, right, strict_nan) + else: + return np.array_equal(left, right) + + # Slow path when we allow comparing different dtypes. # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left.dtype) or is_string_dtype(right.dtype): - - if not strict_nan: - # isna considers NaN and None to be equivalent. 
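The module-level `nan_checker` and `INF_AS_NA` globals introduced above are swapped by the `_use_inf_as_na` callback so hot paths need not re-read the option. The observable behavior is that of the long-standing `mode.use_inf_as_na` option, not something new in this patch:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.inf])
    print(s.isna().tolist())  # [False, False]: inf is not NA by default

    with pd.option_context("mode.use_inf_as_na", True):
        # the option callback swaps nan_checker to ~np.isfinite
        print(s.isna().tolist())  # [False, True]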
- return lib.array_equivalent_object( - ensure_object(left.ravel()), ensure_object(right.ravel()) - ) - - for left_value, right_value in zip(left, right): - if left_value is NaT and right_value is not NaT: - return False - - elif left_value is libmissing.NA and right_value is not libmissing.NA: - return False - - elif isinstance(left_value, float) and np.isnan(left_value): - if not isinstance(right_value, float) or not np.isnan(right_value): - return False - else: - try: - if np.any(np.asarray(left_value != right_value)): - return False - except TypeError as err: - if "Cannot compare tz-naive" in str(err): - # tzawareness compat failure, see GH#28507 - return False - elif "boolean value of NA is ambiguous" in str(err): - return False - raise - return True + return _array_equivalent_object(left, right, strict_nan) # NaNs can occur in float and complex arrays. if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): - - # empty if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() @@ -452,7 +454,58 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: return np.array_equal(left, right) -def _infer_fill_value(val): +def _array_equivalent_float(left, right): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + +def _array_equivalent_datetimelike(left, right): + return np.array_equal(left.view("i8"), right.view("i8")) + + +def _array_equivalent_object(left, right, strict_nan): + if not strict_nan: + # isna considers NaN and None to be equivalent. + return lib.array_equivalent_object( + ensure_object(left.ravel()), ensure_object(right.ravel()) + ) + + for left_value, right_value in zip(left, right): + if left_value is NaT and right_value is not NaT: + return False + + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if not isinstance(right_value, float) or not np.isnan(right_value): + return False + else: + try: + if np.any(np.asarray(left_value != right_value)): + return False + except TypeError as err: + if "Cannot compare tz-naive" in str(err): + # tzawareness compat failure, see GH#28507 + return False + elif "boolean value of NA is ambiguous" in str(err): + return False + raise + return True + + +def array_equals(left: ArrayLike, right: ArrayLike) -> bool: + """ + ExtensionArray-compatible implementation of array_equivalent. 
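For context, `array_equivalent` treats NaNs in matching positions as equal, which plain elementwise comparison does not; `dtype_equal=True` only opts into the fastpaths factored out above when the caller already knows the dtypes match, as `Block.equals` does. A small sketch against the private API:

    import numpy as np
    from pandas.core.dtypes.missing import array_equivalent  # private API

    a = np.array([1.0, np.nan])
    b = np.array([1.0, np.nan])
    print((a == b).all())          # False: nan != nan elementwise
    print(array_equivalent(a, b))  # True: matching-position NaNs are equal
    # Same answer via the float fastpath; valid only because dtypes match.
    print(array_equivalent(a, b, dtype_equal=True))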
+ """ + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ABCExtensionArray): + return left.equals(right) + else: + return array_equivalent(left, right, dtype_equal=True) + + +def infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -472,11 +525,11 @@ def _infer_fill_value(val): return np.nan -def _maybe_fill(arr, fill_value=np.nan): +def maybe_fill(arr, fill_value=np.nan): """ if we have a compatible fill_value and arr dtype, then fill """ - if _isna_compat(arr, fill_value): + if isna_compat(arr, fill_value): arr.fill(fill_value) return arr @@ -555,6 +608,37 @@ def is_valid_nat_for_dtype(obj, dtype: DtypeObj) -> bool: return not isinstance(obj, np.timedelta64) if dtype.kind == "m": return not isinstance(obj, np.datetime64) + if dtype.kind in ["i", "u", "f", "c"]: + # Numeric + return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) # must be PeriodDType return not isinstance(obj, (np.datetime64, np.timedelta64)) + + +def isna_all(arr: ArrayLike) -> bool: + """ + Optimized equivalent to isna(arr).all() + """ + total_len = len(arr) + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. + # parameters 1000 and 40 were chosen arbitrarily + chunk_len = max(total_len // 40, 1000) + + dtype = arr.dtype + if dtype.kind == "f": + checker = nan_checker + + elif dtype.kind in ["m", "M"] or dtype.type is Period: + checker = lambda x: np.asarray(x.view("i8")) == iNaT + + else: + checker = lambda x: _isna_ndarraylike(x, inf_as_na=INF_AS_NA) + + for i in range(0, total_len, chunk_len): + if not checker(arr[i : i + chunk_len]).all(): + return False + + return True diff --git a/pandas/core/flags.py b/pandas/core/flags.py new file mode 100644 index 0000000000000..6a09bfa3bd082 --- /dev/null +++ b/pandas/core/flags.py @@ -0,0 +1,113 @@ +import weakref + + +class Flags: + """ + Flags that apply to pandas objects. + + .. versionadded:: 1.2.0 + + Parameters + ---------- + obj : Series or DataFrame + The object these flags are associated with. + allows_duplicate_labels : bool, default True + Whether to allow duplicate labels in this object. By default, + duplicate labels are permitted. Setting this to ``False`` will + cause an :class:`errors.DuplicateLabelError` to be raised when + `index` (or columns for DataFrame) is not unique, or any + subsequent operation on introduces duplicates. + See :ref:`duplicates.disallow` for more. + + .. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. + + Notes + ----- + Attributes can be set in two ways + + >>> df = pd.DataFrame() + >>> df.flags + + >>> df.flags.allows_duplicate_labels = False + >>> df.flags + + + >>> df.flags['allows_duplicate_labels'] = True + >>> df.flags + + """ + + _keys = {"allows_duplicate_labels"} + + def __init__(self, obj, *, allows_duplicate_labels): + self._allows_duplicate_labels = allows_duplicate_labels + self._obj = weakref.ref(obj) + + @property + def allows_duplicate_labels(self) -> bool: + """ + Whether this object allows duplicate labels. + + Setting ``allows_duplicate_labels=False`` ensures that the + index (and columns of a DataFrame) are unique. 
Most methods + that accept and return a Series or DataFrame will propagate + the value of ``allows_duplicate_labels``. + + See :ref:`duplicates` for more. + + See Also + -------- + DataFrame.attrs : Set global metadata on this object. + DataFrame.set_flags : Set global flags on this object. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df.allows_duplicate_labels + True + >>> df.allows_duplicate_labels = False + Traceback (most recent call last): + ... + pandas.errors.DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] + """ + return self._allows_duplicate_labels + + @allows_duplicate_labels.setter + def allows_duplicate_labels(self, value: bool): + value = bool(value) + obj = self._obj() + if obj is None: + raise ValueError("This flag's object has been deleted.") + + if not value: + for ax in obj.axes: + ax._maybe_check_unique() + + self._allows_duplicate_labels = value + + def __getitem__(self, key): + if key not in self._keys: + raise KeyError(key) + + return getattr(self, key) + + def __setitem__(self, key, value): + if key not in self._keys: + raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}") + setattr(self, key, value) + + def __repr__(self): + return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>" + + def __eq__(self, other): + if isinstance(other, type(self)): + return self.allows_duplicate_labels == other.allows_duplicate_labels + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cfe5621fec14e..de60cda382fba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8,24 +8,26 @@ alignment and a host of useful data manipulation methods having to do with the labeling information """ +from __future__ import annotations import collections from collections import abc import datetime from io import StringIO import itertools +import mmap from textwrap import dedent from typing import ( IO, TYPE_CHECKING, Any, + AnyStr, Dict, FrozenSet, Hashable, Iterable, Iterator, List, - Mapping, Optional, Sequence, Set, @@ -33,6 +35,7 @@ Type, Union, cast, + overload, ) import warnings @@ -44,9 +47,11 @@ from pandas._libs import algos as libalgos, lib, properties from pandas._libs.lib import no_default from pandas._typing import ( + AggFuncType, ArrayLike, Axes, Axis, + CompressionOptions, Dtype, FilePathOrBuffer, FrameOrSeriesUnion, @@ -54,9 +59,9 @@ Label, Level, Renamer, + StorageOptions, ValueKeyFunc, ) -from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -73,18 +78,17 @@ ) from pandas.core.dtypes.cast import ( - cast_scalar_to_array, - coerce_to_dtypes, construct_1d_arraylike_from_scalar, find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, + maybe_box_datetimelike, maybe_cast_to_datetime, + maybe_casted_values, maybe_convert_platform, maybe_downcast_to_dtype, maybe_infer_to_datetimelike, maybe_upcast, - maybe_upcast_putmask, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -97,6 +101,7 @@ is_dict_like, is_dtype_equal, is_extension_array_dtype, + is_float, is_float_dtype, is_hashable, is_integer, @@ -107,23 +112,32 @@ is_object_dtype, is_scalar, is_sequence, - needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna +from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms, common as com, nanops, ops +from pandas.core import algorithms, common as com, generic, nanops, ops from
pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import reconstruct_func, relabel_result +from pandas.core.aggregation import ( + aggregate, + reconstruct_func, + relabel_result, + transform, +) +from pandas.core.arraylike import OpsMixin from pandas.core.arrays import Categorical, ExtensionArray -from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase -from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences -from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + PeriodIndex, + ensure_index, + ensure_index_from_sequences, +) from pandas.core.indexes.multi import MultiIndex, maybe_droplevels -from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( @@ -139,46 +153,44 @@ ) from pandas.core.reshape.melt import melt from pandas.core.series import Series -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_handle from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import DataFrameInfo +from pandas.io.formats.info import BaseInfo, DataFrameInfo import pandas.plotting if TYPE_CHECKING: + from typing import Literal + from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.io.formats.style import Styler # --------------------------------------------------------------------- # Docstring templates -_shared_doc_kwargs = dict( - axes="index, columns", - klass="DataFrame", - axes_single_arg="{0 or 'index', 1 or 'columns'}", - axis="""axis : {0 or 'index', 1 or 'columns'}, default 0 +_shared_doc_kwargs = { + "axes": "index, columns", + "klass": "DataFrame", + "axes_single_arg": "{0 or 'index', 1 or 'columns'}", + "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index': apply function to each column. If 1 or 'columns': apply function to each row.""", - optional_by=""" + "optional_by": """ by : str or list of str Name or list of names to sort by. - if `axis` is 0 or `'index'` then `by` may contain index levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column - levels and/or index labels. - - .. versionchanged:: 0.23.0 - - Allow specifying index or column level names.""", - versionadded_to_excel="", - optional_labels="""labels : array-like, optional + levels and/or index labels.""", + "optional_labels": """labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", - optional_axis="""axis : int or str, optional + "optional_axis": """axis : int or str, optional Axis to target. Can be either the axis name ('index', 'columns') or number (0, 1).""", -) +} _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. If None, will attempt to use @@ -191,12 +203,14 @@ The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. 
Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -207,6 +221,11 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -327,6 +346,44 @@ ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 """ @@ -334,7 +391,7 @@ # DataFrame class -class DataFrame(NDFrame): +class DataFrame(NDFrame, OpsMixin): """ Two-dimensional, size-mutable, potentially heterogeneous tabular data. @@ -346,15 +403,11 @@ class DataFrame(NDFrame): Parameters ---------- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame - Dict can contain Series, arrays, constants, or list-like objects. - - .. versionchanged:: 0.23.0 - If data is a dict, column order follows insertion-order for - Python 3.6 and later. + Dict can contain Series, arrays, constants, dataclass or list-like objects. If + data is a dict, column order follows insertion-order. .. versionchanged:: 0.25.0 - If data is a list of dicts, column order follows insertion-order - for Python 3.6 and later. + If data is a list of dicts, column order follows insertion-order. index : Index or array-like Index to use for resulting frame. 
Will default to RangeIndex if @@ -410,17 +463,28 @@ class DataFrame(NDFrame): 0 1 2 3 1 4 5 6 2 7 8 9 + + Constructing DataFrame from dataclass: + + >>> from dataclasses import make_dataclass + >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) + >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + x y + 0 0 0 + 1 0 3 + 2 2 3 """ _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) @property - def _constructor(self) -> Type["DataFrame"]: + def _constructor(self) -> Type[DataFrame]: return DataFrame _constructor_sliced: Type[Series] = Series - _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) + _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) _accessors: Set[str] = {"sparse"} @property @@ -458,7 +522,7 @@ def __init__( return mgr = self._init_mgr( - data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy + data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): @@ -550,9 +614,8 @@ def __init__( if arr.ndim != 0: raise ValueError("DataFrame constructor not properly called!") - values = cast_scalar_to_array( - (len(index), len(columns)), data, dtype=dtype - ) + shape = (len(index), len(columns)) + values = np.full(shape, arr) mgr = init_ndarray( values, index, columns, dtype=values.dtype, copy=False @@ -586,7 +649,7 @@ def shape(self) -> Tuple[int, int]: See Also -------- - ndarray.shape + ndarray.shape : Tuple of array dimensions. Examples -------- @@ -635,7 +698,6 @@ def _is_homogeneous_type(self) -> bool: if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: - # Note: consolidates inplace return not self._is_mixed_type @property @@ -643,10 +705,10 @@ def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. """ - if self._data.any_extension_types: + if self._mgr.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False - return len(self._data.blocks) == 1 + return len(self._mgr.blocks) == 1 # ---------------------------------------------------------------------- # Rendering Methods @@ -707,7 +769,7 @@ def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool: d.to_string(buf=buf) value = buf.getvalue() - repr_width = max(len(l) for l in value.split("\n")) + repr_width = max(len(line) for line in value.split("\n")) return repr_width < width @@ -789,10 +851,8 @@ def _repr_html_(self) -> Optional[str]: max_cols=max_cols, show_dimensions=show_dimensions, decimal=".", - table_id=None, - render_links=False, ) - return formatter.to_html(notebook=True) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) else: return None @@ -875,14 +935,17 @@ def to_string( max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, line_width=line_width, ) - return formatter.to_string(buf=buf, encoding=encoding) # ---------------------------------------------------------------------- @property - def style(self) -> "Styler": + def style(self) -> Styler: """ Returns a Styler object. @@ -971,9 +1034,6 @@ def iterrows(self) -> Iterable[Tuple[Label, Series]]: data : Series The data of the row as a Series. - it : generator - A generator that iterates over the rows of the frame. 
- See Also -------- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. @@ -1011,7 +1071,7 @@ def iterrows(self) -> Iterable[Tuple[Label, Series]]: s = klass(v, index=columns, name=k) yield k, s - def itertuples(self, index=True, name="Pandas"): + def itertuples(self, index: bool = True, name: Optional[str] = "Pandas"): """ Iterate over DataFrame rows as namedtuples. @@ -1084,10 +1144,12 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python versions before 3.7 support at most 255 arguments to constructors - can_return_named_tuples = PY37 or len(self.columns) + index < 255 - if name is not None and can_return_named_tuples: - itertuple = collections.namedtuple(name, fields, rename=True) + if name is not None: + # https://github.com/python/mypy/issues/9046 + # error: namedtuple() expects a string literal as the first argument + itertuple = collections.namedtuple( # type: ignore[misc] + name, fields, rename=True + ) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples @@ -1219,13 +1281,20 @@ def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - return self.T.dot(np.transpose(other)).T + try: + return self.T.dot(np.transpose(other)).T + except ValueError as err: + if "shape mismatch" not in str(err): + raise + # GH#21581 give exception message for original shapes + msg = f"shapes {np.shape(other)} and {self.shape} not aligned" + raise ValueError(msg) from err # ---------------------------------------------------------------------- # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. @@ -1246,8 +1315,6 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra Column labels to use when ``orient='index'``. Raises a ValueError if used with ``orient='columns'``. - .. versionadded:: 0.23.0 - Returns ------- DataFrame @@ -1368,6 +1435,8 @@ def to_numpy( result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) + if result.dtype is not dtype: + result = np.array(result, dtype=dtype, copy=False) return result @@ -1512,7 +1581,7 @@ def to_dict(self, orient="dict", into=dict): ( "data", [ - list(map(com.maybe_box_datetimelike, t)) + list(map(maybe_box_datetimelike, t)) for t in self.itertuples(index=False, name=None) ], ), @@ -1520,7 +1589,7 @@ def to_dict(self, orient="dict", into=dict): ) elif orient == "series": - return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) + return into_c((k, maybe_box_datetimelike(v)) for k, v in self.items()) elif orient == "records": columns = self.columns.tolist() @@ -1529,7 +1598,7 @@ def to_dict(self, orient="dict", into=dict): for row in self.itertuples(index=False, name=None) ) return [ - into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) + into_c((k, maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] @@ -1661,7 +1730,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, - ) -> "DataFrame": + ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. 
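The `__rmatmul__` change re-raises NumPy's shape-mismatch error using the original operand shapes (GH#21581). A sketch of the user-visible effect, assuming this patch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.ones((3, 2)))
    other = np.ones((5, 4))  # incompatible with a 3x2 frame under "@"
    try:
        other @ df  # NumPy defers to DataFrame.__rmatmul__
    except ValueError as err:
        print(err)  # shapes (5, 4) and (3, 2) not aligned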
@@ -1769,13 +1838,13 @@ def from_records( arrays = [data[k] for k in columns] else: arrays = [] - arr_columns = [] + arr_columns_list = [] for k, v in data.items(): if k in columns: - arr_columns.append(k) + arr_columns_list.append(k) arrays.append(v) - arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns_list, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) @@ -2002,7 +2071,7 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, - ) -> "DataFrame": + ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2040,6 +2109,7 @@ def _from_arrays( ) return cls(mgr) + @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, @@ -2052,7 +2122,8 @@ def to_stata( variable_labels: Optional[Dict[Label, str]] = None, version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, ) -> None: """ Export DataFrame object to Stata dta format. @@ -2091,7 +2162,7 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {114, 117, 118, 119, None}, default 114 + version : {{114, 117, 118, 119, None}}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of columns in the frame. Version 114 can be read by Stata 10 and @@ -2103,7 +2174,11 @@ def to_stata( support Unicode characters, and version 119 supports more than 32,767 variables. - .. versionadded:: 0.23.0 + Version 119 should usually only be used when the number of + variables exceeds the capacity of dta format 118. Exporting + smaller datasets in format 119 may have unintended consequences, + and, as of November 2020, Stata SE cannot read version 119 files. + .. versionchanged:: 1.0.0 Added support for formats 118 and 119. @@ -2113,22 +2188,23 @@ def to_stata( format. Only available if version is 117. Storing strings in the StrL format can produce smaller dta files if strings have more than 8 characters and values are repeated. - - .. versionadded:: 0.23.0 - compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies - compression mode. Compression mode must be one of {'infer', 'gzip', - 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and + compression mode. Compression mode must be one of {{'infer', 'gzip', + 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and `fname` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - compression). If dict and compression mode is one of {'zip', - 'gzip', 'bz2'}, or inferred as one of the above, other entries + compression). If dict and compression mode is one of {{'zip', + 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. .. versionadded:: 1.1.0 + {storage_options} + + .. 
versionadded:: 1.2.0 + Raises ------ NotImplementedError @@ -2148,9 +2224,9 @@ def to_stata( Examples -------- - >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', + >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}) + ... 'speed': [350, 18, 361, 15]}}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): @@ -2161,10 +2237,14 @@ def to_stata( from pandas.io.stata import StataWriter as statawriter elif version == 117: # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriter117 as statawriter # type: ignore + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriter117 as statawriter, + ) else: # versions 118 and 119 # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriterUTF8 as statawriter # type: ignore + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriterUTF8 as statawriter, + ) kwargs: Dict[str, Any] = {} if version is None or version >= 117: @@ -2175,7 +2255,7 @@ def to_stata( kwargs["version"] = version # mypy: Too many arguments for "StataWriter" - writer = statawriter( # type: ignore + writer = statawriter( # type: ignore[call-arg] path, self, convert_dates=convert_dates, @@ -2185,19 +2265,20 @@ def to_stata( write_index=write_index, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, **kwargs, ) writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path, **kwargs) -> None: + def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: """ Write a DataFrame to the binary Feather format. Parameters ---------- - path : str - String file path. + path : str or file-like object + If a string, it will be used as Root Directory path. **kwargs : Additional keywords passed to :func:`pyarrow.feather.write_feather`. Starting with pyarrow 0.17, this includes the `compression`, @@ -2212,6 +2293,7 @@ def to_feather(self, path, **kwargs) -> None: @doc( Series.to_markdown, klass=_shared_doc_kwargs["klass"], + storage_options=_shared_docs["storage_options"], examples="""Examples -------- >>> df = pd.DataFrame( @@ -2236,29 +2318,46 @@ def to_feather(self, path, **kwargs) -> None: """, ) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + self, + buf: Optional[Union[IO[str], str]] = None, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions = None, + **kwargs, ) -> Optional[str]: + if "showindex" in kwargs: + warnings.warn( + "'showindex' is deprecated. Only 'index' will be used " + "in a future version. Use 'index' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") + kwargs.setdefault("showindex", index) tabulate = import_optional_dependency("tabulate") result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) - assert buf is not None # Help mypy. 
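Following the `to_markdown` changes above (output written through `get_handle`, and a new `index` keyword superseding tabulate's `showindex`), usage looks roughly like this; the optional `tabulate` dependency is assumed to be installed:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # Renders a pipe-style markdown table; index=False drops the index column.
    print(df.to_markdown(index=False))
    # Passing showindex=... still works but now emits a FutureWarning.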
- buf.writelines(result) + + with get_handle(buf, mode, storage_options=storage_options) as handles: + assert not isinstance(handles.handle, (str, mmap.mmap)) + handles.handle.writelines(result) return None + @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, - path, - engine="auto", - compression="snappy", - index=None, - partition_cols=None, + path: Optional[FilePathOrBuffer] = None, + engine: str = "auto", + compression: Optional[str] = "snappy", + index: Optional[bool] = None, + partition_cols: Optional[List[str]] = None, + storage_options: StorageOptions = None, **kwargs, - ) -> None: + ) -> Optional[bytes]: """ Write a DataFrame to the binary parquet format. @@ -2269,20 +2368,24 @@ def to_parquet( Parameters ---------- - path : str - File path or Root Directory path. Will be used as Root Directory - path while writing a partitioned dataset. + path : str or file-like object, default None + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handle + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. If path is None, + a bytes object is returned. - .. versionchanged:: 1.0.0 + .. versionchanged:: 1.2.0 Previously this was "fname" - engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' + engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. - compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' + compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. @@ -2298,13 +2401,22 @@ def to_parquet( partition_cols : list, optional, default None Column names by which to partition the dataset. Columns are partitioned in the order they are given. + Must be None if path is not a string. .. versionadded:: 0.24.0 + {storage_options} + + .. versionadded:: 1.2.0 + **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io <io.parquet>` for more details. + Returns + ------- + bytes if no path argument is provided else None + See Also -------- read_parquet : Read a parquet file. @@ -2320,7 +2432,7 @@ def to_parquet( Examples -------- - >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) >>> df.to_parquet('df.parquet.gzip', ... compression='gzip') # doctest: +SKIP >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP @@ -2340,13 +2452,14 @@ def to_parquet( """ from pandas.io.parquet import to_parquet - to_parquet( + return to_parquet( self, path, engine, compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) @@ -2407,9 +2520,6 @@ def to_html( table_id : str, optional A css id is included in the opening ``<table>`` tag if specified. - - .. versionadded:: 0.23.0 - render_links : bool, default False Convert URLs to HTML links.
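With `path` now optional (hunk above), `to_parquet` can hand back the serialized payload directly; a sketch assuming pyarrow (or fastparquet) is installed:

    import pandas as pd

    df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    payload = df.to_parquet()  # path=None -> bytes, new in 1.2.0
    print(type(payload))       # <class 'bytes'>
    # Writing to a path still returns None:
    # df.to_parquet("df.parquet.gzip", compression="gzip")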
@@ -2427,45 +2537,57 @@ def to_html( columns=columns, col_space=col_space, na_rep=na_rep, + header=header, + index=index, formatters=formatters, float_format=float_format, + bold_rows=bold_rows, sparsify=sparsify, justify=justify, index_names=index_names, - header=header, - index=index, - bold_rows=bold_rows, escape=escape, + decimal=decimal, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, - decimal=decimal, - table_id=table_id, - render_links=render_links, ) # TODO: a generic formatter wld b in DataFrameFormatter - return formatter.to_html( + return fmt.DataFrameRenderer(formatter).to_html( buf=buf, classes=classes, notebook=notebook, border=border, encoding=encoding, + table_id=table_id, + render_links=render_links, ) # ---------------------------------------------------------------------- @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=( - """max_cols : int, optional + max_cols_sub=dedent( + """\ + max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. - """ + ``pandas.options.display.max_info_columns`` is used.""" ), - examples_sub=( - """ + show_counts_sub=dedent( + """\ + show_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + null_counts : bool, optional + .. deprecated:: 1.2.0 + Use show_counts instead.""" + ), + examples_sub=dedent( + """\ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -2546,27 +2668,45 @@ def to_html( 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) - memory usage: 188.8 MB""" + memory usage: 165.9 MB""" ), - see_also_sub=( - """ + see_also_sub=dedent( + """\ DataFrame.describe: Generate descriptive statistics of DataFrame columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), + version_added_sub="", ) - @doc(DataFrameInfo.info) + @doc(BaseInfo.render) def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, + show_counts: Optional[bool] = None, null_counts: Optional[bool] = None, ) -> None: - return DataFrameInfo( - self, verbose, buf, max_cols, memory_usage, null_counts - ).info() + if null_counts is not None: + if show_counts is not None: + raise ValueError("null_counts used with show_counts. Use show_counts.") + warnings.warn( + "null_counts is deprecated. Use show_counts instead", + FutureWarning, + stacklevel=2, + ) + show_counts = null_counts + info = DataFrameInfo( + data=self, + memory_usage=memory_usage, + ) + info.render( + buf=buf, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) def memory_usage(self, index=True, deep=False) -> Series: """ @@ -2607,16 +2747,16 @@ def memory_usage(self, index=True, deep=False) -> Series: Examples -------- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000).astype(t)) + >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) ... 
for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool - 0 1 1.0 1.000000+0.000000j 1 True - 1 1 1.0 1.000000+0.000000j 1 True - 2 1 1.0 1.000000+0.000000j 1 True - 3 1 1.0 1.000000+0.000000j 1 True - 4 1 1.0 1.000000+0.000000j 1 True + 0 1 1.0 1.0+0.0j 1 True + 1 1 1.0 1.0+0.0j 1 True + 2 1 1.0 1.0+0.0j 1 True + 3 1 1.0 1.0+0.0j 1 True + 4 1 1.0 1.0+0.0j 1 True >>> df.memory_usage() Index 128 @@ -2642,7 +2782,7 @@ def memory_usage(self, index=True, deep=False) -> Series: int64 40000 float64 40000 complex128 80000 - object 160000 + object 180000 bool 5000 dtype: int64 @@ -2650,7 +2790,7 @@ def memory_usage(self, index=True, deep=False) -> Series: many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5216 + 5244 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -2662,7 +2802,7 @@ def memory_usage(self, index=True, deep=False) -> Series: ).append(result) return result - def transpose(self, *args, copy: bool = False) -> "DataFrame": + def transpose(self, *args, copy: bool = False) -> DataFrame: """ Transpose index and columns. @@ -2741,7 +2881,7 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": >>> df2_transposed 0 1 name Alice Bob - score 9.5 8 + score 9.5 8.0 employed False True kids 0 0 @@ -2759,7 +2899,7 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": 1 object dtype: object """ - nv.validate_transpose(args, dict()) + nv.validate_transpose(args, {}) # construct the args dtypes = list(self.dtypes) @@ -2785,7 +2925,7 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": return result.__finalize__(self, method="transpose") @property - def T(self) -> "DataFrame": + def T(self) -> DataFrame: return self.transpose() # ---------------------------------------------------------------------- @@ -2834,7 +2974,7 @@ def _get_column_array(self, i: int) -> ArrayLike: Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) """ - return self._data.iget_values(i) + return self._mgr.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ @@ -2851,13 +2991,17 @@ def __getitem__(self, key): if is_hashable(key): # shortcut if the key is in columns if self.columns.is_unique and key in self.columns: - if self.columns.nlevels > 1: + if isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) return self._get_item_cache(key) # Do we have a slicer (on rows)? 
indexer = convert_to_index_sliceable(self, key) if indexer is not None: + if isinstance(indexer, np.ndarray): + indexer = lib.maybe_indices_to_slice( + indexer.astype(np.intp, copy=False), len(self) + ) # either we have a slice or we have a string that can be converted # to a slice for partial-string date indexing return self._slice(indexer, axis=0) @@ -2897,7 +3041,8 @@ def __getitem__(self, key): # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): - data = data[key] + # GH#26490 using data[key] can cause RecursionError + data = data._get_item_cache(key) return data @@ -3022,7 +3167,7 @@ def _setitem_slice(self, key: slice, value): # operates on labels and we need to operate positional for # backwards-compat, xref GH#31469 self._check_setitem_copy() - self.iloc._setitem_with_indexer(key, value) + self.iloc[key] = value def _setitem_array(self, key, value): # also raises Exception if object array with NA values @@ -3034,7 +3179,7 @@ def _setitem_array(self, key, value): key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() - self.iloc._setitem_with_indexer(indexer, value) + self.iloc[indexer] = value else: if isinstance(value, DataFrame): if len(value.columns) != len(key): @@ -3042,12 +3187,12 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: - self.loc._ensure_listlike_indexer(key, axis=1) + self.loc._ensure_listlike_indexer(key, axis=1, value=value) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] self._check_setitem_copy() - self.iloc._setitem_with_indexer((slice(None), indexer), value) + self.iloc[:, indexer] = value def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. @@ -3066,13 +3211,16 @@ def _setitem_frame(self, key, value): self._check_setitem_copy() self._where(-key, value, inplace=True) - def _iset_item(self, loc: int, value): - self._ensure_valid_index(value) + def _iset_item_mgr(self, loc: int, value) -> None: + self._mgr.iset(loc, value) + self._clear_item_cache() + + def _iset_item(self, loc: int, value, broadcast: bool = False): # technically _sanitize_column expects a label, not a position, # but the behavior is the same as long as we pass broadcast=False - value = self._sanitize_column(loc, value, broadcast=False) - NDFrame._iset_item(self, loc, value) + value = self._sanitize_column(loc, value, broadcast=broadcast) + self._iset_item_mgr(loc, value) # check if we are modifying a copy # try to set first as we want an invalid @@ -3090,9 +3238,15 @@ def _set_item(self, key, value): Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. 
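The `_setitem_*` refactor above routes assignment through the public `self.iloc[...] = value` instead of the private `_setitem_with_indexer`; user-visible semantics should be unchanged, e.g. boolean row masking:

    import pandas as pd

    df = pd.DataFrame({"a": [1, -2], "b": [-3, 4]})
    df[df["a"] < 0] = 0  # rows where a < 0 are overwritten wholesale
    print(df)
    #    a  b
    # 0  1 -3
    # 1  0  0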
""" - self._ensure_valid_index(value) value = self._sanitize_column(key, value) - NDFrame._set_item(self, key, value) + + try: + loc = self._info_axis.get_loc(key) + except KeyError: + # This item wasn't present, just insert at end + self._mgr.insert(len(self._info_axis), key, value) + else: + self._iset_item_mgr(loc, value) # check if we are modifying a copy # try to set first as we want an invalid @@ -3148,9 +3302,12 @@ def _ensure_valid_index(self, value): "and a value that cannot be converted to a Series" ) from err - self._mgr = self._mgr.reindex_axis( - value.index.copy(), axis=1, fill_value=np.nan - ) + # GH31368 preserve name of index + index_copy = value.index.copy() + if self.index.name is not None: + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) def _box_col_values(self, values, loc: int) -> Series: """ @@ -3178,11 +3335,12 @@ def query(self, expr, inplace=False, **kwargs): in the environment by prefixing them with an '@' character like ``@a + b``. - You can refer to column names that contain spaces or operators by - surrounding them in backticks. This way you can also escape - names that start with a digit, or those that are a Python keyword. - Basically when it is not valid Python identifier. See notes down - for more details. + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2) would + be referenced as `Area (cm^2)`). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. @@ -3202,8 +3360,9 @@ def query(self, expr, inplace=False, **kwargs): Returns ------- - DataFrame - DataFrame resulting from the provided query expression. + DataFrame or None + DataFrame resulting from the provided query expression or + None if ``inplace=True``. See Also -------- @@ -3350,8 +3509,8 @@ def eval(self, expr, inplace=False, **kwargs): Returns ------- - ndarray, scalar, or pandas object - The result of the evaluation. + ndarray, scalar, pandas object, or None + The result of the evaluation or None if ``inplace=True``. See Also -------- @@ -3445,7 +3604,7 @@ def eval(self, expr, inplace=False, **kwargs): return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None) -> "DataFrame": + def select_dtypes(self, include=None, exclude=None) -> DataFrame: """ Return a subset of the DataFrame's columns based on the column dtypes. @@ -3558,7 +3717,13 @@ def extract_unique_dtypes_from_dtypes_set( extracted_dtypes = [ unique_dtype for unique_dtype in unique_dtypes - if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore + # error: Argument 1 to "tuple" has incompatible type + # "FrozenSet[Union[ExtensionDtype, str, Any, Type[str], + # Type[float], Type[int], Type[complex], Type[bool]]]"; + # expected "Iterable[Union[type, Tuple[Any, ...]]]" + if issubclass( + unique_dtype.type, tuple(dtypes_set) # type: ignore[arg-type] + ) ] return extracted_dtypes @@ -3593,12 +3758,38 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: Label of the inserted column. value : int, Series, or array-like allow_duplicates : bool, optional + + See Also + -------- + Index.insert : Insert new item by index. 
+ + Examples + -------- + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + >>> df.insert(1, "newcol", [99, 99]) + >>> df + col1 newcol col2 + 0 1 99 3 + 1 2 99 4 + >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) + >>> df + col1 col1 newcol col2 + 0 100 1 99 3 + 1 100 2 99 4 """ - self._ensure_valid_index(value) + if allow_duplicates and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs) -> "DataFrame": + def assign(self, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. @@ -3627,10 +3818,6 @@ def assign(self, **kwargs) -> "DataFrame": Later items in '\*\*kwargs' may refer to newly created or modified columns in 'df'; items are computed and assigned into 'df' in order. - .. versionchanged:: 0.23.0 - - Keyword argument order is maintained. - Examples -------- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, @@ -3689,6 +3876,7 @@ def _sanitize_column(self, key, value, broadcast=True): ------- numpy.ndarray """ + self._ensure_valid_index(value) def reindexer(value): # reindex if necessary @@ -3755,15 +3943,11 @@ def reindexer(value): else: # cast ignores pandas dtypes. so save the dtype first - infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) + infer_dtype, fill_value = infer_dtype_from_scalar(value, pandas_dtype=True) - # upcast - if is_extension_array_dtype(infer_dtype): - value = construct_1d_arraylike_from_scalar( - value, len(self.index), infer_dtype - ) - else: - value = cast_scalar_to_array(len(self.index), value) + value = construct_1d_arraylike_from_scalar( + fill_value, len(self), infer_dtype + ) value = maybe_cast_to_datetime(value, infer_dtype) @@ -3792,10 +3976,15 @@ def _series(self): def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. - Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. + .. deprecated:: 1.2.0 + DataFrame.lookup is deprecated, + use DataFrame.melt and DataFrame.loc instead. + For an example see :meth:`~pandas.DataFrame.lookup` + in the user guide. + Parameters ---------- row_labels : sequence @@ -3808,6 +3997,14 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: numpy.ndarray The found values. """ + msg = ( + "The 'lookup' method is deprecated and will be" + "removed in a future version." + "You can use DataFrame.melt and DataFrame.loc" + "as a substitute." + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + n = len(row_labels) if n != len(col_labels): raise ValueError("Row labels must have same size as column labels") @@ -3896,7 +4093,7 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": + def _reindex_multi(self, axes, copy, fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. 
""" @@ -3929,7 +4126,7 @@ def align( limit=None, fill_axis=0, broadcast_axis=None, - ) -> "DataFrame": + ) -> DataFrame: return super().align( other, join=join, @@ -3998,7 +4195,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): ("tolerance", None), ], ) - def reindex(self, *args, **kwargs) -> "DataFrame": + def reindex(self, *args, **kwargs) -> DataFrame: axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names @@ -4048,8 +4245,9 @@ def drop( Returns ------- - DataFrame - DataFrame without the removed index or column labels. + DataFrame or None + DataFrame without the removed index or column labels or + None if ``inplace=True``. Raises ------ @@ -4160,7 +4358,7 @@ def rename( inplace: bool = False, level: Optional[Level] = None, errors: str = "ignore", - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Alter axes labels. @@ -4173,7 +4371,7 @@ def rename( Parameters ---------- mapper : dict-like or function - Dict-like or functions transformations to apply to + Dict-like or function transformations to apply to that axis' values. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and ``columns``. @@ -4203,8 +4401,8 @@ def rename( Returns ------- - DataFrame - DataFrame with the renamed axis labels. + DataFrame or None + DataFrame with the renamed axis labels or None if ``inplace=True``. Raises ------ @@ -4254,7 +4452,7 @@ def rename( Traceback (most recent call last): KeyError: ['C'] not found in axis - Using axis-style parameters + Using axis-style parameters: >>> df.rename(str.lower, axis='columns') a b @@ -4288,7 +4486,7 @@ def fillna( inplace=False, limit=None, downcast=None, - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: return super().fillna( value=value, method=method, @@ -4396,7 +4594,34 @@ def _replace_columnwise( return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + def shift( + self, periods=1, freq=None, axis=0, fill_value=lib.no_default + ) -> DataFrame: + axis = self._get_axis_number(axis) + + ncols = len(self.columns) + if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: + # We will infer fill_value to match the closest column + + if periods > 0: + result = self.iloc[:, :-periods] + for col in range(min(ncols, abs(periods))): + # TODO(EA2D): doing this in a loop unnecessary with 2D EAs + # Define filler inside loop so we get a copy + filler = self.iloc[:, 0].shift(len(self)) + result.insert(0, col, filler, allow_duplicates=True) + else: + result = self.iloc[:, -periods:] + for col in range(min(ncols, abs(periods))): + # Define filler inside loop so we get a copy + filler = self.iloc[:, -1].shift(len(self)) + result.insert( + len(result.columns), col, filler, allow_duplicates=True + ) + + result.columns = self.columns.copy() + return result + return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4424,7 +4649,7 @@ def set_index( append : bool, default False Whether to append columns to existing index. inplace : bool, default False - Modify the DataFrame in place (do not create a new object). + If True, modifies the DataFrame in place (do not create a new object). verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. 
Setting to False will improve the performance of this @@ -4432,8 +4657,8 @@ def set_index( Returns ------- - DataFrame - Changed row labels. + DataFrame or None + Changed row labels or None if ``inplace=True``. See Also -------- @@ -4494,6 +4719,7 @@ def set_index( 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -4531,7 +4757,7 @@ def set_index( frame = self.copy() arrays = [] - names = [] + names: List[Label] = [] if append: names = list(self.index.names) if isinstance(self.index, MultiIndex): @@ -4589,6 +4815,30 @@ def set_index( if not inplace: return frame + @overload + # https://github.com/python/mypy/issues/6580 + # Overloaded function signatures 1 and 2 overlap with incompatible return types + def reset_index( # type: ignore[misc] + self, + level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., + drop: bool = ..., + inplace: Literal[False] = ..., + col_level: Hashable = ..., + col_fill: Label = ..., + ) -> DataFrame: + ... + + @overload + def reset_index( + self, + level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., + drop: bool = ..., + inplace: Literal[True] = ..., + col_level: Hashable = ..., + col_fill: Label = ..., + ) -> None: + ... + def reset_index( self, level: Optional[Union[Hashable, Sequence[Hashable]]] = None, @@ -4596,7 +4846,7 @@ def reset_index( inplace: bool = False, col_level: Hashable = 0, col_fill: Label = "", - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Reset the index, or a level of it. @@ -4739,51 +4989,12 @@ class max type monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: new_obj = self else: new_obj = self.copy() - def _maybe_casted_values(index, labels=None): - values = index._values - if not isinstance(index, (PeriodIndex, DatetimeIndex)): - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) - - # if we have the labels, extract the values with a mask - if labels is not None: - mask = labels == -1 - - # we can have situations where the whole mask is -1, - # meaning there is nothing found in labels, so make all nan's - if mask.all(): - dtype = index.dtype - fill_value = na_value_for_dtype(dtype) - values = construct_1d_arraylike_from_scalar( - fill_value, len(mask), dtype - ) - else: - values = values.take(labels) - - # TODO(https://github.com/pandas-dev/pandas/issues/24206) - # Push this into maybe_upcast_putmask? - # We can't pass EAs there right now. Looks a bit - # complicated. - # So we unbox the ndarray_values, op, re-box. - values_type = type(values) - values_dtype = values.dtype - - if issubclass(values_type, DatetimeLikeArray): - values = values._data # TODO: can we de-kludge yet? 
- - if mask.any(): - values, _ = maybe_upcast_putmask(values, mask, np.nan) - - if issubclass(values_type, DatetimeLikeArray): - values = values_type(values, dtype=values_dtype) - - return values - new_index = ibase.default_index(len(new_obj)) if level is not None: if not isinstance(level, (tuple, list)): @@ -4826,7 +5037,7 @@ def _maybe_casted_values(index, labels=None): name_lst += [col_fill] * missing name = tuple(name_lst) # to ndarray and maybe infer different dtype - level_values = _maybe_casted_values(lev, lab) + level_values = maybe_casted_values(lev, lab) new_obj.insert(0, name, level_values) new_obj.index = new_index @@ -4839,20 +5050,20 @@ def _maybe_casted_values(index, labels=None): # Reindex-based selection methods @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isna(self) -> "DataFrame": - result = self._constructor(self._data.isna(func=isna)) + def isna(self) -> DataFrame: + result = self._constructor(self._mgr.isna(func=isna)) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isnull(self) -> "DataFrame": + def isnull(self) -> DataFrame: return self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notna(self) -> "DataFrame": + def notna(self) -> DataFrame: return ~self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notnull(self) -> "DataFrame": + def notnull(self) -> DataFrame: return ~self.isna() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -4893,8 +5104,8 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): Returns ------- - DataFrame - DataFrame with NA entries dropped from it. + DataFrame or None + DataFrame with NA entries dropped from it or None if ``inplace=True``. See Also -------- @@ -4947,9 +5158,10 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): Define in which columns to look for missing values. - >>> df.dropna(subset=['name', 'born']) + >>> df.dropna(subset=['name', 'toy']) name toy born 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT Keep the DataFrame with valid entries in the same variable. @@ -5002,7 +5214,7 @@ def drop_duplicates( keep: Union[str, bool] = "first", inplace: bool = False, ignore_index: bool = False, - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Return DataFrame with duplicate rows removed. @@ -5028,7 +5240,7 @@ def drop_duplicates( Returns ------- - DataFrame + DataFrame or None DataFrame with duplicates removed or None if ``inplace=True``. See Also @@ -5068,7 +5280,7 @@ def drop_duplicates( 0 Yum Yum cup 4.0 2 Indomie cup 3.5 - To remove duplicates and keep last occurences, use ``keep``. + To remove duplicates and keep last occurrences, use ``keep``. >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') brand style rating @@ -5080,6 +5292,7 @@ def drop_duplicates( return self.copy() inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") duplicated = self.duplicated(subset, keep=keep) result = self[-duplicated] @@ -5096,7 +5309,7 @@ def duplicated( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", - ) -> "Series": + ) -> Series: """ Return boolean Series denoting duplicate rows. 
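For context on the ``drop_duplicates``/``duplicated`` hunks above, here is a minimal illustrative sketch (not part of the patch; it assumes a pandas build that includes these changes, and the data mirrors the docstring examples):

import pandas as pd

df = pd.DataFrame(
    {"brand": ["Yum Yum", "Yum Yum", "Indomie"], "style": ["cup", "cup", "pack"]}
)

# duplicated() marks repeated rows; here only the second row is a repeat.
print(df.duplicated(subset=["brand", "style"]).tolist())  # [False, True, False]

# drop_duplicates() removes them; ignore_index=True relabels the result 0..n-1.
print(df.drop_duplicates(subset=["brand", "style"], ignore_index=True))

# With the added validate_bool_kwarg check, a non-bool flag now fails fast:
# df.drop_duplicates(ignore_index="yes")  # raises ValueError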
@@ -5185,15 +5398,14 @@ def duplicated( 4 True dtype: bool """ - from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 if self.empty: return self._constructor_sliced(dtype=bool) def f(vals): labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) + vals, size_hint=min(len(self), SIZE_HINT_LIMIT) ) return labels.astype("i8", copy=False), len(shape) @@ -5221,14 +5433,16 @@ def f(vals): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - return self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) + result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) + return result.__finalize__(self, method="duplicated") # ---------------------------------------------------------------------- # Sorting # TODO: Just move the sort_values doc here. @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - def sort_values( # type: ignore[override] # NOQA # issue 27237 + # error: Signature of "sort_values" incompatible with supertype "NDFrame" + def sort_values( # type: ignore[override] self, by, axis=0, @@ -5249,7 +5463,6 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: - from pandas.core.sorting import lexsort_indexer keys = [self._get_label_or_level_values(x, axis=axis) for x in by] @@ -5262,7 +5475,6 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 ) indexer = ensure_platform_int(indexer) else: - from pandas.core.sorting import nargsort by = by[0] k = self._get_label_or_level_values(by, axis=axis) @@ -5349,8 +5561,8 @@ def sort_index( Returns ------- - DataFrame - The original DataFrame sorted by the labels. + DataFrame or None + The original DataFrame sorted by the labels or None if ``inplace=True``. 
See Also -------- @@ -5392,62 +5604,17 @@ def sort_index( C 3 d 4 """ - # TODO: this can be combined with Series.sort_index impl as - # almost identical - - inplace = validate_bool_kwarg(inplace, "inplace") - - axis = self._get_axis_number(axis) - labels = self._get_axis(axis) - labels = ensure_key_mapped(labels, key, levels=level) - - # make sure that the axis is lexsorted to start - # if not we need to reconstruct to get the correct indexer - labels = labels._sort_levels_monotonic() - if level is not None: - new_axis, indexer = labels.sortlevel( - level, ascending=ascending, sort_remaining=sort_remaining - ) - - elif isinstance(labels, MultiIndex): - from pandas.core.sorting import lexsort_indexer - - indexer = lexsort_indexer( - labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position, - ) - else: - from pandas.core.sorting import nargsort - - # Check monotonic-ness before sort an index - # GH11080 - if (ascending and labels.is_monotonic_increasing) or ( - not ascending and labels.is_monotonic_decreasing - ): - if inplace: - return - else: - return self.copy() - - indexer = nargsort( - labels, kind=kind, ascending=ascending, na_position=na_position - ) - - baxis = self._get_block_manager_axis(axis) - new_data = self._mgr.take(indexer, axis=baxis, verify=False) - - # reconstruct axis if needed - new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() - - if ignore_index: - new_data.axes[1] = ibase.default_index(len(indexer)) - - result = self._constructor(new_data) - if inplace: - return self._update_inplace(result) - else: - return result.__finalize__(self, method="sort_index") + return super().sort_index( + axis, + level, + ascending, + inplace, + kind, + na_position, + sort_remaining, + ignore_index, + key, + ) def value_counts( self, @@ -5502,8 +5669,8 @@ def value_counts( >>> df.value_counts() num_legs num_wings 4 0 2 - 6 0 1 2 2 1 + 6 0 1 dtype: int64 >>> df.value_counts(sort=False) @@ -5523,8 +5690,8 @@ def value_counts( >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 - 6 0 0.25 2 2 0.25 + 6 0 0.25 dtype: float64 """ if subset is None: @@ -5545,7 +5712,7 @@ def value_counts( return counts - def nlargest(self, n, columns, keep="first") -> "DataFrame": + def nlargest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in descending order. @@ -5654,7 +5821,7 @@ def nlargest(self, n, columns, keep="first") -> "DataFrame": """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first") -> "DataFrame": + def nsmallest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in ascending order. @@ -5724,7 +5891,7 @@ def nsmallest(self, n, columns, keep="first") -> "DataFrame": population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI - Iceland 337000 17036 IS + Iceland 337000 17036 IS When using ``keep='last'``, ties are resolved in reverse order: @@ -5756,7 +5923,7 @@ def nsmallest(self, n, columns, keep="first") -> "DataFrame": self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": + def swaplevel(self, i=-2, j=-1, axis=0) -> DataFrame: """ Swap levels i and j in a MultiIndex on a particular axis. 
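The corrected ``value_counts`` doctests above pin down how ties are ordered after sorting by count; a small sketch reproducing them (illustrative only, assuming this revision):

import pandas as pd

df = pd.DataFrame(
    {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
    index=["falcon", "dog", "cat", "ant"],
)

# Counts sort descending; with the fix, (2, 2) is listed before (6, 0).
print(df.value_counts())
# num_legs  num_wings
# 4         0            2
# 2         2            1
# 6         0            1
# dtype: int64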
@@ -5787,7 +5954,7 @@ def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0) -> "DataFrame": + def reorder_levels(self, order, axis=0) -> DataFrame: """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -5818,9 +5985,92 @@ def reorder_levels(self, order, axis=0) -> "DataFrame": return result # ---------------------------------------------------------------------- - # Arithmetic / combination related + # Arithmetic Methods + + def _cmp_method(self, other, op): + axis = 1 # only relevant for Series other case + + self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) - def _combine_frame(self, other: "DataFrame", func, fill_value=None): + # See GH#4537 for discussion of scalar op behavior + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + def _arith_method(self, other, op): + if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None): + return ops.frame_arith_method_with_reindex(self, other, op) + + axis = 1 # only relevant for Series other case + + self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) + + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) + + _logical_method = _arith_method + + def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): + """ + Evaluate the frame operation func(left, right) by evaluating + column-by-column, dispatching to the Series implementation. + + Parameters + ---------- + right : scalar, Series, or DataFrame + func : arithmetic or comparison operator + axis : {None, 0, 1} + + Returns + ------- + DataFrame + """ + # Get the appropriate array-op to apply to each column/block's values. + array_op = ops.get_array_op(func) + + right = lib.item_from_zerodim(right) + if not is_list_like(right): + # i.e. 
scalar, faster than checking np.ndim(right) == 0 + bm = self._mgr.apply(array_op, right=right) + return type(self)(bm) + + elif isinstance(right, DataFrame): + assert self.index.equals(right.index) + assert self.columns.equals(right.columns) + # TODO: The previous assertion `assert right._indexed_same(self)` + # fails in cases with empty columns reached via + # _frame_arith_method_with_reindex + + bm = self._mgr.operate_blockwise(right._mgr, array_op) + return type(self)(bm) + + elif isinstance(right, Series) and axis == 1: + # axis=1 means we want to operate row-by-row + assert right.index.equals(self.columns) + + right = right._values + # maybe_align_as_frame ensures we do not have an ndarray here + assert not isinstance(right, np.ndarray) + + arrays = [ + array_op(_left, _right) + for _left, _right in zip(self._iter_column_arrays(), right) + ] + + elif isinstance(right, Series): + assert right.index.equals(self.index) # Handle other cases later + right = right._values + + arrays = [array_op(left, right) for left in self._iter_column_arrays()] + + else: + # Remaining cases have less-obvious dispatch rules + raise NotImplementedError(right) + + return type(self)._from_arrays( + arrays, self.columns, self.index, verify_integrity=False + ) + + def _combine_frame(self, other: DataFrame, func, fill_value=None): # at this point we have `self._indexed_same(other)` if fill_value is None: @@ -5837,10 +6087,10 @@ def _arith_op(left, right): left, right = ops.fill_binop(left, right, fill_value) return func(left, right) - new_data = ops.dispatch_to_series(self, other, _arith_op) + new_data = self._dispatch_frame_op(other, _arith_op) return new_data - def _construct_result(self, result) -> "DataFrame": + def _construct_result(self, result) -> DataFrame: """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -5859,7 +6109,23 @@ def _construct_result(self, result) -> "DataFrame": out.index = self.index return out - @Appender( + def __divmod__(self, other) -> Tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = self // other + mod = self - div * other + return div, mod + + def __rdivmod__(self, other) -> Tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = other // self + mod = other - div * self + return div, mod + + # ---------------------------------------------------------------------- + # Combination-Related + + @doc( + _shared_docs["compare"], """ Returns ------- @@ -5869,22 +6135,31 @@ def _construct_result(self, result) -> "DataFrame": The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. +Raises +------ +ValueError + When the two DataFrames don't have identical labels or shape. + See Also -------- Series.compare : Compare with another Series and show differences. +DataFrame.equals : Test whether two objects contain the same elements. Notes ----- Matching NaNs will not appear as a difference. +Can only compare identically-labeled +(i.e. same shape, identical row and column labels) DataFrames + Examples -------- >>> df = pd.DataFrame( -... { +... {{ ... "col1": ["a", "a", "b", "b", "a"], ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] -... }, +... }}, ... columns=["col1", "col2", "col3"], ... 
) >>> df @@ -5952,16 +6227,16 @@ def _construct_result(self, result) -> "DataFrame": 2 b b 3.0 3.0 3.0 4.0 3 b b NaN NaN 4.0 4.0 4 a a 5.0 5.0 5.0 5.0 -""" +""", + klass=_shared_doc_kwargs["klass"], ) - @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, - other: "DataFrame", + other: DataFrame, align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - ) -> "DataFrame": + ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, @@ -5970,8 +6245,8 @@ def compare( ) def combine( - self, other: "DataFrame", func, fill_value=None, overwrite=True - ) -> "DataFrame": + self, other: DataFrame, func, fill_value=None, overwrite=True + ) -> DataFrame: """ Perform column-wise combine with another DataFrame. @@ -6131,14 +6406,14 @@ def combine( otherSeries = otherSeries.astype(new_dtype) arr = func(series, otherSeries) - arr = maybe_downcast_to_dtype(arr, this_dtype) + arr = maybe_downcast_to_dtype(arr, new_dtype) result[col] = arr # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other: "DataFrame") -> "DataFrame": + def combine_first(self, other: DataFrame) -> DataFrame: """ Update null elements with value in the same location in `other`. @@ -6182,29 +6457,11 @@ def combine_first(self, other: "DataFrame") -> "DataFrame": """ import pandas.core.computation.expressions as expressions - def extract_values(arr): - # Does two things: - # 1. maybe gets the values from the Series / Index - # 2. convert datelike to i8 - # TODO: extract_array? - if isinstance(arr, (Index, Series)): - arr = arr._values - - if needs_i8_conversion(arr.dtype): - if is_extension_array_dtype(arr.dtype): - arr = arr.asi8 - else: - arr = arr.view("i8") - return arr - def combiner(x, y): - mask = isna(x) - # TODO: extract_array? - if isinstance(mask, (Index, Series)): - mask = mask._values + mask = extract_array(isna(x)) - x_values = extract_values(x) - y_values = extract_values(y) + x_values = extract_array(x, extract_numpy=True) + y_values = extract_array(y, extract_numpy=True) # If the column y in other DataFrame is not in first DataFrame, # just return y_values. @@ -6267,7 +6524,7 @@ def update( See Also -------- dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-columns(s) operations. + DataFrame.merge : For column(s)-on-column(s) operations. Examples -------- @@ -6295,7 +6552,7 @@ def update( 1 b e 2 c f - For Series, it's name attribute must be set. + For Series, its name attribute must be set. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) @@ -6463,7 +6720,7 @@ def groupby( squeeze: bool = no_default, observed: bool = False, dropna: bool = True, - ) -> "DataFrameGroupBy": + ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy if squeeze is not no_default: @@ -6526,9 +6783,6 @@ def groupby( specified, all remaining columns will be used and the result will have hierarchically indexed columns. - .. versionchanged:: 0.23.0 - Also accept list of column names. - Returns ------- DataFrame @@ -6546,6 +6800,8 @@ def groupby( duplicate values for one index/column pair. DataFrame.unstack : Pivot based on the index values instead of a column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. 
Notes ----- @@ -6642,7 +6898,7 @@ def groupby( @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None) -> "DataFrame": + def pivot(self, index=None, columns=None, values=None) -> DataFrame: from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -6700,6 +6956,10 @@ def pivot(self, index=None, columns=None, values=None) -> "DataFrame": -------- DataFrame.pivot : Pivot without aggregation that can handle non-numeric data. + DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. Examples -------- @@ -6790,7 +7050,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, - ) -> "DataFrame": + ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -6970,13 +7230,15 @@ def stack(self, level=-1, dropna=True): from pandas.core.reshape.reshape import stack, stack_multiple if isinstance(level, (tuple, list)): - return stack_multiple(self, level, dropna=dropna) + result = stack_multiple(self, level, dropna=dropna) else: - return stack(self, level, dropna=dropna) + result = stack(self, level, dropna=dropna) + + return result.__finalize__(self, method="stack") def explode( self, column: Union[str, Tuple], ignore_index: bool = False - ) -> "DataFrame": + ) -> DataFrame: """ Transform each element of a list-like to a row, replicating index values. @@ -7011,10 +7273,11 @@ def explode( Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of rows in the + output will be non-deterministic when exploding sets. Examples -------- @@ -7042,8 +7305,6 @@ def explode( raise ValueError("columns must be unique") df = self.reset_index(drop=True) - # TODO: use overload to refine return type of reset_index - assert df is not None # needed for mypy result = df[column].explode() result = df.drop([column], axis=1).join(result) if ignore_index: @@ -7113,16 +7374,11 @@ def unstack(self, level=-1, fill_value=None): """ from pandas.core.reshape.reshape import unstack - return unstack(self, level, fill_value) + result = unstack(self, level, fill_value) - @Appender( - _shared_docs["melt"] - % dict( - caller="df.melt(", - versionadded="\n .. 
versionadded:: 0.20.0\n", - other="melt", - ) - ) + return result.__finalize__(self, method="unstack") + + @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) def melt( self, id_vars=None, @@ -7131,7 +7387,7 @@ def melt( value_name="value", col_level=None, ignore_index=True, - ) -> "DataFrame": + ) -> DataFrame: return melt( self, @@ -7180,13 +7436,13 @@ def melt( Difference with previous column >>> df.diff(axis=1) - a b c - 0 NaN 0.0 0.0 - 1 NaN -1.0 3.0 - 2 NaN -1.0 7.0 - 3 NaN -1.0 13.0 - 4 NaN 0.0 20.0 - 5 NaN 2.0 28.0 + a b c + 0 NaN 0 0 + 1 NaN -1 3 + 2 NaN -1 7 + 3 NaN -1 13 + 4 NaN 0 20 + 5 NaN 2 28 Difference with 3rd previous row @@ -7219,23 +7475,26 @@ def melt( 1 255.0""" ), ) - def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": + def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: + if not isinstance(periods, int): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) bm_axis = self._get_block_manager_axis(axis) - self._consolidate_inplace() if bm_axis == 0 and periods != 0: - return self.T.diff(periods, axis=0).T + return self - self.shift(periods, axis=axis) new_data = self._mgr.diff(n=periods, axis=bm_axis) - return self._constructor(new_data) + return self._constructor(new_data).__finalize__(self, "diff") # ---------------------------------------------------------------------- # Function application def _gotitem( self, - key: Union[str, List[str]], + key: Union[Label, List[Label]], ndim: int, subset: Optional[FrameOrSeriesUnion] = None, ) -> FrameOrSeriesUnion: @@ -7303,9 +7562,18 @@ def _gotitem( >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) A B - max NaN 8.0 - min 1.0 2.0 sum 12.0 NaN + min 1.0 2.0 + max NaN 8.0 + + Aggregate different functions over the columns and rename the index of the resulting + DataFrame. + + >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) + A B C + x 7.0 NaN NaN + y NaN 2.0 NaN + z NaN NaN 6.0 Aggregate over the columns. @@ -7324,7 +7592,6 @@ def _gotitem( axis=_shared_doc_kwargs["axis"], see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, - versionadded="\n.. versionadded:: 0.20.0\n", ) def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) @@ -7333,7 +7600,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): result = None try: - result, how = self._aggregate(func, axis=axis, *args, **kwargs) + result, how = self._aggregate(func, axis, *args, **kwargs) except TypeError as err: exc = TypeError( "DataFrame constructor called with " @@ -7346,6 +7613,12 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged + + # For the return values of reconstruct_func, if relabeling is + # False, columns and order will be None. 
+ assert columns is not None + assert order is not None + + result_in_dict = relabel_result(result, func, columns, order) + result = DataFrame(result_in_dict, index=columns) @@ -7355,23 +7628,24 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose # only result - result, how = self.T._aggregate(arg, *args, **kwargs) + result, how = aggregate(self.T, arg, *args, **kwargs) result = result.T if result is not None else result return result, how - return super()._aggregate(arg, *args, **kwargs) + return aggregate(self, arg, *args, **kwargs) agg = aggregate @doc( - NDFrame.transform, + _shared_docs["transform"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": - axis = self._get_axis_number(axis) - if axis == 1: - return self.T.transform(func, *args, **kwargs).T - return super().transform(func, *args, **kwargs) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> DataFrame: + result = transform(self, func, axis, *args, **kwargs) + assert isinstance(result, DataFrame) + return result def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ @@ -7417,9 +7691,6 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): applied function: list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns. - - .. versionadded:: 0.23.0 - args : tuple Positional arguments to pass to `func` in addition to the array/series. @@ -7521,7 +7792,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): ) return op.get_result() - def applymap(self, func) -> "DataFrame": + def applymap(self, func, na_action: Optional[str] = None) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -7532,6 +7803,10 @@ ---------- func : callable Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NaN values, without passing them to func. + + .. versionadded:: 1.2 Returns ------- @@ -7555,6 +7830,15 @@ 0 3 4 1 5 5 + Like Series.map, NA values can be ignored: + + >>> df_copy = df.copy() + >>> df_copy.iloc[0, 0] = pd.NA + >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') + 0 1 + 0 <NA> 4 + 1 5 5 + Note that a vectorized version of `func` often exists, which will be much faster. You could square each number elementwise. @@ -7570,20 +7854,26 @@ 0 1.000000 4.494400 1 11.262736 20.857489 """ + if na_action not in {"ignore", None}: + raise ValueError( + f"na_action must be 'ignore' or None. 
Got {repr(na_action)}" + ) + ignore_na = na_action == "ignore" + # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): if x.empty: - return lib.map_infer(x, func) - return lib.map_infer(x.astype(object)._values, func) + return lib.map_infer(x, func, ignore_na=ignore_na) + return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) - return self.apply(infer) + return self.apply(infer).__finalize__(self, "applymap") # ---------------------------------------------------------------------- # Merging / joining methods def append( self, other, ignore_index=False, verify_integrity=False, sort=False - ) -> "DataFrame": + ) -> DataFrame: """ Append rows of `other` to the end of caller, returning a new object. @@ -7600,7 +7890,6 @@ def append( sort : bool, default False Sort columns if the columns of `self` and `other` are not aligned. - .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed to not sort by default. @@ -7714,16 +8003,18 @@ def append( to_concat = [self, *other] else: to_concat = [self, other] - return concat( - to_concat, - ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, - ) + return ( + concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + ).__finalize__(self, method="append") def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False - ) -> "DataFrame": + ) -> DataFrame: """ Join columns of another DataFrame. @@ -7769,7 +8060,7 @@ def join( See Also -------- - DataFrame.merge : For column(s)-on-columns(s) operations. + DataFrame.merge : For column(s)-on-column(s) operations. Notes ----- @@ -7848,8 +8139,8 @@ def join( def _join_compat( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False ): - from pandas.core.reshape.merge import merge from pandas.core.reshape.concat import concat + from pandas.core.reshape.merge import merge if isinstance(other, Series): if other.name is None: @@ -7857,6 +8148,15 @@ def _join_compat( other = DataFrame({other.name: other}) if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) return merge( self, other, @@ -7914,7 +8214,7 @@ def merge( copy=True, indicator=False, validate=None, - ) -> "DataFrame": + ) -> DataFrame: from pandas.core.reshape.merge import merge return merge( @@ -7933,7 +8233,7 @@ def merge( validate=validate, ) - def round(self, decimals=0, *args, **kwargs) -> "DataFrame": + def round(self, decimals=0, *args, **kwargs) -> DataFrame: """ Round a DataFrame to a variable number of decimal places. @@ -8047,7 +8347,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1) -> "DataFrame": + def corr(self, method="pearson", min_periods=1) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -8138,7 +8438,7 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": def cov( self, min_periods: Optional[int] = None, ddof: Optional[int] = 1 - ) -> "DataFrame": + ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. @@ -8378,6 +8678,7 @@ def count(self, axis=0, level=None, numeric_only=False): See Also -------- Series.count: Number of non-NA elements in a Series. + DataFrame.value_counts: Count unique combinations of columns. 
DataFrame.shape: Number of DataFrame rows and columns (including NA elements). DataFrame.isna: Boolean same-sized DataFrame showing places of NA @@ -8499,16 +8800,24 @@ def _count_level(self, level, axis=0, numeric_only=False): return result def _reduce( - self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + self, + op, + name: str, + *, + axis=0, + skipna=True, + numeric_only=None, + filter_type=None, + **kwds, ): assert filter_type is None or filter_type == "bool", filter_type + out_dtype = "bool" if filter_type == "bool" else None + + own_dtypes = [arr.dtype for arr in self._iter_column_arrays()] dtype_is_dt = np.array( - [ - is_datetime64_any_dtype(values.dtype) - for values in self._iter_column_arrays() - ], + [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], dtype=bool, ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): @@ -8517,141 +8826,90 @@ def _reduce( "will include datetime64 and datetime64tz columns in a " "future version.", FutureWarning, - stacklevel=3, + stacklevel=5, ) cols = self.columns[~dtype_is_dt] self = self[cols] - if axis is None and filter_type == "bool": - labels = None - constructor = None - else: - # TODO: Make other agg func handle axis=None properly - axis = self._get_axis_number(axis) - labels = self._get_agg_axis(axis) - constructor = self._constructor + # TODO: Make other agg func handle axis=None properly GH#21597 + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + assert axis in [0, 1] - def f(x): - return op(x, axis=axis, skipna=skipna, **kwds) + def func(values: np.ndarray): + # We only use this in the case that operates on self.values + return op(values, axis=axis, skipna=skipna, **kwds) - def _get_data(axis_matters): + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) + + def _get_data() -> DataFrame: if filter_type is None: data = self._get_numeric_data() - elif filter_type == "bool": - if axis_matters: - # GH#25101, GH#24434 - data = self._get_bool_data() if axis == 0 else self - else: - data = self._get_bool_data() - else: # pragma: no cover - msg = ( - f"Generating numeric_only data with filter_type {filter_type} " - "not supported." - ) - raise NotImplementedError(msg) + else: + # GH#25101, GH#24434 + assert filter_type == "bool" + data = self._get_bool_data() return data - if numeric_only is not None and axis in [0, 1]: + if numeric_only is not None or axis == 0: + # For numeric_only non-None and axis non-None, we know + # which blocks to use and no try/except is needed. 
+ # For numeric_only=None only the case with axis==0 and no object + # dtypes are unambiguous can be handled with BlockManager.reduce + # Case with EAs see GH#35881 df = self if numeric_only is True: - df = _get_data(axis_matters=True) + df = _get_data() if axis == 1: df = df.T axis = 0 - out_dtype = "bool" if filter_type == "bool" else None - - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=1, skipna=skipna, **kwds) + ignore_failures = numeric_only is None # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager._reduce - res = df._mgr.reduce(blk_func) - assert isinstance(res, dict) - if len(res): - assert len(res) == max(list(res.keys())) + 1, res.keys() - out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) - out.index = df.columns - if axis == 0 and is_object_dtype(out.dtype): - out[:] = coerce_to_dtypes(out.values, df.dtypes) + # simple case where we can use BlockManager.reduce + res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) + out = df._constructor(res).iloc[0] + if out_dtype is not None: + out = out.astype(out_dtype) + if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: + # Even if we are object dtype, follow numpy and return + # float64, see test_apply_funcs_over_empty + out = out.astype(np.float64) return out - if not self._is_homogeneous_type: - # try to avoid self.values call + assert numeric_only is None - if filter_type is None and axis == 0 and len(self) > 0: - # operate column-wise + data = self + values = data.values - # numeric_only must be None here, as other cases caught above - # require len(self) > 0 bc frame_apply messes up empty prod/sum + try: + result = func(values) - # this can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series + except TypeError: + # e.g. in nanops trying to convert strs to float - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply + data = _get_data() + labels = data._get_agg_axis(axis) - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0].rename(None) - return result - - if numeric_only is None: - data = self values = data.values + with np.errstate(all="ignore"): + result = func(values) + if filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + elif filter_type is None and is_object_dtype(result.dtype): try: - result = f(values) - - except TypeError: - # e.g. in nanops trying to convert strs to float - - # TODO: why doesnt axis matter here? - data = _get_data(axis_matters=False) - labels = data._get_agg_axis(axis) - - values = data.values - with np.errstate(all="ignore"): - result = f(values) - - else: - if numeric_only: - data = _get_data(axis_matters=True) - labels = data._get_agg_axis(axis) - - values = data.values - else: - data = self - values = data.values - result = f(values) - - if filter_type == "bool" and is_object_dtype(values) and axis is None: - # work around https://github.com/numpy/numpy/issues/10489 - # TODO: can we de-duplicate parts of this with the next blocK? 
- result = np.bool_(result) - elif hasattr(result, "dtype") and is_object_dtype(result.dtype): - try: - if filter_type is None: - result = result.astype(np.float64) - elif filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) + result = result.astype(np.float64) except (ValueError, TypeError): # try to coerce to the original dtypes item by item if we can - if axis == 0: - result = coerce_to_dtypes(result, data.dtypes) + pass - if constructor is not None: - result = self._constructor_sliced(result, index=labels) + result = self._constructor_sliced(result, index=labels) return result def nunique(self, axis=0, dropna=True) -> Series: @@ -8756,7 +9014,11 @@ def idxmin(self, axis=0, skipna=True) -> Series: dtype: object """ axis = self._get_axis_number(axis) - indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) + + res = self._reduce( + nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False + ) + indices = res._values # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame @@ -8829,7 +9091,11 @@ def idxmax(self, axis=0, skipna=True) -> Series: dtype: object """ axis = self._get_axis_number(axis) - indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) + + res = self._reduce( + nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False + ) + indices = res._values # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame @@ -8851,7 +9117,7 @@ def _get_agg_axis(self, axis_num: int) -> Index: else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": + def mode(self, axis=0, numeric_only=False, dropna=True) -> DataFrame: """ Get the mode(s) of each element along the selected axis. @@ -8899,8 +9165,8 @@ def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": ostrich bird 2 NaN By default, missing values are not considered, and the mode of wings - are both 0 and 2. The second row of species and legs contains ``NaN``, - because they have only one mode, but the DataFrame has two rows. + are both 0 and 2. Because the resulting DataFrame has two rows, + the second row of ``species`` and ``legs`` contains ``NaN``. >>> df.mode() species legs wings @@ -9036,7 +9302,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): def to_timestamp( self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True - ) -> "DataFrame": + ) -> DataFrame: """ Cast to DatetimeIndex of timestamps, at *beginning* of period. @@ -9060,12 +9326,15 @@ def to_timestamp( axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) + if not isinstance(old_ax, PeriodIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + new_ax = old_ax.to_timestamp(freq=freq, how=how) setattr(new_obj, axis_name, new_ax) return new_obj - def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame": + def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. 
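The new index-type guards in ``to_timestamp``/``to_period`` make a frame with the wrong index kind fail fast instead of erroring deeper in the call; a hedged sketch (the exact message comes from the added ``raise`` lines):

import pandas as pd

df = pd.DataFrame(
    {"y": [1, 2]}, index=pd.period_range("2000", periods=2, freq="A")
)

ts = df.to_timestamp()   # PeriodIndex -> DatetimeIndex
back = ts.to_period()    # DatetimeIndex -> PeriodIndex again

# A frame with a plain RangeIndex is now rejected up front:
# pd.DataFrame({"y": [1]}).to_timestamp()  # TypeError: unsupported Type RangeIndex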
@@ -9089,12 +9358,15 @@ def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame" axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) + if not isinstance(old_ax, DatetimeIndex): + raise TypeError(f"unsupported Type {type(old_ax).__name__}") + new_ax = old_ax.to_period(freq=freq) setattr(new_obj, axis_name, new_ax) return new_obj - def isin(self, values) -> "DataFrame": + def isin(self, values) -> DataFrame: """ Whether each element in the DataFrame is contained in values. @@ -9201,10 +9473,10 @@ def isin(self, values) -> "DataFrame": _info_axis_number = 1 _info_axis_name = "columns" - index: "Index" = properties.AxisProperty( + index: Index = properties.AxisProperty( axis=1, doc="The index (row labels) of the DataFrame." ) - columns: "Index" = properties.AxisProperty( + columns: Index = properties.AxisProperty( axis=0, doc="The column labels of the DataFrame." ) @@ -9229,14 +9501,12 @@ def _AXIS_NAMES(self) -> Dict[int, str]: DataFrame._add_numeric_operations() -DataFrame._add_series_or_dataframe_operations() ops.add_flex_arithmetic_methods(DataFrame) -ops.add_special_arithmetic_methods(DataFrame) -def _from_nested_dict(data): - new_data = collections.defaultdict(dict) +def _from_nested_dict(data) -> collections.defaultdict: + new_data: collections.defaultdict = collections.defaultdict(dict) for index, s in data.items(): for col, v in s.items(): new_data[col][index] = v diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 571fcc67f3bb5..41cb76d88957e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import collections from datetime import timedelta import functools @@ -6,7 +8,6 @@ import operator import pickle import re -from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -22,6 +23,7 @@ Tuple, Type, Union, + cast, ) import warnings import weakref @@ -31,29 +33,28 @@ from pandas._config import config from pandas._libs import lib -from pandas._libs.tslibs import Tick, Timestamp, to_offset +from pandas._libs.tslibs import Period, Tick, Timestamp, to_offset from pandas._typing import ( Axis, + CompressionOptions, FilePathOrBuffer, FrameOrSeries, + IndexKeyFunc, + IndexLabel, JSONSerializable, Label, Level, Renamer, + StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, + final, ) -from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, InvalidIndexError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, - rewrite_axis_style_signature, -) +from pandas.util._decorators import doc, rewrite_axis_style_signature from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -69,6 +70,7 @@ is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, + is_dtype_equal, is_extension_array_dtype, is_float, is_list_like, @@ -85,69 +87,56 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import missing, nanops +from pandas.core import arraylike, indexing, missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index -from pandas.core.indexes.datetimes import 
DatetimeIndex -from pandas.core.indexes.period import Period, PeriodIndex -import pandas.core.indexing as indexing +from pandas.core.flags import Flags +from pandas.core.indexes import base as ibase +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + RangeIndex, + ensure_index, +) from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index -from pandas.core.ops import _align_method_FRAME +from pandas.core.ops import align_method_FRAME from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import get_indexer_indexer +from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.format import DataFrameFormatter, format_percentiles +from pandas.io.formats.format import ( + DataFrameFormatter, + DataFrameRenderer, + format_percentiles, +) from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from pandas._libs.tslibs import BaseOffset + + from pandas.core.frame import DataFrame from pandas.core.resample import Resampler - from pandas.core.series import Series # noqa: F401 + from pandas.core.series import Series + from pandas.core.window.indexers import BaseIndexer # goal is to be able to define the docs close to function, while still being # able to share -_shared_doc_kwargs = dict( - axes="keywords for axes", - klass="Series/DataFrame", - axes_single_arg="int or labels for object", - args_transpose="axes to permute (int or label for object)", - optional_by=""" +_shared_docs = {**_shared_docs} +_shared_doc_kwargs = { + "axes": "keywords for axes", + "klass": "Series/DataFrame", + "axes_single_arg": "int or labels for object", + "args_transpose": "axes to permute (int or label for object)", + "optional_by": """ by : str or list of str Name or list of names to sort by""", -) - - -def _single_replace(self, to_replace, method, inplace, limit): - """ - Replaces values in a Series using the fill method specified when no - replacement value is given in the replace method - """ - if self.ndim != 1: - raise TypeError( - f"cannot replace {to_replace} with method {method} on a " - f"{type(self).__name__}" - ) - - orig_dtype = self.dtype - result = self if inplace else self.copy() - fill_f = missing.get_fill_func(method) - - mask = missing.mask_missing(result.values, to_replace) - values = fill_f(result.values, limit=limit, mask=mask) - - if values.dtype == orig_dtype and inplace: - return - - result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) - - if inplace: - self._update_inplace(result) - return - - return result +} bool_t = bool # Need alias because NDFrame has def bool: @@ -179,10 +168,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): "_metadata", "__array_struct__", "__array_interface__", + "_flags", ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"]) + _hidden_attrs: FrozenSet[str] = frozenset(["get_values", "tshift"]) _metadata: List[str] = [] _is_copy = None _mgr: BlockManager @@ -208,6 +198,7 @@ def __init__( else: attrs = dict(attrs) object.__setattr__(self, "_attrs", attrs) + object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @classmethod def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @@ -228,15 +219,20 @@ def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> 
BlockManager: return mgr # ---------------------------------------------------------------------- + # attrs and flags @property def attrs(self) -> Dict[Optional[Hashable], Any]: """ - Dictionary of global attributes on this object. + Dictionary of global attributes of this dataset. .. warning:: attrs is experimental and may change without warning. + + See Also + -------- + DataFrame.flags : Global flags applying to this object. """ if self._attrs is None: self._attrs = {} @@ -246,6 +242,99 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) + @final + @property + def flags(self) -> Flags: + """ + Get the properties associated with this pandas object. + + The available flags are + + * :attr:`Flags.allows_duplicate_labels` + + See Also + -------- + Flags : Flags that apply to pandas objects. + DataFrame.attrs : Global metadata applying to this dataset. + + Notes + ----- + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags + <Flags(allows_duplicate_labels=True)> + + Flags can be get or set using ``.`` + + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + + Or by slicing with a key + + >>> df.flags["allows_duplicate_labels"] + False + >>> df.flags["allows_duplicate_labels"] = True + """ + return self._flags + + @final + def set_flags( + self: FrameOrSeries, + *, + copy: bool = False, + allows_duplicate_labels: Optional[bool] = None, + ) -> FrameOrSeries: + """ + Return a new object with updated flags. + + Parameters + ---------- + allows_duplicate_labels : bool, optional + Whether the returned object allows duplicate labels. + + Returns + ------- + Series or DataFrame + The same type as the caller. + + See Also + -------- + DataFrame.attrs : Global metadata applying to this dataset. + DataFrame.flags : Global flags applying to this object. + + Notes + ----- + This method returns a new object that's a view on the same data + as the input. Mutating the input or the output values will be reflected + in the other. + + This method is intended to be used in method chains. + + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags.allows_duplicate_labels + True + >>> df2 = df.set_flags(allows_duplicate_labels=False) + >>> df2.flags.allows_duplicate_labels + False + """ + df = self.copy(deep=copy) + if allows_duplicate_labels is not None: + df.flags["allows_duplicate_labels"] = allows_duplicate_labels + return df + + @final @classmethod def _validate_dtype(cls, dtype): """ validate the passed dtype """ @@ -291,6 +380,7 @@ def _constructor_expanddim(self): # ---------------------------------------------------------------------- # Internals + @final @property def _data(self): # GH#33054 retained because some downstream packages uses this, @@ -312,25 +402,23 @@ def _data(self): @property def _AXIS_NUMBERS(self) -> Dict[str, int]: """.. 
deprecated:: 1.1.0""" - warnings.warn( - "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3, - ) + warnings.warn("_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3) return {"index": 0} @property def _AXIS_NAMES(self) -> Dict[int, str]: """.. deprecated:: 1.1.0""" - warnings.warn( - "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3, - ) + warnings.warn("_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3) return {0: "index"} + @final def _construct_axes_dict(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} d.update(kwargs) return d + @final @classmethod def _construct_axes_from_arguments( cls, args, kwargs, require_all: bool = False, sentinel=None @@ -362,6 +450,7 @@ def _construct_axes_from_arguments( axes = {a: kwargs.pop(a, sentinel) for a in cls._AXIS_ORDERS} return axes, kwargs + @final @classmethod def _get_axis_number(cls, axis: Axis) -> int: try: @@ -369,16 +458,19 @@ def _get_axis_number(cls, axis: Axis) -> int: except KeyError: raise ValueError(f"No axis named {axis} for object type {cls.__name__}") + @final @classmethod def _get_axis_name(cls, axis: Axis) -> str: axis_number = cls._get_axis_number(axis) return cls._AXIS_ORDERS[axis_number] + @final def _get_axis(self, axis: Axis) -> Index: axis_number = self._get_axis_number(axis) assert axis_number in {0, 1} return self.index if axis_number == 0 else self.columns + @final @classmethod def _get_block_manager_axis(cls, axis: Axis) -> int: """Map the axis to the block_manager axis.""" @@ -388,10 +480,11 @@ def _get_block_manager_axis(cls, axis: Axis) -> int: return m - axis return axis - def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: + @final + def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]]: # index or columns axis_index = getattr(self, axis) - d = dict() + d = {} prefix = axis[0] for i, name in enumerate(axis_index.names): @@ -418,16 +511,18 @@ def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: d[axis] = dindex return d - def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + @final + def _get_index_resolvers(self) -> Dict[Label, Union[Series, MultiIndex]]: from pandas.core.computation.parsing import clean_column_name - d: Dict[str, ABCSeries] = {} + d: Dict[str, Union[Series, MultiIndex]] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} - def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + @final + def _get_cleaned_column_resolvers(self) -> Dict[Label, Series]: """ Return the special character free column resolvers of a dataframe. @@ -515,11 +610,13 @@ def size(self) -> int: """ return np.prod(self.shape) + @final @property def _selected_obj(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ return self + @final @property def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ @@ -546,12 +643,18 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): Returns ------- renamed : %(klass)s or None - An object of type %(klass)s if inplace=False, None otherwise. + An object of type %(klass)s or None if ``inplace=True``. See Also -------- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. 
""" + self._check_inplace_and_allows_duplicate_labels(inplace) + return self._set_axis_nocheck(labels, axis, inplace) + + @final + def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool): + # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy. if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -564,6 +667,7 @@ def _set_axis(self, axis: int, labels: Index) -> None: self._mgr.set_axis(axis, labels) self._clear_item_cache() + @final def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: """ Interchange axes and swap values axes appropriately. @@ -589,10 +693,11 @@ def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: # ignore needed because of NDFrame constructor is different than # DataFrame/Series constructors. - return self._constructor(new_values, *new_axes).__finalize__( # type: ignore - self, method="swapaxes" - ) + return self._constructor( + new_values, *new_axes # type: ignore[arg-type] + ).__finalize__(self, method="swapaxes") + @final def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ Return DataFrame with requested index / column level(s) removed. @@ -657,7 +762,7 @@ def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self, item: Label) -> Union["Series", Any]: + def pop(self, item: Label) -> Union[Series, Any]: result = self[item] del self[item] if self.ndim == 2: @@ -665,6 +770,7 @@ def pop(self, item: Label) -> Union["Series", Any]: return result + @final def squeeze(self, axis=None): """ Squeeze 1 dimensional axis objects into scalars. @@ -921,6 +1027,7 @@ def rename( else: index = mapper + self._check_inplace_and_allows_duplicate_labels(inplace) result = self if inplace else self.copy(deep=copy) for axis_no, replacements in enumerate((index, columns)): @@ -945,7 +1052,7 @@ def rename( raise KeyError(f"{missing_labels} not found in axis") new_index = ax._transform_index(f, level) - result.set_axis(new_index, axis=axis_no, inplace=True) + result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: @@ -987,7 +1094,7 @@ def rename_axis(self, mapper=lib.no_default, **kwargs): Returns ------- Series, DataFrame, or None - The same type as the caller or None if `inplace` is True. + The same type as the caller or None if ``inplace=True``. See Also -------- @@ -1007,7 +1114,7 @@ def rename_axis(self, mapper=lib.no_default, **kwargs): In this case, the parameter ``copy`` is ignored. The second calling convention will modify the names of the - the corresponding index if mapper is a list or a scalar. + corresponding index if mapper is a list or a scalar. However, if mapper is dict-like or a function, it will use the deprecated behavior of modifying the axis *labels*. @@ -1128,6 +1235,7 @@ class name if not inplace: return result + @final def _set_axis_name(self, name, axis=0, inplace=False): """ Set the name(s) of the axis. @@ -1190,20 +1298,24 @@ def _set_axis_name(self, name, axis=0, inplace=False): # ---------------------------------------------------------------------- # Comparison Methods + @final def _indexed_same(self, other) -> bool: return all( self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS ) - def equals(self, other): + @final + def equals(self, other: object) -> bool: """ Test whether two objects contain the same elements. 
This function allows two Series or DataFrames to be compared against each other to see if they have the same shape and elements. NaNs in - the same location are considered equal. The column headers do not - need to have the same type, but the elements within the columns must - be the same dtype. + the same location are considered equal. + + The row/column index does not need to have the same type, as long + as the values are considered equal. Corresponding columns must be of + the same dtype. Parameters ---------- @@ -1232,13 +1344,6 @@ def equals(self, other): numpy.array_equal : Return True if two arrays have the same shape and elements, False otherwise. - Notes - ----- - This function requires that the elements have the same dtype as their - respective elements in the other Series or DataFrame. However, the - column labels do not need to have the same type, as long as they are - still considered equal. - Examples -------- >>> df = pd.DataFrame({1: [10], 2: [20]}) @@ -1278,13 +1383,15 @@ def equals(self, other): >>> df.equals(different_data_type) False """ - if not isinstance(other, self._constructor): + if not (isinstance(other, type(self)) or isinstance(self, type(other))): return False + other = cast(NDFrame, other) return self._mgr.equals(other._mgr) # ------------------------------------------------------------------------- # Unary Methods + @final def __neg__(self): values = self._values if is_bool_dtype(values): @@ -1299,6 +1406,7 @@ def __neg__(self): raise TypeError(f"Unary negative expects numeric dtype, not {values.dtype}") return self.__array_wrap__(arr) + @final def __pos__(self): values = self._values if is_bool_dtype(values): @@ -1310,9 +1418,13 @@ def __pos__(self): ): arr = operator.pos(values) else: - raise TypeError(f"Unary plus expects numeric dtype, not {values.dtype}") + raise TypeError( + "Unary plus expects bool, numeric, timedelta, " + f"or object dtype, not {values.dtype}" + ) return self.__array_wrap__(arr) + @final def __invert__(self): if not self.size: # inv fails with 0 len @@ -1322,6 +1434,7 @@ def __invert__(self): result = self._constructor(new_data).__finalize__(self, method="__invert__") return result + @final def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " @@ -1330,6 +1443,7 @@ def __nonzero__(self): __bool__ = __nonzero__ + @final def bool(self): """ Return the bool of a single element Series or DataFrame. @@ -1374,9 +1488,11 @@ def bool(self): self.__nonzero__() + @final def __abs__(self: FrameOrSeries) -> FrameOrSeries: return self.abs() + @final def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: return self.round(decimals) @@ -1388,6 +1504,7 @@ def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: # operations should utilize/extend these methods when possible so that we # have consistent precedence and validation logic throughout the library. + @final def _is_level_reference(self, key, axis=0): """ Test whether a key is a level reference for a given axis. @@ -1418,6 +1535,7 @@ def _is_level_reference(self, key, axis=0): and not self._is_label_reference(key, axis=axis) ) + @final def _is_label_reference(self, key, axis=0) -> bool_t: """ Test whether a key is a label reference for a given axis.
@@ -1447,6 +1565,7 @@ def _is_label_reference(self, key, axis=0) -> bool_t: and any(key in self.axes[ax] for ax in other_axes) ) + @final def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t: """ Test whether a key is a label or level reference for a given axis. @@ -1471,6 +1590,7 @@ def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t: key, axis=axis ) + @final def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: """ Check whether `key` is ambiguous. @@ -1514,6 +1634,7 @@ def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: ) raise ValueError(msg) + @final def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: """ Return a 1-D array of values associated with `key`, a label or level @@ -1573,14 +1694,12 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: label_axis_name = "column" if axis == 0 else "index" raise ValueError( - ( - f"The {label_axis_name} label '{key}' " - f"is not unique.{multi_message}" - ) + f"The {label_axis_name} label '{key}' is not unique.{multi_message}" ) return values + @final def _drop_labels_or_levels(self, keys, axis: int = 0): """ Drop labels and/or levels for the given `axis`. @@ -1617,10 +1736,8 @@ def _drop_labels_or_levels(self, keys, axis: int = 0): if invalid_keys: raise ValueError( - ( - "The following keys are not valid labels or " - f"levels for axis {axis}: {invalid_keys}" - ) + "The following keys are not valid labels or " + f"levels for axis {axis}: {invalid_keys}" ) # Compute levels and labels to drop @@ -1661,7 +1778,7 @@ def _drop_labels_or_levels(self, keys, axis: int = 0): # ---------------------------------------------------------------------- # Iteration - def __hash__(self): + def __hash__(self) -> int: raise TypeError( f"{repr(type(self).__name__)} objects are mutable, " f"thus they cannot be hashed" ) @@ -1713,6 +1830,7 @@ def __len__(self) -> int: """Returns length of info axis""" return len(self._info_axis) + @final def __contains__(self, key) -> bool_t: """True if the key is in the info axis""" return key in self._info_axis @@ -1777,7 +1895,28 @@ def empty(self) -> bool_t: def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._values, dtype=dtype) - def __array_wrap__(self, result, context=None): + def __array_wrap__( + self, + result: np.ndarray, + context: Optional[Tuple[Callable, Tuple[Any, ...], int]] = None, + ): + """ + Gets called after a ufunc and other functions. + + Parameters + ---------- + result : np.ndarray + The result of the ufunc or other function called on the NumPy array + returned by ``__array__``. + context : tuple of (func, tuple, int) + This parameter is returned by ufuncs as a 3-element tuple: (name of the + ufunc, arguments of the ufunc, domain of the ufunc), but is not set by + other numpy functions. + + Notes + ----- + Series implements ``__array_ufunc__``, so this is not called for ufuncs + on Series. + """ result = lib.item_from_zerodim(result) if is_scalar(result): # e.g.
we get here with np.ptp(series) @@ -1788,6 +1927,11 @@ def __array_wrap__(self, result, context=None): self, method="__array_wrap__" ) + def __array_ufunc__( + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any + ): + return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs) + # ideally we would define this to avoid the getattr checks, but # is slower # @property @@ -1799,18 +1943,20 @@ def __array_wrap__(self, result, context=None): # ---------------------------------------------------------------------- # Picklability + @final def __getstate__(self) -> Dict[str, Any]: meta = {k: getattr(self, k, None) for k in self._metadata} - return dict( - _mgr=self._mgr, - _typ=self._typ, - _metadata=self._metadata, - attrs=self.attrs, + return { + "_mgr": self._mgr, + "_typ": self._typ, + "_metadata": self._metadata, + "attrs": self.attrs, + "_flags": {k: self.flags[k] for k in self.flags._keys}, **meta, - ) + } + @final def __setstate__(self, state): - if isinstance(state, BlockManager): self._mgr = state elif isinstance(state, dict): @@ -1821,6 +1967,8 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) + flags = state.get("_flags", {"allows_duplicate_labels": True}) + object.__setattr__(self, "_flags", Flags(self, **flags)) # set in the order of internal names # to avoid definitional recursion @@ -1828,7 +1976,7 @@ def __setstate__(self, state): # defined meta = set(self._internal_names + self._metadata) for k in list(meta): - if k in state: + if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) @@ -1852,6 +2000,7 @@ def __repr__(self) -> str: prepr = f"[{','.join(map(pprint_thing, self))}]" return f"{type(self).__name__}({prepr})" + @final def _repr_latex_(self): """ Returns a LaTeX representation for a particular object. @@ -1862,6 +2011,7 @@ def _repr_latex_(self): else: return None + @final def _repr_data_resource_(self): """ Not a real Jupyter special repr method, but we use the same @@ -1869,21 +2019,23 @@ def _repr_data_resource_(self): """ if config.get_option("display.html.table_schema"): data = self.head(config.get_option("display.max_rows")) - payload = json.loads( - data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict - ) + + as_json = data.to_json(orient="table") + as_json = cast(str, as_json) + payload = json.loads(as_json, object_pairs_hook=collections.OrderedDict) return payload # ---------------------------------------------------------------------- # I/O Methods - @doc(klass="object") + @final + @doc(klass="object", storage_options=_shared_docs["storage_options"]) def to_excel( self, excel_writer, - sheet_name="Sheet1", - na_rep="", - float_format=None, + sheet_name: str = "Sheet1", + na_rep: str = "", + float_format: Optional[str] = None, columns=None, header=True, index=True, @@ -1896,6 +2048,7 @@ def to_excel( inf_rep="inf", verbose=True, freeze_panes=None, + storage_options: StorageOptions = None, ) -> None: """ Write {klass} to an Excel sheet. @@ -1912,7 +2065,7 @@ def to_excel( Parameters ---------- - excel_writer : str or ExcelWriter object + excel_writer : path-like, file-like, or ExcelWriter object File path or existing ExcelWriter. sheet_name : str, default 'Sheet1' Name of sheet which will contain DataFrame. @@ -1940,6 +2093,13 @@ def to_excel( Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``. + + .. 
deprecated:: 1.2.0 + + As the `xlwt `__ package is no longer + maintained, the ``xlwt`` engine will be removed in a future version + of pandas. + merge_cells : bool, default True Write MultiIndex and Hierarchical Rows as merged cells. encoding : str, optional @@ -1953,6 +2113,9 @@ def to_excel( freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. + {storage_options} + + .. versionadded:: 1.2.0 See Also -------- @@ -2027,8 +2190,11 @@ def to_excel( startcol=startcol, freeze_panes=freeze_panes, engine=engine, + storage_options=storage_options, ) + @final + @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -2039,9 +2205,10 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool_t = False, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool_t = True, indent: Optional[int] = None, + storage_options: StorageOptions = None, ) -> Optional[str]: """ Convert the object to a JSON string. @@ -2060,29 +2227,27 @@ def to_json( * Series: - default is 'index' - - allowed values are: {'split','records','index','table'}. + - allowed values are: {{'split', 'records', 'index', 'table'}}. * DataFrame: - default is 'columns' - - allowed values are: {'split', 'records', 'index', 'columns', - 'values', 'table'}. + - allowed values are: {{'split', 'records', 'index', 'columns', + 'values', 'table'}}. * The format of the JSON string: - - 'split' : dict like {'index' -> [index], 'columns' -> [columns], - 'data' -> [values]} - - 'records' : list like [{column -> value}, ... , {column -> value}] - - 'index' : dict like {index -> {column -> value}} - - 'columns' : dict like {column -> {index -> value}} + - 'split' : dict like {{'index' -> [index], 'columns' -> [columns], + 'data' -> [values]}} + - 'records' : list like [{{column -> value}}, ... , {{column -> value}}] + - 'index' : dict like {{index -> {{column -> value}}}} + - 'columns' : dict like {{column -> {{index -> value}}}} - 'values' : just the values array - - 'table' : dict like {'schema': {schema}, 'data': {data}} + - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} Describing the data, where data component is like ``orient='records'``. - .. versionchanged:: 0.20.0 - - date_format : {None, 'epoch', 'iso'} + date_format : {{None, 'epoch', 'iso'}} Type of date conversion. 'epoch' = epoch milliseconds, 'iso' = ISO8601. The default depends on the `orient`. For ``orient='table'``, the default is 'iso'. For all other orients, @@ -2105,7 +2270,7 @@ def to_json( throw ValueError if incorrect 'orient' since others are not list like. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}} A string representing the compression to use in the output file, only used when the first argument is a filename. By default, the @@ -2117,14 +2282,15 @@ def to_json( Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when orient is 'split' or 'table'. - - .. versionadded:: 0.23.0 - indent : int, optional Length of whitespace used to indent each record. .. versionadded:: 1.0.0 + {storage_options} + + .. 
versionadded:: 1.2.0 + Returns ------- None or str @@ -2142,6 +2308,10 @@ def to_json( and the default ``indent=None`` are equivalent in pandas, though this may change in a future release. + ``orient='table'`` contains a 'pandas_version' field under 'schema'. + This stores the version of `pandas` used in the latest revision of the + schema. + Examples -------- >>> import json @@ -2154,7 +2324,7 @@ def to_json( >>> result = df.to_json(orient="split") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { + {{ "columns": [ "col 1", "col 2" @@ -2173,7 +2343,7 @@ def to_json( "d" ] ] - } + }} Encoding/decoding a Dataframe using ``'records'`` formatted JSON. Note that index labels are not preserved with this encoding. @@ -2182,14 +2352,14 @@ def to_json( >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP [ - { + {{ "col 1": "a", "col 2": "b" - }, - { + }}, + {{ "col 1": "c", "col 2": "d" - } + }} ] Encoding/decoding a Dataframe using ``'index'`` formatted JSON: @@ -2197,32 +2367,32 @@ def to_json( >>> result = df.to_json(orient="index") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { - "row 1": { + {{ + "row 1": {{ "col 1": "a", "col 2": "b" - }, - "row 2": { + }}, + "row 2": {{ "col 1": "c", "col 2": "d" - } - } + }} + }} Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: >>> result = df.to_json(orient="columns") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { - "col 1": { + {{ + "col 1": {{ "row 1": "a", "row 2": "c" - }, - "col 2": { + }}, + "col 2": {{ "row 1": "b", "row 2": "d" - } - } + }} + }} Encoding/decoding a Dataframe using ``'values'`` formatted JSON: @@ -2245,40 +2415,40 @@ def to_json( >>> result = df.to_json(orient="table") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - { - "schema": { + {{ + "schema": {{ "fields": [ - { + {{ "name": "index", "type": "string" - }, - { + }}, + {{ "name": "col 1", "type": "string" - }, - { + }}, + {{ "name": "col 2", "type": "string" - } + }} ], "primaryKey": [ "index" ], "pandas_version": "0.20.0" - }, + }}, "data": [ - { + {{ "index": "row 1", "col 1": "a", "col 2": "b" - }, - { + }}, + {{ "index": "row 2", "col 1": "c", "col 2": "d" - } + }} ] - } + }} """ from pandas.io import json @@ -2303,8 +2473,10 @@ def to_json( compression=compression, index=index, indent=indent, + storage_options=storage_options, ) + @final def to_hdf( self, path_or_buf, @@ -2446,6 +2618,7 @@ def to_hdf( encoding=encoding, ) + @final def to_sql( self, name: str, @@ -2556,7 +2729,8 @@ def to_sql( >>> engine.execute("SELECT * FROM users").fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] - An `sqlalchemy.engine.Connection` can also be passed to to `con`: + An `sqlalchemy.engine.Connection` can also be passed to `con`: + >>> with engine.begin() as connection: ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) ... df1.to_sql('users', con=connection, if_exists='append') @@ -2612,11 +2786,14 @@ def to_sql( method=method, ) + @final + @doc(storage_options=_shared_docs["storage_options"]) def to_pickle( self, path, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: StorageOptions = None, ) -> None: """ Pickle (serialize) object to file. @@ -2625,18 +2802,29 @@ def to_pickle( ---------- path : str File path where the pickled object will be stored. 
- compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ + compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, \ default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. + Compression mode may be any of the following possible + values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If compression + mode is 'infer' and path_or_buf is path-like, then detect + compression mode from the following extensions: + '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). + If dict given and mode is 'zip' or inferred as 'zip', other entries + passed as additional compression options. protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible - values are 0, 1, 2, 3, 4. A negative value for the protocol + values are 0, 1, 2, 3, 4, 5. A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html. + {storage_options} + + .. versionadded:: 1.2.0 + See Also -------- read_pickle : Load pickled pandas object (or any object) from file. @@ -2646,7 +2834,7 @@ def to_pickle( Examples -------- - >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) + >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) >>> original_df foo bar 0 0 5 @@ -2670,8 +2858,15 @@ def to_pickle( """ from pandas.io.pickle import to_pickle - to_pickle(self, path, compression=compression, protocol=protocol) + to_pickle( + self, + path, + compression=compression, + protocol=protocol, + storage_options=storage_options, + ) + @final def to_clipboard( self, excel: bool_t = True, sep: Optional[str] = None, **kwargs ) -> None: """ @@ -2733,6 +2928,7 @@ def to_clipboard( clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) + @final def to_xarray(self): """ Return an xarray object from the pandas object. @@ -2816,7 +3012,8 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' else: return xarray.Dataset.from_dataframe(self) - @Substitution(returns=fmt.return_docstring) + @final + @doc(returns=fmt.return_docstring) def to_latex( self, buf=None, @@ -2840,20 +3037,21 @@ def to_latex( multirow=None, caption=None, label=None, + position=None, ): r""" Render object to a LaTeX tabular, longtable, or nested table/tabular. - Requires ``\usepackage{booktabs}``. The output can be copy/pasted + Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted into a main LaTeX document or read from an external file - with ``\input{table.tex}``. - - .. versionchanged:: 0.20.2 - Added to Series. + with ``\input{{table.tex}}``. .. versionchanged:: 1.0.0 Added caption and label arguments. + .. versionchanged:: 1.2.0 + Added position argument, changed meaning of caption argument. + Parameters ---------- buf : str, Path or StringIO-like, optional, default None @@ -2869,13 +3067,13 @@ def to_latex( Write row names (index). na_rep : str, default 'NaN' Missing data representation. - formatters : list of functions or dict of {str: function}, optional + formatters : list of functions or dict of {{str: function}}, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string. List must be of length equal to the number of columns. float_format : one-parameter function or str, optional, default None Formatter for floating point numbers.
For example - ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will + ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will both result in 0.1234 being formatted as 0.12. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print @@ -2893,7 +3091,7 @@ def to_latex( longtable : bool, optional By default, the value will be read from the pandas config module. Use a longtable environment instead of tabular. Requires - adding a \usepackage{longtable} to your LaTeX preamble. + adding a \usepackage{{longtable}} to your LaTeX preamble. escape : bool, optional By default, the value will be read from the pandas config module. When set to False prevents from escaping latex special @@ -2911,21 +3109,31 @@ def to_latex( The default will be read from the config module. multirow : bool, default False Use \multirow to enhance MultiIndex rows. Requires adding a - \usepackage{multirow} to your LaTeX preamble. Will print + \usepackage{{multirow}} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. - caption : str, optional - The LaTeX caption to be placed inside ``\caption{}`` in the output. + caption : str or tuple, optional + Tuple (full_caption, short_caption), + which results in ``\caption[short_caption]{{full_caption}}``; + if a single string is passed, no short caption will be set. .. versionadded:: 1.0.0 + .. versionchanged:: 1.2.0 + Optionally allow caption to be a tuple ``(full_caption, short_caption)``. + label : str, optional - The LaTeX label to be placed inside ``\label{}`` in the output. - This is used with ``\ref{}`` in the main ``.tex`` file. + The LaTeX label to be placed inside ``\label{{}}`` in the output. + This is used with ``\ref{{}}`` in the main ``.tex`` file. .. versionadded:: 1.0.0 - %(returns)s + position : str, optional + The LaTeX positional argument for tables, to be placed after + ``\begin{{}}`` in the output. + + .. versionadded:: 1.2.0 + {returns} See Also -------- DataFrame.to_string : Render a DataFrame to a console-friendly @@ -2934,18 +3142,18 @@ def to_latex( Examples -------- - >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], - ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}) + >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], + ... mask=['red', 'purple'], + ... 
weapon=['sai', 'bo staff'])) >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE - \begin{tabular}{lll} + \begin{{tabular}}{{lll}} \toprule name & mask & weapon \\ \midrule Raphael & red & sai \\ Donatello & purple & bo staff \\ \bottomrule - \end{tabular} + \end{{tabular}} """ # Get defaults from the pandas config if self.ndim == 1: @@ -2961,6 +3169,7 @@ def to_latex( if multirow is None: multirow = config.get_option("display.latex.multirow") + self = cast("DataFrame", self) formatter = DataFrameFormatter( self, columns=columns, @@ -2976,7 +3185,7 @@ def to_latex( escape=escape, decimal=decimal, ) - return formatter.to_latex( + return DataFrameRenderer(formatter).to_latex( buf=buf, column_format=column_format, longtable=longtable, @@ -2986,8 +3195,11 @@ def to_latex( multirow=multirow, caption=caption, label=label, + position=position, ) + @final + @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -2997,10 +3209,10 @@ def to_csv( columns: Optional[Sequence[Label]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, + index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, - compression: Optional[Union[str, Mapping[str, str]]] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, quotechar: str = '"', line_terminator: Optional[str] = None, @@ -3008,8 +3220,9 @@ def to_csv( date_format: Optional[str] = None, doublequote: bool_t = True, escapechar: Optional[str] = None, - decimal: Optional[str] = ".", + decimal: str = ".", errors: str = "strict", + storage_options: StorageOptions = None, ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3021,13 +3234,18 @@ def to_csv( ---------- path_or_buf : str or file handle, default None File path or object, if None is provided the result is returned as - a string. If a file object is passed it should be opened with - `newline=''`, disabling universal newlines. + a string. If a non-binary file object is passed, it should be opened + with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` might need to contain a `'b'`. .. versionchanged:: 0.24.0 Was previously named "path" for Series. + .. versionchanged:: 1.2.0 + + Support for binary file objects was introduced. + sep : str, default ',' String of length 1. Field delimiter for the output file. na_rep : str, default '' @@ -3056,15 +3274,16 @@ def to_csv( Python write mode, default 'w'. encoding : str, optional A string representing the encoding to use in the output file, - defaults to 'utf-8'. + defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` + is a non-binary file object. compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following - possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + possible values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as + and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above, other entries passed as additional compression options. 
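To make the compression semantics above concrete, here is a minimal sketch (an editor's illustration, not part of the patch; it assumes pandas >= 1.2 as described in the versionchanged notes that follow):

    import pandas as pd

    df = pd.DataFrame({"name": ["Raphael", "Donatello"], "mask": ["red", "purple"]})

    # Path-like target: 'infer' picks gzip from the ".gz" extension.
    df.to_csv("turtles.csv.gz", index=False)

    # Dict form: extra entries are forwarded to gzip.GzipFile, so mtime=0
    # makes the compressed output byte-identical across runs.
    df.to_csv("turtles.csv.gz", index=False,
              compression={"method": "gzip", "mtime": 0})

    # Binary file object: note the "b" in the open mode.
    with open("turtles.csv.gz", "wb") as handle:
        df.to_csv(handle, index=False, compression="gzip")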
@@ -3080,6 +3299,16 @@ def to_csv( supported for compression modes 'gzip' and 'bz2' as well as 'zip'. + .. versionchanged:: 1.2.0 + + Compression is supported for binary file objects. + + .. versionchanged:: 1.2.0 + + Previous versions forwarded dict entries for 'gzip' to + `gzip.open` instead of `gzip.GzipFile` which prevented + setting `mtime`. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -3111,6 +3340,10 @@ def to_csv( .. versionadded:: 1.1.0 + {storage_options} + + .. versionadded:: 1.2.0 + Returns ------- None or str @@ -3124,9 +3357,9 @@ def to_csv( Examples -------- - >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}) + ... 'weapon': ['sai', 'bo staff']}}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' @@ -3139,10 +3372,16 @@ def to_csv( """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() - from pandas.io.formats.csvs import CSVFormatter + formatter = DataFrameFormatter( + frame=df, + header=header, + index=index, + na_rep=na_rep, + float_format=float_format, + decimal=decimal, + ) - formatter = CSVFormatter( - df, + return DataFrameRenderer(formatter).to_csv( path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3150,11 +3389,7 @@ def to_csv( errors=errors, compression=compression, quoting=quoting, - na_rep=na_rep, - float_format=float_format, - cols=columns, - header=header, - index=index, + columns=columns, index_label=index_label, mode=mode, chunksize=chunksize, @@ -3162,18 +3397,13 @@ def to_csv( date_format=date_format, doublequote=doublequote, escapechar=escapechar, - decimal=decimal, + storage_options=storage_options, ) - formatter.save() - - if path_or_buf is None: - return formatter.path_or_buf.getvalue() - - return None # ---------------------------------------------------------------------- # Lookup Caching + @final def _set_as_cached(self, item, cacher) -> None: """ Set the _cacher attribute on the calling object with a weakref to @@ -3181,6 +3411,7 @@ def _set_as_cached(self, item, cacher) -> None: """ self._cacher = (item, weakref.ref(cacher)) + @final def _reset_cacher(self) -> None: """ Reset the cacher. @@ -3188,6 +3419,7 @@ def _reset_cacher(self) -> None: if hasattr(self, "_cacher"): del self._cacher + @final def _maybe_cache_changed(self, item, value) -> None: """ The object has called back to us saying maybe it has changed. 
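The caching helpers above hinge on `weakref.ref`: a derived object keeps only a weak reference to the frame that produced it, so the parent can still be garbage-collected. A standalone sketch of the same pattern (the Parent/Child classes here are hypothetical stand-ins, not pandas API; the immediate collection after `del` assumes CPython's reference counting):

    import weakref

    class Parent:
        def __init__(self):
            self._item_cache = {}

    class Child:
        def _set_as_cached(self, item, cacher):
            # store (label, weak reference), mirroring NDFrame._cacher
            self._cacher = (item, weakref.ref(cacher))

    parent = Parent()
    child = Child()
    child._set_as_cached("a", parent)
    assert child._cacher[1]() is parent  # calling the ref dereferences it

    del parent
    # the referent is gone, so the ref returns None -- the "dead referent"
    # case that _maybe_update_cacher guards against above
    assert child._cacher[1]() is None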
@@ -3195,11 +3427,13 @@ def _maybe_cache_changed(self, item, value) -> None: loc = self._info_axis.get_loc(item) self._mgr.iset(loc, value) + @final @property def _is_cached(self) -> bool_t: """Return boolean indicating if self is cached or not.""" return getattr(self, "_cacher", None) is not None + @final def _get_cacher(self): """return my cacher or None""" cacher = getattr(self, "_cacher", None) @@ -3207,6 +3441,7 @@ def _get_cacher(self): cacher = cacher[1]() return cacher + @final def _maybe_update_cacher( self, clear: bool_t = False, verify_is_copy: bool_t = True ) -> None: @@ -3225,7 +3460,7 @@ def _maybe_update_cacher( if cacher is not None: ref = cacher[1]() - # we are trying to reference a dead referant, hence + # we are trying to reference a dead referent, hence # a copy if ref is None: del self._cacher @@ -3233,13 +3468,18 @@ def _maybe_update_cacher( if len(self) == len(ref): # otherwise, either self or ref has swapped in new arrays ref._maybe_cache_changed(cacher[0], self) + else: + # GH#33675 we have swapped in a new array, so parent + # reference to self is now invalid + ref._item_cache.pop(cacher[0], None) if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t="referant") + self._check_setitem_copy(stacklevel=5, t="referent") if clear: self._clear_item_cache() + @final def _clear_item_cache(self) -> None: self._item_cache.clear() @@ -3336,13 +3576,16 @@ class max_speed stacklevel=2, ) - nv.validate_take(tuple(), kwargs) + nv.validate_take((), kwargs) + + self._consolidate_inplace() new_data = self._mgr.take( indices, axis=self._get_block_manager_axis(axis), verify=True ) return self._constructor(new_data).__finalize__(self, method="take") + @final def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries: """ Internal version of the `take` method that sets the `_is_copy` @@ -3357,6 +3600,7 @@ def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries: result._set_is_copy(self) return result + @final def xs(self, key, axis=0, level=None, drop_level: bool_t = True): """ Return cross-section from the Series/DataFrame. 
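For orientation before the `xs` hunks below: `take`, which `_take_with_is_copy` wraps above, selects by integer position along an axis and returns a new object. A small usage sketch (any recent pandas):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # positional selection along the index: rows 2 and 0, in that order
    print(df.take([2, 0]))

    # axis=1 selects by column position instead
    print(df.take([1], axis=1))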
@@ -3473,13 +3717,23 @@ class animal locomotion return result if axis == 1: - return self[key] + if drop_level: + return self[key] + index = self.columns + else: + index = self.index + + self._consolidate_inplace() - index = self.index if isinstance(index, MultiIndex): - loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + try: + loc, new_index = index._get_loc_level( + key, level=0, drop_level=drop_level + ) + except TypeError as e: + raise TypeError(f"Expected label or tuple of labels, got {key}") from e else: - loc = self.index.get_loc(key) + loc = index.get_loc(key) if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: @@ -3489,9 +3743,9 @@ class animal locomotion return self._take_with_is_copy(loc, axis=axis) if not is_scalar(loc): - new_index = self.index[loc] + new_index = index[loc] - if is_scalar(loc): + if is_scalar(loc) and axis == 0: # In this case loc should be an integer if self.ndim == 1: # if we encounter an array-like and we only have 1 dim @@ -3507,7 +3761,10 @@ class animal locomotion name=self.index[loc], dtype=new_values.dtype, ) - + elif is_scalar(loc): + result = self.iloc[:, slice(loc, loc + 1)] + elif axis == 1: + result = self.iloc[:, loc] else: result = self.iloc[loc] result.index = new_index @@ -3520,6 +3777,7 @@ class animal locomotion def __getitem__(self, item): raise AbstractMethodError(self) + @final def _get_item_cache(self, item): """Return the cached item, item represents a label indexer.""" cache = self._item_cache @@ -3530,7 +3788,7 @@ def _get_item_cache(self, item): loc = self.columns.get_loc(item) values = self._mgr.iget(loc) - res = self._box_col_values(values, loc) + res = self._box_col_values(values, loc).__finalize__(self) cache[item] = res res._set_as_cached(item, self) @@ -3556,20 +3814,7 @@ def _slice(self: FrameOrSeries, slobj: slice, axis=0) -> FrameOrSeries: result._set_is_copy(self, copy=is_copy) return result - def _iset_item(self, loc: int, value) -> None: - self._mgr.iset(loc, value) - self._clear_item_cache() - - def _set_item(self, key, value) -> None: - try: - loc = self._info_axis.get_loc(key) - except KeyError: - # This item wasn't present, just insert at end - self._mgr.insert(len(self._info_axis), key, value) - return - - NDFrame._iset_item(self, loc, value) - + @final def _set_is_copy(self, ref, copy: bool_t = True) -> None: if not copy: self._is_copy = None @@ -3577,6 +3822,7 @@ def _set_is_copy(self, ref, copy: bool_t = True) -> None: assert ref is not None self._is_copy = weakref.ref(ref) + @final def _check_is_chained_assignment_possible(self) -> bool_t: """ Check if we are a view, have a cacher, and are of mixed type. 
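The rewritten lookup paths above are easiest to see through a concrete `xs` call on a MultiIndex; a short sketch (an illustration, not taken from the patch):

    import pandas as pd

    idx = pd.MultiIndex.from_tuples(
        [("mammal", "dog"), ("mammal", "cat"), ("bird", "hawk")],
        names=["class", "animal"],
    )
    df = pd.DataFrame({"speed": [40, 30, 150]}, index=idx)

    # by default xs drops the level it matched on ...
    print(df.xs("mammal"))

    # ... while drop_level=False keeps the full MultiIndex -- the case the
    # patch now routes through _get_loc_level
    print(df.xs("mammal", drop_level=False))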
@@ -3591,12 +3837,13 @@ def _check_is_chained_assignment_possible(self) -> bool_t: if self._is_view and self._is_cached: ref = self._get_cacher() if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t="referant", force=True) + self._check_setitem_copy(stacklevel=4, t="referent", force=True) return True elif self._is_copy: - self._check_setitem_copy(stacklevel=4, t="referant") + self._check_setitem_copy(stacklevel=4, t="referent") return False + @final def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): """ @@ -3639,7 +3886,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): # the copy weakref if self._is_copy is not None and not isinstance(self._is_copy, str): r = self._is_copy() - if not gc.get_referents(r) or r.shape == self.shape: + if not gc.get_referents(r) or (r is not None and r.shape == self.shape): self._is_copy = None return @@ -3647,7 +3894,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): if isinstance(self._is_copy, str): t = self._is_copy - elif t == "referant": + elif t == "referent": t = ( "\n" "A value is trying to be set on a copy of a slice from a " @@ -3711,6 +3958,15 @@ def __delitem__(self, key) -> None: # ---------------------------------------------------------------------- # Unsorted + @final + def _check_inplace_and_allows_duplicate_labels(self, inplace): + if inplace and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'inplace=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) + + @final def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). @@ -3730,11 +3986,13 @@ def get(self, key, default=None): except (KeyError, ValueError, IndexError): return default + @final @property def _is_view(self) -> bool_t: """Return boolean indicating if self is view of another array """ return self._mgr.is_view + @final def reindex_like( self: FrameOrSeries, other, @@ -3773,7 +4031,7 @@ def reindex_like( Maximum number of consecutive labels to fill for inexact matches. tolerance : optional Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most + matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Tolerance may be a scalar value, which applies the same tolerance @@ -3882,6 +4140,7 @@ def drop( else: return obj + @final def _drop_axis( self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" ) -> FrameOrSeries: @@ -3937,6 +4196,7 @@ def _drop_axis( return result + @final def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: """ Replace self internals with result. @@ -3954,6 +4214,7 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: self._mgr = result._mgr self._maybe_update_cacher(verify_is_copy=verify_is_copy) + @final def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: """ Prefix labels with string `prefix`. 
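The `add_prefix` hunk that follows keeps building its renamer with `functools.partial` over `str.format`; the trick is small enough to show in isolation (a sketch, not part of the patch):

    import functools

    import pandas as pd

    # partial binds the keyword; each label then fills the positional slot
    f = functools.partial("{prefix}{}".format, prefix="col_")
    assert f("a") == "col_a"

    df = pd.DataFrame({"a": [1], "b": [2]})
    print(df.add_prefix("col_").columns.tolist())  # ['col_a', 'col_b']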
@@ -4011,8 +4272,13 @@ def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) # type: ignore + # error: Incompatible return value type (got "Optional[FrameOrSeries]", + # expected "FrameOrSeries") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + return self.rename(**mapper) # type: ignore[return-value, arg-type] + @final def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ Suffix labels with string `suffix`. @@ -4070,7 +4336,11 @@ def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) # type: ignore + # error: Incompatible return value type (got "Optional[FrameOrSeries]", + # expected "FrameOrSeries") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + return self.rename(**mapper) # type: ignore[return-value, arg-type] def sort_values( self, @@ -4121,7 +4391,7 @@ def sort_values( Returns ------- DataFrame or None - DataFrame with sorted values if inplace=False, None otherwise. + DataFrame with sorted values or None if ``inplace=True``. See Also -------- @@ -4199,9 +4469,79 @@ def sort_values( 3 NaN 8 4 D 4 D 7 2 e 5 C 4 3 F + + Natural sort with the key argument, + using the `natsort ` package. + + >>> df = pd.DataFrame({ + ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], + ... "value": [10, 20, 30, 40, 50] + ... }) + >>> df + time value + 0 0hr 10 + 1 128hr 20 + 2 72hr 30 + 3 48hr 40 + 4 96hr 50 + >>> from natsort import index_natsorted + >>> df.sort_values( + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... 
) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 """ raise AbstractMethodError(self) + def sort_index( + self, + axis=0, + level=None, + ascending: bool_t = True, + inplace: bool_t = False, + kind: str = "quicksort", + na_position: str = "last", + sort_remaining: bool_t = True, + ignore_index: bool_t = False, + key: IndexKeyFunc = None, + ): + + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + target = self._get_axis(axis) + + indexer = get_indexer_indexer( + target, level, ascending, kind, na_position, sort_remaining, key + ) + + if indexer is None: + if inplace: + return + else: + return self.copy() + + baxis = self._get_block_manager_axis(axis) + new_data = self._mgr.take(indexer, axis=baxis, verify=False) + + # reconstruct axis if needed + new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + + if ignore_index: + axis = 1 if isinstance(self, ABCDataFrame) else 0 + new_data.axes[axis] = ibase.default_index(len(indexer)) + + result = self._constructor(new_data) + + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_index") + @doc( klass=_shared_doc_kwargs["klass"], axes=_shared_doc_kwargs["axes"], @@ -4453,6 +4793,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: axes, level, limit, tolerance, method, fill_value, copy ).__finalize__(self, method="reindex") + @final def _reindex_axes( self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy ) -> FrameOrSeries: @@ -4478,6 +4819,7 @@ def _reindex_axes( return obj + @final def _needs_reindex_multi(self, axes, method, level) -> bool_t: """Check if we do need a multi reindex.""" return ( @@ -4490,6 +4832,7 @@ def _needs_reindex_multi(self, axes, method, level) -> bool_t: def _reindex_multi(self, axes, copy, fill_value): raise AbstractMethodError(self) + @final def _reindex_with_indexers( self: FrameOrSeries, reindexers, @@ -4614,14 +4957,15 @@ def filter( return self.reindex(**{name: [r for r in items if r in labels]}) elif like: - def f(x): + def f(x) -> bool: + assert like is not None # needed for mypy return like in ensure_str(x) values = labels.map(f) return self.loc(axis=axis)[values] elif regex: - def f(x): + def f(x) -> bool: return matcher.search(ensure_str(x)) is not None matcher = re.compile(regex) @@ -4630,6 +4974,7 @@ def f(x): else: raise TypeError("Must pass either `items`, `like`, or `regex`") + @final def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ Return the first `n` rows. @@ -4702,6 +5047,7 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ return self.iloc[:n] + @final def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ Return the last `n` rows. @@ -4776,6 +5122,7 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: return self.iloc[0:0] return self.iloc[-n:] + @final def sample( self: FrameOrSeries, n=None, @@ -4984,6 +5331,7 @@ def sample( locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis) + @final @doc(klass=_shared_doc_kwargs["klass"]) def pipe(self, func, *args, **kwargs): r""" @@ -5035,54 +5383,13 @@ def pipe(self, func, *args, **kwargs): ... .pipe(g, arg1=a) ... .pipe((func, 'arg2'), arg1=a, arg3=c) ... ) # doctest: +SKIP - """ - return com.pipe(self, func, *args, **kwargs) - - _shared_docs["aggregate"] = dedent( """ - Aggregate using one or more operations over the specified axis. 
- {versionadded} - Parameters - ---------- - func : function, str, list or dict - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. - {axis} - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - scalar, Series or DataFrame - - The return can be: - - * scalar : when Series.agg is called with single function - * Series : when DataFrame.agg is called with a single function - * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. - {see_also} - Notes - ----- - `agg` is an alias for `aggregate`. Use the alias. - - A passed user-defined-function will be passed a Series for evaluation. - {examples}""" - ) + return com.pipe(self, func, *args, **kwargs) # ---------------------------------------------------------------------- # Attribute access + @final def __finalize__( self: FrameOrSeries, other, method: Optional[str] = None, **kwargs ) -> FrameOrSeries: @@ -5097,7 +5404,7 @@ def __finalize__( A passed method name providing context on where ``__finalize__`` was called. - .. warning: + .. warning:: The value passed as `method` are not currently considered stable across pandas releases. @@ -5105,10 +5412,19 @@ def __finalize__( if isinstance(other, NDFrame): for name in other.attrs: self.attrs[name] = other.attrs[name] + + self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. - for name in self._metadata: + for name in set(self._metadata) & set(other._metadata): assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) + + if method == "concat": + allows_duplicate_labels = all( + x.flags.allows_duplicate_labels for x in other.objs + ) + self.flags.allows_duplicate_labels = allows_duplicate_labels + return self def __getattr__(self, name: str): @@ -5170,21 +5486,21 @@ def __setattr__(self, name: str, value) -> None: ) object.__setattr__(self, name, value) - def _dir_additions(self): + @final + def _dir_additions(self) -> Set[str]: """ add the string-like attributes from the info_axis. - If info_axis is a MultiIndex, it's first level values are used. + If info_axis is a MultiIndex, its first level values are used. """ - additions = { - c - for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, str) and c.isidentifier() - } - return super()._dir_additions().union(additions) + additions = super()._dir_additions() + if self._info_axis._can_hold_strings: + additions.update(self._info_axis._dir_additions_for_owner) + return additions # ---------------------------------------------------------------------- # Consolidation of internals + @final def _protect_consolidate(self, f): """ Consolidate _mgr -- if the blocks have changed, then clear the @@ -5196,6 +5512,7 @@ def _protect_consolidate(self, f): self._clear_item_cache() return result + @final def _consolidate_inplace(self) -> None: """Consolidate data in place and return None""" @@ -5204,33 +5521,34 @@ def f(): self._protect_consolidate(f) - def _consolidate(self, inplace: bool_t = False): + @final + def _consolidate(self): """ Compute NDFrame with "consolidated" internals (data of each dtype grouped together in a single ndarray). 
- Parameters - ---------- - inplace : bool, default False - If False return new object, otherwise modify existing object. - Returns ------- consolidated : same type as caller """ - inplace = validate_bool_kwarg(inplace, "inplace") - if inplace: - self._consolidate_inplace() - else: - f = lambda: self._mgr.consolidate() - cons_data = self._protect_consolidate(f) - return self._constructor(cons_data).__finalize__(self) + f = lambda: self._mgr.consolidate() + cons_data = self._protect_consolidate(f) + return self._constructor(cons_data).__finalize__(self) + @final @property def _is_mixed_type(self) -> bool_t: - f = lambda: self._mgr.is_mixed_type - return self._protect_consolidate(f) + if self._mgr.is_single_block: + return False + + if self._mgr.any_extension_types: + # Even if they have the same dtype, we can't consolidate them, + # so we pretend this is "mixed" + return True + + return self.dtypes.nunique() > 1 + @final def _check_inplace_setting(self, value) -> bool_t: """ check whether we allow in-place setting with this type of value """ if self._is_mixed_type: @@ -5247,9 +5565,11 @@ def _check_inplace_setting(self, value) -> bool_t: return True + @final def _get_numeric_data(self): return self._constructor(self._mgr.get_numeric_data()).__finalize__(self) + @final def _get_bool_data(self): return self._constructor(self._mgr.get_bool_data()).__finalize__(self) @@ -5369,6 +5689,7 @@ def dtypes(self): data = self._mgr.get_dtypes() return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) + @final def _to_dict_of_blocks(self, copy: bool_t = True): """ Return a dict of dtype -> Constructor Types that @@ -5534,7 +5855,7 @@ def astype( else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) return self._constructor(new_data).__finalize__(self, method="astype") # GH 33113: handle empty frame or series @@ -5546,6 +5867,7 @@ def astype( result.columns = self.columns return result + @final def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: """ Make a copy of this object's indices and data. @@ -5655,9 +5977,11 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: self._clear_item_cache() return self._constructor(data).__finalize__(self, method="copy") + @final def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: return self.copy(deep=deep) + @final def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: """ Parameters @@ -5667,12 +5991,12 @@ def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: """ return self.copy(deep=True) + @final def _convert( self: FrameOrSeries, datetime: bool_t = False, numeric: bool_t = False, timedelta: bool_t = False, - coerce: bool_t = False, ) -> FrameOrSeries: """ Attempt to infer better dtype for object columns @@ -5686,9 +6010,6 @@ def _convert( unconvertible values becoming NaN. timedelta : bool, default False If True, convert to timedelta where possible. - coerce : bool, default False - If True, force conversion with unconvertible values converted to - nulls (NaN or NaT).
Returns ------- @@ -5697,17 +6018,16 @@ def _convert( validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") - validate_bool_kwarg(coerce, "coerce") return self._constructor( self._mgr.convert( datetime=datetime, numeric=numeric, timedelta=timedelta, - coerce=coerce, copy=True, ) ).__finalize__(self) + @final def infer_objects(self: FrameOrSeries) -> FrameOrSeries: """ Attempt to infer better dtypes for object columns. @@ -5750,17 +6070,17 @@ def infer_objects(self: FrameOrSeries) -> FrameOrSeries: # python objects will still be converted to # native numpy numeric types return self._constructor( - self._mgr.convert( - datetime=True, numeric=False, timedelta=True, coerce=False, copy=True - ) + self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True) ).__finalize__(self, method="infer_objects") + @final def convert_dtypes( self: FrameOrSeries, infer_objects: bool_t = True, convert_string: bool_t = True, convert_integer: bool_t = True, convert_boolean: bool_t = True, + convert_floating: bool_t = True, ) -> FrameOrSeries: """ Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. @@ -5777,6 +6097,12 @@ def convert_dtypes( Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. + convert_floating : bool, defaults True + Whether, if possible, conversion can be done to floating extension types. + If `convert_integer` is also True, preference will be given to integer + dtypes if the floats can be faithfully cast to integers. + + .. versionadded:: 1.2.0 Returns ------- @@ -5794,19 +6120,25 @@ def convert_dtypes( ----- By default, ``convert_dtypes`` will attempt to convert a Series (or each Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options - ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is - possible to turn off individual conversions to ``StringDtype``, the integer - extension types or ``BooleanDtype``, respectively. + ``convert_string``, ``convert_integer``, ``convert_boolean`` and + ``convert_floating``, it is possible to turn off individual conversions + to ``StringDtype``, the integer extension types, ``BooleanDtype`` + or floating extension types, respectively. For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference rules as during normal Series/DataFrame construction. Then, if possible, - convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension - type, otherwise leave as ``object``. + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer + or floating extension type, otherwise leave as ``object``. If the dtype is integer, convert to an appropriate integer extension type. If the dtype is numeric, and consists of all integers, convert to an - appropriate integer extension type. + appropriate integer extension type. Otherwise, convert to an + appropriate floating extension type. + + .. versionchanged:: 1.2 + Starting with pandas 1.2, this method also converts float columns + to the nullable floating extension type. In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes.
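A quick illustration of the `convert_floating` behavior documented above (an editor's sketch; it assumes pandas >= 1.2, where the nullable Float64 dtype exists):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"f": [1.5, np.nan], "i": [1.0, 2.0]})
    dfn = df.convert_dtypes()

    # 'f' keeps fractional values -> nullable Float64 (new in 1.2);
    # 'i' holds only whole floats  -> Int64, because convert_integer takes
    # precedence when the floats cast faithfully to integers
    print(dfn.dtypes)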
@@ -5846,7 +6178,7 @@ def convert_dtypes( >>> dfn = df.convert_dtypes() >>> dfn a b c d e f - 0 1 x True h 10 NaN + 0 1 x True h 10 <NA> 1 2 y False i <NA> 100.5 2 3 z <NA> <NA> 20 200.0 @@ -5856,7 +6188,7 @@ def convert_dtypes( c boolean d string e Int64 - f float64 + f Float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. @@ -5878,12 +6210,20 @@ def convert_dtypes( """ if self.ndim == 1: return self._convert_dtypes( - infer_objects, convert_string, convert_integer, convert_boolean + infer_objects, + convert_string, + convert_integer, + convert_boolean, + convert_floating, ) else: results = [ col._convert_dtypes( - infer_objects, convert_string, convert_integer, convert_boolean + infer_objects, + convert_string, + convert_integer, + convert_boolean, + convert_floating, ) for col_name, col in self.items() ] @@ -6002,6 +6342,8 @@ def fillna( inplace = validate_bool_kwarg(inplace, "inplace") value, method = validate_fillna_kwargs(value, method) + self._consolidate_inplace() + # set the default here, so functions examining the signature # can detect if something was set (e.g. in groupby) (GH9221) if axis is None: axis = 0 axis = self._get_axis_number(axis) if value is None: - - if self._is_mixed_type and axis == 1: + if not self._mgr.is_single_block and axis == 1: if inplace: raise NotImplementedError() result = self.T.fillna(method=method, limit=limit).T @@ -6080,6 +6421,7 @@ def fillna( else: return result.__finalize__(self, method="fillna") + @final def ffill( self: FrameOrSeries, axis=None, @@ -6101,6 +6443,7 @@ def ffill( pad = ffill + @final def bfill( self: FrameOrSeries, axis=None, @@ -6127,8 +6470,8 @@ def replace( self, to_replace=None, value=None, - inplace=False, - limit=None, + inplace: bool_t = False, + limit: Optional[int] = None, regex=False, method="pad", ): @@ -6204,7 +6547,7 @@ def replace( If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. - limit : int, default None + limit : int or None, default None Maximum size gap to forward or backward fill. regex : bool or same types as `to_replace`, default False Whether to interpret `to_replace` and/or `value` as regular @@ -6216,13 +6559,10 @@ def replace( The method to use for replacement when `to_replace` is a scalar, list or tuple and `value` is ``None``. - .. versionchanged:: 0.23.0 - Added to DataFrame. Returns ------- - {klass} - Object after replacement. + {klass} or None + Object after replacement or None if ``inplace=True``. Raises ------ @@ -6377,20 +6717,6 @@ def replace( 1 new new 2 bait xyz - Note that when replacing multiple ``bool`` or ``datetime64`` objects, - the data types in the `to_replace` parameter must match the data - type of the value being replaced: - - >>> df = pd.DataFrame({{'A': [True, False, True], - ... 'B': [False, True, False]}}) - >>> df.replace({{'a string': 'new value', True: False}}) # raises - Traceback (most recent call last): - ... - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - This raises a ``TypeError`` because one of the ``dict`` keys is not of - the correct type for replacement.
- Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: @@ -6424,7 +6750,7 @@ def replace( 3 b 4 b dtype: object - """ + """ if not ( is_scalar(to_replace) or is_re_compilable(to_replace) @@ -6438,7 +6764,9 @@ def replace( inplace = validate_bool_kwarg(inplace, "inplace") if not is_bool(regex) and to_replace is not None: - raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") + raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") + + self._consolidate_inplace() if value is None: # passing a single value that is scalar like @@ -6448,10 +6776,14 @@ def replace( if isinstance(to_replace, (tuple, list)): if isinstance(self, ABCDataFrame): + from pandas import Series + return self.apply( - _single_replace, args=(to_replace, method, inplace, limit) + Series._replace_single, + args=(to_replace, method, inplace, limit), ) - return _single_replace(self, to_replace, method, inplace, limit) + self = cast("Series", self) + return self._replace_single(to_replace, method, inplace, limit) if not is_dict_like(to_replace): if not is_dict_like(regex): @@ -6464,7 +6796,10 @@ def replace( regex = True items = list(to_replace.items()) - keys, values = zip(*items) if items else ([], []) + if items: + keys, values = zip(*items) + else: + keys, values = ([], []) are_mappings = [is_dict_like(v) for v in values] @@ -6495,12 +6830,14 @@ def replace( # need a non-zero len on all axes if not self.size: - return self + if inplace: + return + return self.copy() if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} # Note: Checking below for `in foo.keys()` instead of - # `in foo`is needed for when we have a Series and not dict + # `in foo` is needed for when we have a Series and not dict mapping = { col: (to_replace[col], value[col]) for col in to_replace.keys() @@ -6523,25 +6860,25 @@ def replace( else: raise TypeError("value argument must be scalar, dict, or Series") - elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] - if is_list_like(value): - if len(to_replace) != len(value): - raise ValueError( - f"Replacement lists must match in length. " - f"Expecting {len(to_replace)} got {len(value)} " - ) - self._consolidate_inplace() - new_data = self._mgr.replace_list( - src_list=to_replace, - dest_list=value, - inplace=inplace, - regex=regex, - ) + elif is_list_like(to_replace): + if not is_list_like(value): + # e.g. to_replace = [NA, ''] and value is 0, + # so we replace NA with 0 and then replace '' with 0 + value = [value] * len(to_replace) - else: # [NA, ''] -> 0 - new_data = self._mgr.replace( - to_replace=to_replace, value=value, inplace=inplace, regex=regex + # e.g. we have to_replace = [NA, ''] and value = [0, 'missing'] + if len(to_replace) != len(value): + raise ValueError( + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " ) + new_data = self._mgr.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, + ) + elif to_replace is None: if not ( is_re_compilable(regex) @@ -6584,6 +6921,7 @@ def replace( else: return result.__finalize__(self, method="replace") + @final def interpolate( self: FrameOrSeries, method: str = "linear", @@ -6596,6 +6934,8 @@ def interpolate( **kwargs, ) -> Optional[FrameOrSeries]: """ + Fill NaN values using an interpolation method. + Please note that only ``method='linear'`` is supported for DataFrame/Series with a MultiIndex. 
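The `interpolate` changes beginning above continue below; as a short usage sketch of the default linear method they document (any recent pandas):

    import numpy as np
    import pandas as pd

    s = pd.Series([0.0, np.nan, np.nan, 3.0])

    # default linear interpolation fills interior NaNs from both neighbors
    print(s.interpolate())         # 0.0, 1.0, 2.0, 3.0

    # limit caps how many consecutive NaNs are filled
    print(s.interpolate(limit=1))  # 0.0, 1.0, NaN, 3.0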
@@ -6623,6 +6963,7 @@ def interpolate( `scipy.interpolate.BPoly.from_derivatives` which replaces 'piecewise_polynomial' interpolation method in scipy 0.18. + axis : {{0 or 'index', 1 or 'columns', None}}, default None Axis to interpolate along. limit : int, optional @@ -6657,18 +6998,16 @@ def interpolate( (interpolate). * 'outside': Only fill NaNs outside valid values (extrapolate). - .. versionadded:: 0.23.0 - downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. - **kwargs + ``**kwargs`` : optional Keyword arguments to pass on to the interpolating function. Returns ------- - Series or DataFrame + Series or DataFrame or None Returns the same object type as the caller, interpolated at - some or all ``NaN`` values. + some or all ``NaN`` values or None if ``inplace=True``. See Also -------- @@ -6799,6 +7138,9 @@ def interpolate( obj = self.T if should_transpose else self + if obj.empty: + return self.copy() + if method not in fillna_methods: axis = self._info_axis_number @@ -6833,6 +7175,7 @@ def interpolate( if method == "linear": # prior default index = np.arange(len(obj.index)) + index = Index(index) else: index = obj.index methods = {"index", "values", "nearest", "time"} @@ -6878,6 +7221,7 @@ def interpolate( # ---------------------------------------------------------------------- # Timeseries methods Methods + @final def asof(self, where, subset=None): """ Return the last row(s) without any NaNs before `where`. @@ -7025,10 +7369,13 @@ def asof(self, where, subset=None): nulls = self.isna() if is_series else self[subset].isna().any(1) if nulls.all(): if is_series: + self = cast("Series", self) return self._constructor(np.nan, index=where, name=self.name) elif is_list: + self = cast("DataFrame", self) return self._constructor(np.nan, index=where, columns=self.columns) else: + self = cast("DataFrame", self) return self._constructor_sliced( np.nan, index=self.columns, name=where[0] ) @@ -7061,7 +7408,7 @@ def isna(self: FrameOrSeries) -> FrameOrSeries: ------- {klass} Mask of bool values for each element in {klass} that - indicates whether an element is not an NA value. + indicates whether an element is an NA value. See Also -------- @@ -7074,11 +7421,11 @@ def isna(self: FrameOrSeries) -> FrameOrSeries: -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], - ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}}) + >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + ... born=[pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... name=['Alfred', 'Batman', ''], + ... toy=[None, 'Batmobile', 'Joker'])) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7141,11 +7488,11 @@ def notna(self: FrameOrSeries) -> FrameOrSeries: -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], - ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}}) + >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + ... born=[pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... name=['Alfred', 'Batman', ''], + ... 
toy=[None, 'Batmobile', 'Joker'])) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7179,6 +7526,7 @@ def notna(self: FrameOrSeries) -> FrameOrSeries: def notnull(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self, method="notnull") + @final def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if (lower is not None and np.any(isna(lower))) or ( upper is not None and np.any(isna(upper)) @@ -7204,6 +7552,7 @@ def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): else: return result + @final def _clip_with_one_bound(self, threshold, method, axis, inplace): if axis is not None: @@ -7224,9 +7573,10 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): if isinstance(self, ABCSeries): threshold = self._constructor(threshold, index=self.index) else: - threshold = _align_method_FRAME(self, threshold, axis, flex=None)[1] + threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] return self.where(subset, threshold, axis=axis, inplace=inplace) + @final def clip( self: FrameOrSeries, lower=None, @@ -7261,9 +7611,9 @@ def clip( Returns ------- - Series or DataFrame + Series or DataFrame or None Same type as calling object with the values outside the - clip boundaries replaced. + clip boundaries replaced or None if ``inplace=True``. See Also -------- @@ -7353,77 +7703,7 @@ def clip( return result - _shared_docs[ - "groupby" - ] = """ - Group %(klass)s using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the - object, applying a function, and combining the results. This can be - used to group large amounts of data and compute operations on these - groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. - If ``by`` is a function, it's called on each value of the object's - index. If a dict or Series is passed, the Series or dict VALUES - will be used to determine the groups (the Series' values are first - aligned; see ``.align()`` method). If an ndarray is passed, the - values are used as-is determine the groups. A label or list of - labels may be passed to group by the columns in ``self``. Notice - that a tuple is interpreted as a (single) key. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as the - index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output. - sort : bool, default True - Sort group keys. Get better performance by turning this off. - Note this does not influence the order of observations within each - group. Groupby preserves the order of rows within each group. - group_keys : bool, default True - When calling apply, add group keys to index to identify pieces. - squeeze : bool, default False - Reduce the dimensionality of the return type if possible, - otherwise return a consistent type. - - .. deprecated:: 1.1.0 - - observed : bool, default False - This only applies if any of the groupers are Categoricals. - If True: only show observed values for categorical groupers. - If False: show all values for categorical groupers. - - .. 
versionadded:: 0.23.0 - dropna : bool, default True - If True, and if group keys contain NA values, NA values together - with row/column will be dropped. - If False, NA values will also be treated as the key in groups - - .. versionadded:: 1.1.0 - - Returns - ------- - %(klass)sGroupBy - Returns a groupby object that contains information about the groups. - - See Also - -------- - resample : Convenience method for frequency conversion and resampling - of time series. - - Notes - ----- - See the `user guide - `_ for more. - """ - + @final def asfreq( self: FrameOrSeries, freq, @@ -7535,6 +7815,7 @@ def asfreq( fill_value=fill_value, ) + @final def at_time( self: FrameOrSeries, time, asof: bool_t = False, axis=None ) -> FrameOrSeries: @@ -7593,6 +7874,7 @@ def at_time( indexer = index.indexer_at_time(time, asof=asof) return self._take_with_is_copy(indexer, axis=axis) + @final def between_time( self: FrameOrSeries, start_time, @@ -7673,10 +7955,11 @@ def between_time( raise TypeError("Index must be DatetimeIndex") indexer = index.indexer_between_time( - start_time, end_time, include_start=include_start, include_end=include_end, + start_time, end_time, include_start=include_start, include_end=include_end ) return self._take_with_is_copy(indexer, axis=axis) + @final def resample( self, rule, @@ -7691,7 +7974,7 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, - ) -> "Resampler": + ) -> Resampler: """ Resample time-series data. @@ -7927,8 +8210,8 @@ def resample( For DataFrame objects, the keyword `on` can be used to specify the column instead of the index for resampling. - >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) + >>> d = {'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]} >>> df = pd.DataFrame(d) >>> df['week_starting'] = pd.date_range('01/01/2018', ... periods=8, @@ -7953,8 +8236,8 @@ def resample( specify on which level the resampling needs to take place. >>> days = pd.date_range('1/1/2000', periods=4, freq='D') - >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) + >>> d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]} >>> df2 = pd.DataFrame(d2, ... index=pd.MultiIndex.from_product([days, ... ['morning', @@ -8078,6 +8361,7 @@ def resample( offset=offset, ) + @final def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ Select initial periods of time series data based on a date offset. @@ -8146,6 +8430,7 @@ def first(self: FrameOrSeries, offset) -> FrameOrSeries: return self.loc[:end] + @final def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Select final periods of time series data based on a date offset. @@ -8209,6 +8494,7 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: start = self.index.searchsorted(start_date, side="right") return self.iloc[start:] + @final def rank( self: FrameOrSeries, axis=0, @@ -8332,36 +8618,7 @@ def ranker(data): return ranker(data) - _shared_docs[ - "compare" - ] = """ - Compare to another %(klass)s and show the differences. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - other : %(klass)s - Object to compare with. - - align_axis : {0 or 'index', 1 or 'columns'}, default 1 - Determine which axis to align the comparison on. 
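The `resample` docstring cleanups above replace the redundant `dict({...})` constructions with plain dict literals; the method itself is unchanged. For orientation, a minimal downsampling sketch in the style of the surrounding doctests:

```python
import pandas as pd

index = pd.date_range("1/1/2000", periods=9, freq="T")
series = pd.Series(range(9), index=index)

# Downsample into 3-minute bins and sum the values falling into each bin.
print(series.resample("3T").sum())
```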
- - * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. - * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. - - keep_shape : bool, default False - If true, all rows and columns are kept. - Otherwise, only the ones with different values are kept. - - keep_equal : bool, default False - If true, the result keeps values that are equal. - Otherwise, equal values are shown as NaNs. - """ - - @Appender(_shared_docs["compare"] % _shared_doc_kwargs) + @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) def compare( self, other, @@ -8558,6 +8815,7 @@ def align( else: # pragma: no cover raise TypeError(f"unsupported type: {type(other)}") + @final def _align_frame( self, other, @@ -8615,6 +8873,10 @@ def _align_frame( if is_datetime64tz_dtype(left.index.dtype): if left.index.tz != right.index.tz: if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() left.index = join_index right.index = join_index @@ -8623,6 +8885,7 @@ def _align_frame( right.__finalize__(other), ) + @final def _align_series( self, other, @@ -8702,6 +8965,10 @@ def _align_series( if is_datetime64tz_dtype(left.index.dtype): if left.index.tz != right.index.tz: if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() left.index = join_index right.index = join_index @@ -8710,6 +8977,7 @@ def _align_series( right.__finalize__(other), ) + @final def _where( self, cond, @@ -8759,7 +9027,6 @@ def _where( cond = -cond if inplace else cond # try to align with other - try_quick = True if isinstance(other, NDFrame): # align with me @@ -8798,12 +9065,11 @@ def _where( # match True cond to other elif len(cond[icond]) == len(other): - # try to not change dtype at first (if try_quick) - if try_quick: - new_other = np.asarray(self) - new_other = new_other.copy() - new_other[icond] = other - other = new_other + # try to not change dtype at first + new_other = np.asarray(self) + new_other = new_other.copy() + new_other[icond] = other + other = new_other else: raise ValueError( @@ -8840,7 +9106,7 @@ def _where( self._check_inplace_setting(other) new_data = self._mgr.putmask( - mask=cond, new=other, align=align, axis=block_axis, + mask=cond, new=other, align=align, axis=block_axis ) result = self._constructor(new_data) return self._update_inplace(result) @@ -8857,6 +9123,7 @@ def _where( result = self._constructor(new_data) return result.__finalize__(self) + @final @doc( klass=_shared_doc_kwargs["klass"], cond="True", @@ -8909,7 +9176,7 @@ def where( Returns ------- - Same type as caller + Same type as caller or None if ``inplace=True``. See Also -------- @@ -8940,7 +9207,6 @@ def where( 3 3.0 4 4.0 dtype: float64 - >>> s.mask(s > 0) 0 0.0 1 NaN @@ -8956,6 +9222,13 @@ def where( 3 3 4 4 dtype: int64 + >>> s.mask(s > 1, 10) + 0 0 + 1 1 + 2 10 + 3 10 + 4 10 + dtype: int64 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) >>> df @@ -8993,6 +9266,7 @@ def where( cond, other, inplace, axis, level, errors=errors, try_cast=try_cast ) + @final @doc( where, klass=_shared_doc_kwargs["klass"], @@ -9054,7 +9328,7 @@ def shift( extend the index when shifting and preserve the original data. 
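The GH#33671 hunks in `_align_frame` and `_align_series` above copy `left`/`right` before reassigning `index`, so aligning two tz-aware objects no longer mutates the inputs. A sketch of the post-fix behavior (the exact joined timezone is an assumption; pandas converts unions of mixed-tz indexes to UTC):

```python
import pandas as pd

left = pd.Series([1, 2], index=pd.date_range("2020-01-01", periods=2, tz="US/Eastern"))
right = pd.Series([3, 4], index=pd.date_range("2020-01-01", periods=2, tz="UTC"))

l_aligned, r_aligned = left.align(right)

# After the fix, only the aligned copies carry the joined (UTC-based) index;
# the original objects keep their own timezones.
print(left.index.tz)       # US/Eastern, unchanged
print(l_aligned.index.tz)  # UTC
```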
If `freq` is specified as "infer" then it will be inferred from the freq or inferred_freq attributes of the index. If neither of - those attributes exist, a ValueError is thrown + those attributes exist, a ValueError is thrown. axis : {{0 or 'index', 1 or 'columns', None}}, default None Shift direction. fill_value : object, optional @@ -9103,11 +9377,11 @@ def shift( >>> df.shift(periods=1, axis="columns") Col1 Col2 Col3 - 2020-01-01 NaN 10.0 13.0 - 2020-01-02 NaN 20.0 23.0 - 2020-01-03 NaN 15.0 18.0 - 2020-01-04 NaN 30.0 33.0 - 2020-01-05 NaN 45.0 48.0 + 2020-01-01 NaN 10 13 + 2020-01-02 NaN 20 23 + 2020-01-03 NaN 15 18 + 2020-01-04 NaN 30 33 + 2020-01-05 NaN 45 48 >>> df.shift(periods=3, fill_value=0) Col1 Col2 Col3 @@ -9175,13 +9449,17 @@ def shift( result = self.set_axis(new_ax, axis) return result.__finalize__(self, method="shift") + @final def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ Equivalent to `shift` without copying data. - The shifted data will not include the dropped periods and the shifted axis will be smaller than the original. + .. deprecated:: 1.2.0 + slice_shift is deprecated, + use DataFrame/Series.shift instead. + Parameters ---------- periods : int @@ -9196,6 +9474,14 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: While the `slice_shift` is faster than `shift`, you may pay for it later during alignment. """ + + msg = ( + "The 'slice_shift' method is deprecated " + "and will be removed in a future version. " + "You can use DataFrame/Series.shift instead" + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + if periods == 0: return self @@ -9212,6 +9498,7 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: return new_obj.__finalize__(self, method="slice_shift") + @final def tshift( self: FrameOrSeries, periods: int = 1, freq=None, axis: Axis = 0 ) -> FrameOrSeries: @@ -9387,7 +9674,13 @@ def truncate( # if we have a date index, convert to dates, otherwise # treat like a slice - if ax.is_all_dates: + if ax._is_all_dates: + if is_object_dtype(ax.dtype): + warnings.warn( + "Treating object-dtype Index of date objects as DatetimeIndex " + "is deprecated, will be removed in a future version.", + FutureWarning, + ) from pandas.core.tools.datetimes import to_datetime before = to_datetime(before) @@ -9397,7 +9690,7 @@ def truncate( if before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN @@ -9412,6 +9705,7 @@ def truncate( return result + @final def tz_convert( self: FrameOrSeries, tz, axis=0, level=None, copy: bool_t = True ) -> FrameOrSeries: @@ -9469,6 +9763,7 @@ def _tz_convert(ax, tz): result = result.set_axis(ax, axis=axis, inplace=False) return result.__finalize__(self, method="tz_convert") + @final def tz_localize( self: FrameOrSeries, tz, @@ -9641,6 +9936,8 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): # ---------------------------------------------------------------------- # Numeric Methods + + @final def abs(self: FrameOrSeries) -> FrameOrSeries: """ Return a Series/DataFrame with absolute numeric value of each element. 
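The hunks above deprecate `slice_shift` in favor of `shift` (note the added `FutureWarning`). A quick sketch of the replacement pattern, assuming pandas >= 1.2:

```python
import pandas as pd

df = pd.DataFrame({"Col1": [10, 20, 15, 30, 45]})

# shift keeps the axis length, padding with NaN or an explicit fill_value.
print(df.shift(periods=2, fill_value=0))

# The deprecated slice_shift dropped the shifted-off periods instead;
# shift(2).dropna() produces an equivalent trimmed result for clean data.
print(df.shift(periods=2).dropna())
```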
@@ -9710,8 +10007,13 @@ def abs(self: FrameOrSeries) -> FrameOrSeries: """ return np.abs(self) + @final def describe( - self: FrameOrSeries, percentiles=None, include=None, exclude=None + self: FrameOrSeries, + percentiles=None, + include=None, + exclude=None, + datetime_is_numeric=False, ) -> FrameOrSeries: """ Generate descriptive statistics. @@ -9757,6 +10059,12 @@ def describe( ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. This affects statistics + calculated for the column. For DataFrame input, this also + controls whether datetime columns are included by default. + + .. versionadded:: 1.1.0 Returns ------- @@ -9834,7 +10142,7 @@ def describe( ... np.datetime64("2010-01-01"), ... np.datetime64("2010-01-01") ... ]) - >>> s.describe() + >>> s.describe(datetime_is_numeric=True) count 3 mean 2006-09-01 08:00:00 min 2000-01-01 00:00:00 @@ -9920,7 +10228,7 @@ def describe( categorical count 3 unique 3 - top f + top d freq 1 Excluding numeric columns from a ``DataFrame`` description. @@ -9973,7 +10281,7 @@ def describe( formatted_percentiles = format_percentiles(percentiles) - def describe_numeric_1d(series): + def describe_numeric_1d(series) -> "Series": stat_index = ( ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] ) @@ -9984,7 +10292,7 @@ def describe_numeric_1d(series): ) return pd.Series(d, index=stat_index, name=series.name) - def describe_categorical_1d(data): + def describe_categorical_1d(data) -> "Series": names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) @@ -9992,8 +10300,37 @@ def describe_categorical_1d(data): dtype = None if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - names += ["top", "freq"] - result += [top, freq] + if is_datetime64_any_dtype(data.dtype): + if self.ndim == 1: + stacklevel = 4 + else: + stacklevel = 5 + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. 
Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=stacklevel, + ) + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + else: + names += ["top", "freq"] + result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency @@ -10004,7 +10341,7 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name, dtype=dtype) - def describe_timestamp_1d(data): + def describe_timestamp_1d(data) -> "Series": # GH-30164 stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] d = ( @@ -10014,12 +10351,12 @@ def describe_timestamp_1d(data): ) return pd.Series(d, index=stat_index, name=data.name) - def describe_1d(data): + def describe_1d(data) -> "Series": if is_bool_dtype(data.dtype): return describe_categorical_1d(data) elif is_numeric_dtype(data): return describe_numeric_1d(data) - elif is_datetime64_any_dtype(data.dtype): + elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: return describe_timestamp_1d(data) elif is_timedelta64_dtype(data.dtype): return describe_numeric_1d(data) @@ -10027,10 +10364,15 @@ def describe_1d(data): return describe_categorical_1d(data) if self.ndim == 1: - return describe_1d(self) + # Incompatible return value type + # (got "Series", expected "FrameOrSeries") [return-value] + return describe_1d(self) # type:ignore[return-value] elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics - data = self.select_dtypes(include=[np.number]) + default_include = [np.number] + if datetime_is_numeric: + default_include.append("datetime") + data = self.select_dtypes(include=default_include) if len(data.columns) == 0: data = self elif include == "all": @@ -10054,6 +10396,7 @@ def describe_1d(data): d.columns = data.columns.copy() return d + @final def pct_change( self: FrameOrSeries, periods=1, @@ -10192,6 +10535,7 @@ def pct_change( rs = rs.reindex_like(data) return rs + @final def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): if axis is None: raise ValueError("Must specify 'axis' when aggregating by level.") @@ -10203,290 +10547,674 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) return grouped.aggregate(applyf) - @classmethod - def _add_numeric_operations(cls): - """ - Add the operations to the cls; evaluate the doc strings again - """ - axis_descr, name1, name2 = _doc_parms(cls) + @final + def _logical_func( + self, name: str, func, axis=0, bool_only=None, skipna=True, level=None, **kwargs + ): + nv.validate_logical_func((), kwargs, fname=name) + if level is not None: + if bool_only is not None: + raise NotImplementedError( + "Option bool_only is not implemented with option level." 
+ ) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - cls.any = _make_logical_function( - cls, - "any", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc=_any_desc, - func=nanops.nanany, - see_also=_any_see_also, - examples=_any_examples, - empty_value=False, + if self.ndim > 1 and axis is None: + # Reduce along one dimension then the other, to simplify DataFrame._reduce + res = self._logical_func( + name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs + ) + return res._logical_func(name, func, skipna=skipna, **kwargs) + + return self._reduce( + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=bool_only, + filter_type="bool", ) - cls.all = _make_logical_function( - cls, - "all", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc=_all_desc, - func=nanops.nanall, - see_also=_all_see_also, - examples=_all_examples, - empty_value=True, + + def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + return self._logical_func( + "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs ) - @doc( - desc="Return the mean absolute deviation of the values " - "for the requested axis.", - name1=name1, - name2=name2, - axis_descr=axis_descr, - see_also="", - examples="", + def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + return self._logical_func( + "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs ) - def mad(self, axis=None, skipna=None, level=None): - """ - {desc} - - Parameters - ---------- - axis : {axis_descr} - Axis for the function to be applied on. - skipna : bool, default None - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. - - Returns - ------- - {name1} or {name2} (if level specified)\ - {see_also}\ - {examples} - """ - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) - data = self._get_numeric_data() - if axis == 0: - demeaned = data - data.mean(axis=0) - else: - demeaned = data.sub(data.mean(axis=1), axis=0) - return np.abs(demeaned).mean(axis=axis, skipna=skipna) + @final + def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): + skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) + if axis is None: + axis = self._stat_axis_number + else: + axis = self._get_axis_number(axis) - cls.mad = mad + if axis == 1: + return self.T._accum_func( + name, func, axis=0, skipna=skipna, *args, **kwargs + ).T - cls.sem = _make_stat_function_ddof( - cls, - "sem", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="Return unbiased standard error of the mean over requested " - "axis.\n\nNormalized by N-1 by default. 
This can be changed " - "using the ddof argument", - func=nanops.nansem, + def block_accum_func(blk_values): + values = blk_values.T if hasattr(blk_values, "T") else blk_values + + result = nanops.na_accum_func(values, func, skipna=skipna) + + result = result.T if hasattr(result, "T") else result + return result + + result = self._mgr.apply(block_accum_func) + + return self._constructor(result).__finalize__(self, method=name) + + def cummax(self, axis=None, skipna=True, *args, **kwargs): + return self._accum_func( + "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs ) - cls.var = _make_stat_function_ddof( - cls, - "var", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="Return unbiased variance over requested axis.\n\nNormalized by " - "N-1 by default. This can be changed using the ddof argument", - func=nanops.nanvar, + + def cummin(self, axis=None, skipna=True, *args, **kwargs): + return self._accum_func( + "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs ) - cls.std = _make_stat_function_ddof( - cls, - "std", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="Return sample standard deviation over requested axis." - "\n\nNormalized by N-1 by default. This can be changed using the " - "ddof argument", - func=nanops.nanstd, + + def cumsum(self, axis=None, skipna=True, *args, **kwargs): + return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs) + + def cumprod(self, axis=None, skipna=True, *args, **kwargs): + return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs) + + @final + def _stat_function_ddof( + self, + name: str, + func, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + nv.validate_stat_ddof_func((), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, ddof=ddof + ) + return self._reduce( + func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof ) - cls.cummin = _make_cum_function( - cls, - "cummin", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="minimum", - accum_func=np.minimum.accumulate, - accum_func_name="min", - examples=_cummin_examples, + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + return self._stat_function_ddof( + "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs ) - cls.cumsum = _make_cum_function( - cls, - "cumsum", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="sum", - accum_func=np.cumsum, - accum_func_name="sum", - examples=_cumsum_examples, + + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + return self._stat_function_ddof( + "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs ) - cls.cumprod = _make_cum_function( - cls, - "cumprod", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="product", - accum_func=np.cumprod, - accum_func_name="prod", - examples=_cumprod_examples, + + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + return self._stat_function_ddof( + "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs ) - cls.cummax = _make_cum_function( - cls, - "cummax", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="maximum", - accum_func=np.maximum.accumulate, - accum_func_name="max", - 
examples=_cummax_examples, + + @final + def _stat_function( + self, + name: str, + func, + axis=None, + skipna=None, + level=None, + numeric_only=None, + **kwargs, + ): + if name == "median": + nv.validate_median((), kwargs) + else: + nv.validate_stat_func((), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only ) - cls.sum = _make_min_count_stat_function( - cls, - "sum", - name1=name1, - name2=name2, - axis_descr=axis_descr, - desc="Return the sum of the values for the requested axis.\n\n" - "This is equivalent to the method ``numpy.sum``.", - func=nanops.nansum, - see_also=_stat_func_see_also, - examples=_sum_examples, + def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._stat_function( + "min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs ) - cls.mean = _make_stat_function( - cls, - "mean", - name1=name1, + + def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._stat_function( + "max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs + ) + + def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._stat_function( + "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs + ) + + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._stat_function( + "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs + ) + + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._stat_function( + "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs + ) + + def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._stat_function( + "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs + ) + + kurtosis = kurt + + @final + def _min_count_stat_function( + self, + name: str, + func, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + if name == "sum": + nv.validate_sum((), kwargs) + elif name == "prod": + nv.validate_prod((), kwargs) + else: + nv.validate_stat_func((), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, min_count=min_count + ) + return self._reduce( + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + + def sum( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + return self._min_count_stat_function( + "sum", nanops.nansum, axis, skipna, level, numeric_only, min_count, **kwargs + ) + + def prod( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + return self._min_count_stat_function( + "prod", + nanops.nanprod, + axis, + skipna, + level, + numeric_only, + min_count, + **kwargs, + ) + + product = prod + + def mad(self, axis=None, skipna=None, level=None): + """ + {desc} + + Parameters + ---------- + axis : {axis_descr} + Axis for the function to be applied on. + skipna : bool, default None + Exclude NA/null values when computing the result. 
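`sum` and `prod` above now route through the shared `_min_count_stat_function`, preserving the `min_count` semantics described later in `_min_count_stub`: the empty or all-NA sum is 0 and the product is 1, unless `min_count` demands valid values. A short check:

```python
import numpy as np
import pandas as pd

s = pd.Series([np.nan, np.nan])

print(s.sum())              # 0.0 -- all-NA sum defaults to 0 (min_count=0)
print(s.prod())             # 1.0 -- all-NA product defaults to 1
print(s.sum(min_count=1))   # nan -- fewer than min_count valid values present
print(s.prod(min_count=1))  # nan
```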
+ level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a {name1}. + + Returns + ------- + {name1} or {name2} (if level specified)\ + {see_also}\ + {examples} + """ + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) + + data = self._get_numeric_data() + if axis == 0: + demeaned = data - data.mean(axis=0) + else: + demeaned = data.sub(data.mean(axis=1), axis=0) + return np.abs(demeaned).mean(axis=axis, skipna=skipna) + + @classmethod + def _add_numeric_operations(cls): + """ + Add the operations to the cls; evaluate the doc strings again + """ + axis_descr, name1, name2 = _doc_parms(cls) + + @doc( + _bool_doc, + desc=_any_desc, + name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return the mean of the values for the requested axis.", - func=nanops.nanmean, + see_also=_any_see_also, + examples=_any_examples, + empty_value=False, ) - cls.skew = _make_stat_function( - cls, - "skew", + def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + return NDFrame.any(self, axis, bool_only, skipna, level, **kwargs) + + # pandas\core\generic.py:10725: error: Cannot assign to a method + # [assignment] + cls.any = any # type: ignore[assignment] + + @doc( + _bool_doc, + desc=_all_desc, name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.", - func=nanops.nanskew, + see_also=_all_see_also, + examples=_all_examples, + empty_value=True, ) - cls.kurt = _make_stat_function( - cls, - "kurt", + def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + return NDFrame.all(self, axis, bool_only, skipna, level, **kwargs) + + # pandas\core\generic.py:10719: error: Cannot assign to a method + # [assignment] + + # pandas\core\generic.py:10719: error: Incompatible types in assignment + # (expression has type "Callable[[Iterable[object]], bool]", variable + # has type "Callable[[NDFrame, Any, Any, Any, Any, KwArg(Any)], Any]") + # [assignment] + cls.all = all # type: ignore[assignment] + + @doc( + NDFrame.mad, + desc="Return the mean absolute deviation of the values " + "over the requested axis.", name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return unbiased kurtosis over requested axis.\n\n" - "Kurtosis obtained using Fisher's definition of\n" - "kurtosis (kurtosis of normal == 0.0). Normalized " - "by N-1.", - func=nanops.nankurt, + see_also="", + examples="", ) - cls.kurtosis = cls.kurt - cls.prod = _make_min_count_stat_function( - cls, - "prod", + def mad(self, axis=None, skipna=None, level=None): + return NDFrame.mad(self, axis, skipna, level) + + # pandas\core\generic.py:10736: error: Cannot assign to a method + # [assignment] + cls.mad = mad # type: ignore[assignment] + + @doc( + _num_ddof_doc, + desc="Return unbiased standard error of the mean over requested " + "axis.\n\nNormalized by N-1 by default. 
This can be changed " + "using the ddof argument", + name1=name1, + name2=name2, + axis_descr=axis_descr, + ) + def sem( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs) + + # pandas\core\generic.py:10758: error: Cannot assign to a method + # [assignment] + cls.sem = sem # type: ignore[assignment] + + @doc( + _num_ddof_doc, + desc="Return unbiased variance over requested axis.\n\nNormalized by " + "N-1 by default. This can be changed using the ddof argument", + name1=name1, + name2=name2, + axis_descr=axis_descr, + ) + def var( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs) + + # pandas\core\generic.py:10779: error: Cannot assign to a method + # [assignment] + cls.var = var # type: ignore[assignment] + + @doc( + _num_ddof_doc, + desc="Return sample standard deviation over requested axis." + "\n\nNormalized by N-1 by default. This can be changed using the " + "ddof argument", + name1=name1, + name2=name2, + axis_descr=axis_descr, + ) + def std( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs) + + # pandas\core\generic.py:10801: error: Cannot assign to a method + # [assignment] + cls.std = std # type: ignore[assignment] + + @doc( + _cnum_doc, + desc="minimum", + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name="min", + examples=_cummin_examples, + ) + def cummin(self, axis=None, skipna=True, *args, **kwargs): + return NDFrame.cummin(self, axis, skipna, *args, **kwargs) + + # pandas\core\generic.py:10815: error: Cannot assign to a method + # [assignment] + cls.cummin = cummin # type: ignore[assignment] + + @doc( + _cnum_doc, + desc="maximum", + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name="max", + examples=_cummax_examples, + ) + def cummax(self, axis=None, skipna=True, *args, **kwargs): + return NDFrame.cummax(self, axis, skipna, *args, **kwargs) + + # pandas\core\generic.py:10829: error: Cannot assign to a method + # [assignment] + cls.cummax = cummax # type: ignore[assignment] + + @doc( + _cnum_doc, + desc="sum", + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name="sum", + examples=_cumsum_examples, + ) + def cumsum(self, axis=None, skipna=True, *args, **kwargs): + return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) + + # pandas\core\generic.py:10843: error: Cannot assign to a method + # [assignment] + cls.cumsum = cumsum # type: ignore[assignment] + + @doc( + _cnum_doc, + desc="product", name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return the product of the values for the requested axis.", - func=nanops.nanprod, + accum_func_name="prod", + examples=_cumprod_examples, + ) + def cumprod(self, axis=None, skipna=True, *args, **kwargs): + return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) + + # pandas\core\generic.py:10857: error: Cannot assign to a method + # [assignment] + cls.cumprod = cumprod # type: ignore[assignment] + + @doc( + _num_doc, + desc="Return the sum of the values over the requested axis.\n\n" + "This is equivalent to the method ``numpy.sum``.", + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count=_min_count_stub, + see_also=_stat_func_see_also, + examples=_sum_examples, + ) + def sum( + 
self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + return NDFrame.sum( + self, axis, skipna, level, numeric_only, min_count, **kwargs + ) + + # pandas\core\generic.py:10883: error: Cannot assign to a method + # [assignment] + cls.sum = sum # type: ignore[assignment] + + @doc( + _num_doc, + desc="Return the product of the values over the requested axis.", + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count=_min_count_stub, + see_also=_stat_func_see_also, examples=_prod_examples, ) - cls.product = cls.prod - cls.median = _make_stat_function( - cls, - "median", + def prod( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + return NDFrame.prod( + self, axis, skipna, level, numeric_only, min_count, **kwargs + ) + + # pandas\core\generic.py:10908: error: Cannot assign to a method + # [assignment] + cls.prod = prod # type: ignore[assignment] + cls.product = prod + + @doc( + _num_doc, + desc="Return the mean of the values over the requested axis.", name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return the median of the values for the requested axis.", - func=nanops.nanmedian, + min_count="", + see_also="", + examples="", + ) + def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs) + + # pandas\core\generic.py:10924: error: Cannot assign to a method + # [assignment] + cls.mean = mean # type: ignore[assignment] + + @doc( + _num_doc, + desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.", + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", + ) + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs) + + # pandas\core\generic.py:10939: error: Cannot assign to a method + # [assignment] + cls.skew = skew # type: ignore[assignment] + + @doc( + _num_doc, + desc="Return unbiased kurtosis over requested axis.\n\n" + "Kurtosis obtained using Fisher's definition of\n" + "kurtosis (kurtosis of normal == 0.0). Normalized " + "by N-1.", + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also="", + examples="", ) - cls.max = _make_stat_function( - cls, - "max", + def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs) + + # pandas\core\generic.py:10957: error: Cannot assign to a method + # [assignment] + cls.kurt = kurt # type: ignore[assignment] + cls.kurtosis = kurt + + @doc( + _num_doc, + desc="Return the median of the values over the requested axis.", name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return the maximum of the values for the requested axis.\n\n" + min_count="", + see_also="", + examples="", + ) + def median( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs) + + # pandas\core\generic.py:10975: error: Cannot assign to a method + # [assignment] + cls.median = median # type: ignore[assignment] + + @doc( + _num_doc, + desc="Return the maximum of the values over the requested axis.\n\n" "If you want the *index* of the maximum, use ``idxmax``. 
This is" "the equivalent of the ``numpy.ndarray`` method ``argmax``.", - func=nanops.nanmax, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", see_also=_stat_func_see_also, examples=_max_examples, ) - cls.min = _make_stat_function( - cls, - "min", + def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs) + + # pandas\core\generic.py:10992: error: Cannot assign to a method + # [assignment] + cls.max = max # type: ignore[assignment] + + @doc( + _num_doc, + desc="Return the minimum of the values over the requested axis.\n\n" + "If you want the *index* of the minimum, use ``idxmin``. This is" + "the equivalent of the ``numpy.ndarray`` method ``argmin``.", name1=name1, name2=name2, axis_descr=axis_descr, - desc="Return the minimum of the values for the requested axis.\n\n" - "If you want the *index* of the minimum, use ``idxmin``. This is" - "the equivalent of the ``numpy.ndarray`` method ``argmin``.", - func=nanops.nanmin, + min_count="", see_also=_stat_func_see_also, examples=_min_examples, ) + def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs) - @classmethod - def _add_series_or_dataframe_operations(cls): - """ - Add the series or dataframe only operations to the cls; evaluate - the doc strings again. - """ - from pandas.core.window import ( - Expanding, - ExponentialMovingWindow, - Rolling, - Window, - ) + # pandas\core\generic.py:11009: error: Cannot assign to a method + # [assignment] + cls.min = min # type: ignore[assignment] - @doc(Rolling) - def rolling( - self, - window, - min_periods=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None, - ): - axis = self._get_axis_number(axis) - - if win_type is not None: - return Window( - self, - window=window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - ) + @final + @doc(Rolling) + def rolling( + self, + window: Union[int, timedelta, BaseOffset, BaseIndexer], + min_periods: Optional[int] = None, + center: bool_t = False, + win_type: Optional[str] = None, + on: Optional[str] = None, + axis: Axis = 0, + closed: Optional[str] = None, + ): + axis = self._get_axis_number(axis) - return Rolling( + if win_type is not None: + return Window( self, window=window, min_periods=min_periods, @@ -10497,121 +11225,129 @@ def rolling( closed=closed, ) - cls.rolling = rolling + return Rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) - @doc(Expanding) - def expanding(self, min_periods=1, center=False, axis=0): - axis = self._get_axis_number(axis) - return Expanding(self, min_periods=min_periods, center=center, axis=axis) + @final + @doc(Expanding) + def expanding( + self, min_periods: int = 1, center: Optional[bool_t] = None, axis: Axis = 0 + ) -> Expanding: + axis = self._get_axis_number(axis) + if center is not None: + warnings.warn( + "The `center` argument on `expanding` will be removed in the future", + FutureWarning, + stacklevel=2, + ) + else: + center = False - cls.expanding = expanding + return Expanding(self, min_periods=min_periods, center=center, axis=axis) - @doc(ExponentialMovingWindow) - def ewm( + @final + @doc(ExponentialMovingWindow) + def ewm( + self, + com: Optional[float] = None, + span: Optional[float] = None, + halflife: Optional[Union[float, 
TimedeltaConvertibleTypes]] = None, + alpha: Optional[float] = None, + min_periods: int = 0, + adjust: bool_t = True, + ignore_na: bool_t = False, + axis: Axis = 0, + times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, + ) -> ExponentialMovingWindow: + axis = self._get_axis_number(axis) + return ExponentialMovingWindow( self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - times=None, - ): - axis = self._get_axis_number(axis) - return ExponentialMovingWindow( - self, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - ) + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + ) - cls.ewm = ewm + # ---------------------------------------------------------------------- + # Arithmetic Methods - @doc(klass=_shared_doc_kwargs["klass"], axis="") - def transform(self, func, *args, **kwargs): + @final + def _inplace_method(self, other, op): + """ + Wrap arithmetic method to operate inplace. """ - Call ``func`` on self producing a {klass} with transformed values. + result = op(self, other) - Produced {klass} will have same axis length as self. + if ( + self.ndim == 1 + and result._indexed_same(self) + and is_dtype_equal(result.dtype, self.dtype) + ): + # GH#36498 this inplace op can _actually_ be inplace. + self._values[:] = result._values + return self - Parameters - ---------- - func : function, str, list or dict - Function to use for transforming the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` - - dict of axis labels -> functions, function names or list of such. - {axis} - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. + # Delete cacher + self._reset_cacher() - Returns - ------- - {klass} - A {klass} that must have the same length as self. + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace( + result.reindex_like(self, copy=False), verify_is_copy=False + ) + return self - Raises - ------ - ValueError : If the returned {klass} has a different length than self. + def __iadd__(self, other): + return self._inplace_method(other, type(self).__add__) # type: ignore[operator] - See Also - -------- - {klass}.agg : Only perform aggregating type operations. - {klass}.apply : Invoke function on a {klass}. 
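The rewritten `expanding` above makes `center` an `Optional[bool]` and warns when it is passed at all. A sketch of the surviving usage and the new warning, assuming pandas >= 1.2:

```python
import warnings

import pandas as pd

s = pd.Series([1, 2, 3, 4])

# Plain expanding windows are unchanged.
print(s.expanding(min_periods=2).sum())  # NaN, 3.0, 6.0, 10.0

# Passing center= (even False) now triggers the FutureWarning added above.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    s.expanding(min_periods=2, center=False).sum()
print(caught[0].category)  # <class 'FutureWarning'>
```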
+ def __isub__(self, other): + return self._inplace_method(other, type(self).__sub__) # type: ignore[operator] - Examples - -------- - >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) - >>> df - A B - 0 0 1 - 1 1 2 - 2 2 3 - >>> df.transform(lambda x: x + 1) - A B - 0 1 2 - 1 2 3 - 2 3 4 + def __imul__(self, other): + return self._inplace_method(other, type(self).__mul__) # type: ignore[operator] - Even though the resulting {klass} must have the same length as the - input {klass}, it is possible to provide several input functions: + def __itruediv__(self, other): + return self._inplace_method( + other, type(self).__truediv__ # type: ignore[operator] + ) - >>> s = pd.Series(range(3)) - >>> s - 0 0 - 1 1 - 2 2 - dtype: int64 - >>> s.transform([np.sqrt, np.exp]) - sqrt exp - 0 0.000000 1.000000 - 1 1.000000 2.718282 - 2 1.414214 7.389056 - """ - result = self.agg(func, *args, **kwargs) - if is_scalar(result) or len(result) != len(self): - raise ValueError("transforms cannot produce aggregated results") + def __ifloordiv__(self, other): + return self._inplace_method( + other, type(self).__floordiv__ # type: ignore[operator] + ) - return result + def __imod__(self, other): + return self._inplace_method(other, type(self).__mod__) # type: ignore[operator] + + def __ipow__(self, other): + return self._inplace_method(other, type(self).__pow__) # type: ignore[operator] + + def __iand__(self, other): + return self._inplace_method(other, type(self).__and__) # type: ignore[operator] + + def __ior__(self, other): + return self._inplace_method(other, type(self).__or__) # type: ignore[operator] + + def __ixor__(self, other): + return self._inplace_method(other, type(self).__xor__) # type: ignore[operator] # ---------------------------------------------------------------------- # Misc methods + @final def _find_valid_index(self, how: str): """ Retrieves the index of the first valid value. @@ -10630,6 +11366,7 @@ def _find_valid_index(self, how: str): return None return self.index[idxpos] + @final @doc(position="first", klass=_shared_doc_kwargs["klass"]) def first_valid_index(self): """ @@ -10646,6 +11383,7 @@ def first_valid_index(self): """ return self._find_valid_index("first") + @final @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"]) def last_valid_index(self): return self._find_valid_index("last") @@ -10662,43 +11400,43 @@ def _doc_parms(cls): _num_doc = """ -%(desc)s +{desc} Parameters ---------- -axis : %(axis_descr)s +axis : {axis_descr} Axis for the function to be applied on. skipna : bool, default True Exclude NA/null values when computing the result. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. + particular level, collapsing into a {name1}. numeric_only : bool, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. -%(min_count)s\ +{min_count}\ **kwargs Additional keyword arguments to be passed to the function. Returns ------- -%(name1)s or %(name2)s (if level specified)\ -%(see_also)s\ -%(examples)s +{name1} or {name2} (if level specified)\ +{see_also}\ +{examples} """ _num_ddof_doc = """ -%(desc)s +{desc} Parameters ---------- -axis : %(axis_descr)s +axis : {axis_descr} skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. 
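The new `_inplace_method` and the `__iadd__` family above make augmented assignment genuinely in-place when the result keeps the same index and dtype (GH#36498): the diff writes `self._values[:] = result._values` instead of rebinding. A sketch of the observable effect, assuming pandas >= 1.2:

```python
import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0])
buf = ser.values  # a view on the underlying float64 ndarray

ser += 1          # index and dtype unchanged -> written into self._values

print(buf)        # [2. 3. 4.] -- the original buffer was mutated in place
```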
level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. + particular level, collapsing into a {name1}. ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. @@ -10708,14 +11446,19 @@ def _doc_parms(cls): Returns ------- -%(name1)s or %(name2)s (if level specified)\n""" +{name1} or {name2} (if level specified) + +Notes +----- +To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the +default `ddof=1`)\n""" _bool_doc = """ -%(desc)s +{desc} Parameters ---------- -axis : {0 or 'index', 1 or 'columns', None}, default 0 +axis : {{0 or 'index', 1 or 'columns', None}}, default 0 Indicate which axis or axes should be reduced. * 0 / 'index' : reduce the index, return a Series whose index is the @@ -10729,24 +11472,24 @@ def _doc_parms(cls): then use only boolean data. Not implemented for Series. skipna : bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is - True, then the result will be %(empty_value)s, as for an empty row/column. + True, then the result will be {empty_value}, as for an empty row/column. If skipna is False, then NA are treated as True, because these are not equal to zero. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. + particular level, collapsing into a {name1}. **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. Returns ------- -%(name1)s or %(name2)s - If level is specified, then, %(name2)s is returned; otherwise, %(name1)s +{name1} or {name2} + If level is specified, then, {name2} is returned; otherwise, {name1} is returned. -%(see_also)s -%(examples)s""" +{see_also} +{examples}""" _all_desc = """\ Return whether all elements are True, potentially over an axis. @@ -10809,14 +11552,14 @@ def _doc_parms(cls): """ _cnum_doc = """ -Return cumulative %(desc)s over a DataFrame or Series axis. +Return cumulative {desc} over a DataFrame or Series axis. Returns a DataFrame or Series of the same size containing the cumulative -%(desc)s. +{desc}. Parameters ---------- -axis : {0 or 'index', 1 or 'columns'}, default 0 +axis : {{0 or 'index', 1 or 'columns'}}, default 0 The index or the name of the axis. 0 is equivalent to None or 'index'. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result @@ -10827,21 +11570,21 @@ def _doc_parms(cls): Returns ------- -%(name1)s or %(name2)s - Return cumulative %(desc)s of %(name1)s or %(name2)s. +{name1} or {name2} + Return cumulative {desc} of {name1} or {name2}. See Also -------- -core.window.Expanding.%(accum_func_name)s : Similar functionality +core.window.Expanding.{accum_func_name} : Similar functionality but ignores ``NaN`` values. -%(name2)s.%(accum_func_name)s : Return the %(desc)s over - %(name2)s axis. -%(name2)s.cummax : Return cumulative maximum over %(name2)s axis. -%(name2)s.cummin : Return cumulative minimum over %(name2)s axis. -%(name2)s.cumsum : Return cumulative sum over %(name2)s axis. -%(name2)s.cumprod : Return cumulative product over %(name2)s axis. +{name2}.{accum_func_name} : Return the {desc} over + {name2} axis. +{name2}.cummax : Return cumulative maximum over {name2} axis. +{name2}.cummin : Return cumulative minimum over {name2} axis. +{name2}.cumsum : Return cumulative sum over {name2} axis. 
+{name2}.cumprod : Return cumulative product over {name2} axis. -%(examples)s""" +{examples}""" _cummin_examples = """\ Examples @@ -11116,7 +11859,7 @@ def _doc_parms(cls): _any_desc = """\ Return whether any element is True, potentially over an axis. -Returns False unless there at least one element within a series or +Returns False unless there is at least one element within a series or along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty).""" @@ -11302,218 +12045,4 @@ def _doc_parms(cls): min_count : int, default 0 The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. - - .. versionadded:: 0.22.0 - - Added with the default being 0. This means the sum of an all-NA - or empty Series is 0, and the product of an all-NA or empty - Series is 1. """ - - -def _make_min_count_stat_function( - cls, - name: str, - name1: str, - name2: str, - axis_descr: str, - desc: str, - func: Callable, - see_also: str = "", - examples: str = "", -) -> Callable: - @Substitution( - desc=desc, - name1=name1, - name2=name2, - axis_descr=axis_descr, - min_count=_min_count_stub, - see_also=see_also, - examples=examples, - ) - @Appender(_num_doc) - def stat_func( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - if name == "sum": - nv.validate_sum(tuple(), kwargs) - elif name == "prod": - nv.validate_prod(tuple(), kwargs) - else: - nv.validate_stat_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level( - name, axis=axis, level=level, skipna=skipna, min_count=min_count - ) - return self._reduce( - func, - name=name, - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - ) - - return set_function_name(stat_func, name, cls) - - -def _make_stat_function( - cls, - name: str, - name1: str, - name2: str, - axis_descr: str, - desc: str, - func: Callable, - see_also: str = "", - examples: str = "", -) -> Callable: - @Substitution( - desc=desc, - name1=name1, - name2=name2, - axis_descr=axis_descr, - min_count="", - see_also=see_also, - examples=examples, - ) - @Appender(_num_doc) - def stat_func( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - if name == "median": - nv.validate_median(tuple(), kwargs) - else: - nv.validate_stat_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce( - func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only - ) - - return set_function_name(stat_func, name, cls) - - -def _make_stat_function_ddof( - cls, name: str, name1: str, name2: str, axis_descr: str, desc: str, func: Callable -) -> Callable: - @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) - @Appender(_num_ddof_doc) - def stat_func( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level( - name, axis=axis, level=level, skipna=skipna, ddof=ddof - ) - return self._reduce( - func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof - ) - - 
return set_function_name(stat_func, name, cls) - - -def _make_cum_function( - cls, - name: str, - name1: str, - name2: str, - axis_descr: str, - desc: str, - accum_func: Callable, - accum_func_name: str, - examples: str, -) -> Callable: - @Substitution( - desc=desc, - name1=name1, - name2=name2, - axis_descr=axis_descr, - accum_func_name=accum_func_name, - examples=examples, - ) - @Appender(_cnum_doc) - def cum_func(self, axis=None, skipna=True, *args, **kwargs): - skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) - if axis is None: - axis = self._stat_axis_number - else: - axis = self._get_axis_number(axis) - - if axis == 1: - return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T - - def block_accum_func(blk_values): - values = blk_values.T if hasattr(blk_values, "T") else blk_values - - result = nanops.na_accum_func(values, accum_func, skipna=skipna) - - result = result.T if hasattr(result, "T") else result - return result - - result = self._mgr.apply(block_accum_func) - - return self._constructor(result).__finalize__(self, method=name) - - return set_function_name(cum_func, name, cls) - - -def _make_logical_function( - cls, - name: str, - name1: str, - name2: str, - axis_descr: str, - desc: str, - func: Callable, - see_also: str, - examples: str, - empty_value: bool, -) -> Callable: - @Substitution( - desc=desc, - name1=name1, - name2=name2, - axis_descr=axis_descr, - see_also=see_also, - examples=examples, - empty_value=empty_value, - ) - @Appender(_bool_doc) - def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - nv.validate_logical_func(tuple(), kwargs, fname=name) - if level is not None: - if bool_only is not None: - raise NotImplementedError( - "Option bool_only is not implemented with option level." - ) - return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce( - func, - name=name, - axis=axis, - skipna=skipna, - numeric_only=bool_only, - filter_type="bool", - ) - - return set_function_name(logical_func, name, cls) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index e71b2f94c8014..99426c55da29b 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -4,17 +4,41 @@ SeriesGroupBy and the DataFrameGroupBy objects. """ import collections +from typing import List + +from pandas._typing import final from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.base import PandasObject + OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) -class GroupByMixin: +class ShallowMixin(PandasObject): + _attributes: List[str] = [] + + @final + def _shallow_copy(self, obj, **kwargs): + """ + return a new object with the replacement attributes + """ + if isinstance(obj, self._constructor): + obj = obj.obj + for attr in self._attributes: + if attr not in kwargs: + kwargs[attr] = getattr(self, attr) + return self._constructor(obj, **kwargs) + + +class GotItemMixin(PandasObject): """ Provide the groupby facilities to the mixed object. """ + _attributes: List[str] + + @final def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. 
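
The ShallowMixin added above centralizes attribute-forwarding copies behind a single @final method. A minimal standalone sketch of the same pattern follows; the Windowy class and its window/min_periods attributes are illustrative stand-ins, not pandas internals, and the obj-unwrapping branch of the real method is omitted:

    # Sketch of the _shallow_copy pattern from ShallowMixin (simplified).
    from typing import List


    class Windowy:
        # Hypothetical attribute list, mirroring ShallowMixin._attributes.
        _attributes: List[str] = ["window", "min_periods"]

        def __init__(self, obj, window=None, min_periods=None):
            self.obj = obj
            self.window = window
            self.min_periods = min_periods

        @property
        def _constructor(self):
            return type(self)

        def _shallow_copy(self, obj, **kwargs):
            # Any attribute not explicitly overridden is carried over from
            # self, so the copy behaves like the original object.
            for attr in self._attributes:
                if attr not in kwargs:
                    kwargs[attr] = getattr(self, attr)
            return self._constructor(obj, **kwargs)


    w = Windowy([1, 2, 3], window=2, min_periods=1)
    w2 = w._shallow_copy([4, 5, 6], window=3)
    assert (w2.window, w2.min_periods) == (3, 1)  # min_periods inherited
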
@@ -22,14 +46,16 @@ def _gotitem(self, key, ndim, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on """ # create a new object to prevent aliasing if subset is None: - subset = self.obj + # pandas\core\groupby\base.py:52: error: "GotItemMixin" has no + # attribute "obj" [attr-defined] + subset = self.obj # type: ignore[attr-defined] # we need to make a shallow copy of ourselves # with the same groupby @@ -37,15 +63,28 @@ def _gotitem(self, key, ndim, subset=None): # Try to select from a DataFrame, falling back to a Series try: - groupby = self._groupby[key] + # pandas\core\groupby\base.py:60: error: "GotItemMixin" has no + # attribute "_groupby" [attr-defined] + groupby = self._groupby[key] # type: ignore[attr-defined] except IndexError: - groupby = self._groupby + # pandas\core\groupby\base.py:62: error: "GotItemMixin" has no + # attribute "_groupby" [attr-defined] + groupby = self._groupby # type: ignore[attr-defined] + + # pandas\core\groupby\base.py:64: error: Too many arguments for + # "GotItemMixin" [call-arg] - self = type(self)(subset, groupby=groupby, parent=self, **kwargs) + # pandas\core\groupby\base.py:64: error: Unexpected keyword argument + # "groupby" for "GotItemMixin" [call-arg] + + # pandas\core\groupby\base.py:64: error: Unexpected keyword argument + # "parent" for "GotItemMixin" [call-arg] + self = type(self)( + subset, groupby=groupby, parent=self, **kwargs # type: ignore[call-arg] + ) self._reset_cache() - if subset.ndim == 2: - if is_scalar(key) and key in subset or is_list_like(key): - self._selection = key + if subset.ndim == 2 and (is_scalar(key) and key in subset or is_list_like(key)): + self._selection = key return self @@ -73,15 +112,8 @@ def _gotitem(self, key, ndim, subset=None): ) series_apply_allowlist = ( - ( - common_apply_allowlist - | { - "nlargest", - "nsmallest", - "is_monotonic_increasing", - "is_monotonic_decreasing", - } - ) + common_apply_allowlist + | {"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"} ) | frozenset(["dtype", "unique"]) dataframe_apply_allowlist = common_apply_allowlist | frozenset(["dtypes", "corrwith"]) @@ -164,6 +196,7 @@ def _gotitem(self, key, ndim, subset=None): "describe", "dtypes", "expanding", + "ewm", "filter", "get_group", "groups", diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index db734bb2f0c07..64037f5757a38 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numpy as np from pandas.core.algorithms import unique1d @@ -6,9 +8,12 @@ CategoricalDtype, recode_for_categories, ) +from pandas.core.indexes.api import CategoricalIndex -def recode_for_groupby(c: Categorical, sort: bool, observed: bool): +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> Tuple[Categorical, Optional[Categorical]]: """ Code the categories to ensure we can groupby for categoricals. 
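
The typed recode_for_groupby above trims unobserved categories when observed=True, which the next hunk notes is equivalent to remove_unused_categories for an ordered categorical. A small illustration using only public API; the internal function is not called directly here, and the fill value for unobserved groups (NaN vs. 0) varies by pandas version and aggregation:

    import pandas as pd

    cat = pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])
    df = pd.DataFrame({"key": cat, "val": [1, 2, 3]})

    # observed=False keeps the unobserved category "c" as a group;
    # observed=True drops it, like the recode shown in this diff.
    print(df.groupby("key", observed=False)["val"].sum())
    print(df.groupby("key", observed=True)["val"].sum())

    # For an ordered categorical the observed=True recode matches:
    trimmed = cat.as_ordered().remove_unused_categories()
    print(trimmed.categories)  # Index(['a', 'b'], dtype='object')
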
@@ -43,6 +48,9 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): """ # we only care about observed values if observed: + # In cases with c.ordered, this is equivalent to + # return c.remove_unused_categories(), c + unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] @@ -73,7 +81,9 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c: Categorical, sort: bool, ci): +def recode_from_groupby( + c: Categorical, sort: bool, ci: CategoricalIndex +) -> CategoricalIndex: """ Reverse the codes_to_groupby to account for sort / observed. @@ -91,7 +101,10 @@ def recode_from_groupby(c: Categorical, sort: bool, ci): """ # we re-order to the original category orderings if sort: - return ci.set_categories(c.categories) + # error: "CategoricalIndex" has no attribute "set_categories" + return ci.set_categories(c.categories) # type: ignore[attr-defined] # we are not sorting, so add unobserved to the end - return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) + new_cats = c.categories[~c.categories.isin(ci.categories)] + # error: "CategoricalIndex" has no attribute "add_categories" + return ci.add_categories(new_cats) # type: ignore[attr-defined] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 093e1d4ab3942..07ffb881495fa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -9,7 +9,6 @@ import copy from functools import partial from textwrap import dedent -import typing from typing import ( TYPE_CHECKING, Any, @@ -21,8 +20,8 @@ Mapping, Optional, Sequence, - Tuple, Type, + TypeVar, Union, cast, ) @@ -30,16 +29,14 @@ import numpy as np -from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._libs import lib, reduction as libreduction +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( - maybe_cast_result, + find_common_type, maybe_cast_result_dtype, - maybe_convert_objects, maybe_downcast_numeric, - maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -48,23 +45,25 @@ is_integer_dtype, is_interval_dtype, is_numeric_dtype, - is_object_dtype, is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna, notna +from pandas.core import algorithms, nanops from pandas.core.aggregation import ( + agg_list_like, + aggregate, maybe_mangle_lambdas, reconstruct_func, validate_func_kwargs, ) -import pandas.core.algorithms as algorithms +from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame -from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame +from pandas.core.generic import NDFrame from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, @@ -72,16 +71,13 @@ _apply_docs, _transform_template, get_groupby, + group_selection_context, ) from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase -from pandas.core.internals import BlockManager, make_block +from pandas.core.internals import BlockManager from pandas.core.series import Series -from pandas.core.util.numba_ import ( - 
NUMBA_FUNC_CACHE, - generate_numba_func, - split_for_numba, -) +from pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -95,7 +91,7 @@ # TODO: validate types on ScalarResult and move to _typing # Blocked from using by https://github.com/python/mypy/issues/1484 # See note at _mangle_lambda_list -ScalarResult = typing.TypeVar("ScalarResult") +ScalarResult = TypeVar("ScalarResult") def generate_property(name: str, klass: Type[FrameOrSeries]): @@ -224,12 +220,16 @@ def _selection_name(self): def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @doc( - _agg_template, examples=_agg_examples_doc, klass="Series", - ) - def aggregate( - self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs - ): + @doc(_agg_template, examples=_agg_examples_doc, klass="Series") + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + + if maybe_use_numba(engine): + with group_selection_context(self): + data = self._selected_obj + result, index = self._aggregate_with_numba( + data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor(result.ravel(), index=index, name=data.name) relabeling = func is None columns = None @@ -253,18 +253,13 @@ def aggregate( return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._python_agg_general(func, *args, **kwargs) except (ValueError, KeyError): - # Do not catch Numba errors here, we want to raise and not fall back. # TODO: KeyError is raised in _python_agg_general, - # see see test_groupby.test_basic + # see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) @@ -278,7 +273,7 @@ def aggregate( if isinstance(ret, dict): from pandas import concat - ret = concat(ret, axis=1) + ret = concat(ret.values(), axis=1, keys=[key.label for key in ret.keys()]) return ret agg = aggregate @@ -307,8 +302,8 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results = {} - for name, func in arg: + results: Dict[base.OutputKey, FrameOrSeriesUnion] = {} + for idx, (name, func) in enumerate(arg): obj = self # reset the cache so that we @@ -317,17 +312,21 @@ def _aggregate_multiple_funcs(self, arg): obj = copy.copy(obj) obj._reset_cache() obj._selection = name - results[name] = obj.aggregate(func) + results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func) if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle return results - return self.obj._constructor_expanddim(results, columns=columns) + output = self._wrap_aggregated_output(results, index=None) + return self.obj._constructor_expanddim(output, columns=columns) + # TODO: index should not be Optional - see GH 35490 def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, - ) -> Union[Series, DataFrame]: + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], + ) -> FrameOrSeriesUnion: """ Wraps the output of a SeriesGroupBy operation into the expected result. 
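
The numba path added to SeriesGroupBy.aggregate above dispatches to _aggregate_with_numba when engine="numba". A hedged usage sketch: it assumes numba is installed, and, per the _agg_template in this diff, the user-defined function must accept values and index and only a single function may be passed with this engine:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "a", "b", "b"])

    # The UDF signature is (values, index); numba JIT-compiles it, so it
    # must be nopython-compatible. Falls back to cython if engine is None.
    def group_mean(values, index):
        return np.mean(values)

    result = s.groupby(level=0).agg(group_mean, engine="numba")
    print(result)  # a    1.5 / b    3.5
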
@@ -335,7 +334,7 @@ def _wrap_series_output( ---------- output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. - index : pd.Index + index : pd.Index or None Index to apply to the output. Returns @@ -350,20 +349,25 @@ def _wrap_series_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result: Union[Series, DataFrame] + result: FrameOrSeriesUnion if len(output) > 1: result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns - else: + elif not columns.empty: result = self.obj._constructor( indexed_output[0], index=index, name=columns[0] ) + else: + result = self.obj._constructor_expanddim() return result + # TODO: Remove index argument, use self.grouper.result_index, see GH 35490 def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] - ) -> Union[Series, DataFrame]: + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], + ) -> FrameOrSeriesUnion: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -381,9 +385,7 @@ def _wrap_aggregated_output( In the vast majority of cases output will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - result = self._wrap_series_output( - output=output, index=self.grouper.result_index - ) + result = self._wrap_series_output(output=output, index=index) return self._reindex_output(result) def _wrap_transformed_output( @@ -468,35 +470,48 @@ def _get_index() -> Index: def _aggregate_named(self, func, *args, **kwargs): result = {} + initialized = False for name, group in self: - group.name = name + # Each step of this loop corresponds to + # libreduction._BaseGrouper._apply_to_group + group.name = name # NB: libreduction does not pin name + output = func(group, *args, **kwargs) - if isinstance(output, (Series, Index, np.ndarray)): - raise ValueError("Must produce aggregated value") + output = libreduction.extract_result(output) + if not initialized: + # We only do this validation on the first iteration + libreduction.check_result_array(output, 0) + initialized = True result[name] = output return result @Substitution(klass="Series") @Appender(_transform_template) - def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + + if maybe_use_numba(engine): + with group_selection_context(self): + data = self._selected_obj + result = self._transform_with_numba( + data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor( + result.ravel(), index=data.index, name=data.name + ) + func = self._get_cython_func(func) or func if not isinstance(func, str): - return self._transform_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._transform_general(func, *args, **kwargs) elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) - elif func in base.cythonized_kernels: + elif func in base.cythonized_kernels or func in base.transformation_kernels: # cythonized transform or canned "agg+broadcast" return getattr(self, func)(*args, **kwargs) - elif func in base.transformation_kernels: - return getattr(self, func)(*args, **kwargs) - # If func is a reduction, we need to broadcast the # result to the whole group. 
Compute func result # and deal with possible broadcasting below. @@ -505,52 +520,37 @@ def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): result = getattr(self, func)(*args, **kwargs) return self._transform_fast(result) - def _transform_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _transform_general(self, func, *args, **kwargs): """ Transform with a non-str `func`. """ - - if engine == "numba": - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_transform" - ) - klass = type(self._selected_obj) results = [] for name, group in self: object.__setattr__(group, "name", name) - if engine == "numba": - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - else: - res = func(group, *args, **kwargs) + res = func(group, *args, **kwargs) - if isinstance(res, (ABCDataFrame, ABCSeries)): + if isinstance(res, (DataFrame, Series)): res = res._values - indexer = self._get_index(name) - ser = klass(res, indexer) - results.append(ser) + results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat - result = concat(results).sort_index() + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): - result = maybe_downcast_to_dtype(result, dtype) + if is_numeric_dtype(result.dtype): + common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) + if common_dtype is result.dtype: + result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name result.index = self._selected_obj.index @@ -601,8 +601,8 @@ def filter(self, func, dropna=True, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) # Interpret np.nan as False. 
- def true_and_notna(x, *args, **kwargs) -> bool: - b = wrapper(x, *args, **kwargs) + def true_and_notna(x) -> bool: + b = wrapper(x) return b and notna(b) try: @@ -679,8 +679,8 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): + from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut - from pandas.core.reshape.merge import _get_join_indexers if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level @@ -708,7 +708,7 @@ def value_counts( # lab is a Categorical with categories an IntervalIndex lab = cut(Series(val), bins, include_lowest=True) lev = lab.cat.categories - lab = lev.take(lab.cat.codes) + lab = lev.take(lab.cat.codes, allow_fill=True, fill_value=lev._na_value) llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] if is_interval_dtype(lab.dtype): @@ -781,7 +781,7 @@ def value_counts( right = [diff.cumsum() - 1, codes[-1]] - _, idx = _get_join_indexers(left, right, sort=False, how="left") + _, idx = get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) if sort: @@ -928,21 +928,21 @@ class DataFrameGroupBy(GroupBy[DataFrame]): See :ref:`groupby.aggregate.named` for more.""" ) - @doc( - _agg_template, examples=_agg_examples_doc, klass="DataFrame", - ) - def aggregate( - self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs - ): - - relabeling, func, columns, order = reconstruct_func(func, **kwargs) + @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - if engine == "numba": - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + if maybe_use_numba(engine): + with group_selection_context(self): + data = self._selected_obj + result, index = self._aggregate_with_numba( + data, func, *args, engine_kwargs=engine_kwargs, **kwargs ) + return self.obj._constructor(result, index=index, columns=data.columns) - result, how = self._aggregate(func, *args, **kwargs) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + func = maybe_mangle_lambdas(func) + + result, how = aggregate(self, func, *args, **kwargs) if how is None: return result @@ -962,22 +962,23 @@ def aggregate( # try to treat as if we are passing a list try: - result = self._aggregate_multiple_funcs([func], _axis=self.axis) - except ValueError as err: - if "no results" not in str(err): - # raised directly by _aggregate_multiple_funcs - raise - result = self._aggregate_frame(func) - else: + result = agg_list_like(self, [func], _axis=self.axis) + # select everything except for the last level, which is the one # containing the name of the function(s), see GH 32040 result.columns = result.columns.rename( [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) - result.index = np.arange(len(result)) + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) + except AttributeError: + # catch exception from line 969 + # (Series does not have attribute "columns"), see GH 35246 + result = self._aggregate_frame(func) if relabeling: @@ -985,6 +986,10 @@ def aggregate( result = result.iloc[:, order] result.columns = columns + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = 
np.arange(len(result)) + return result._convert(datetime=True) agg = aggregate @@ -1007,37 +1012,83 @@ def _iterate_slices(self) -> Iterable[Series]: def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> DataFrame: - agg_blocks, agg_items = self._cython_agg_blocks( + agg_mgr = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(agg_blocks, items=agg_items) + return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 - ) -> "Tuple[List[Block], Index]": - # TODO: the actual managing of mgr_locs is a PITA - # here, it should happen via BlockManager.combine + ) -> BlockManager: data: BlockManager = self._get_data_to_aggregate() if numeric_only: data = data.get_numeric_data(copy=False) - agg_blocks: List[Block] = [] - new_items: List[np.ndarray] = [] - deleted_items: List[np.ndarray] = [] - # Some object-dtype blocks might be split into List[Block[T], Block[U]] - split_items: List[np.ndarray] = [] - split_frames: List[DataFrame] = [] - - no_result = object() - for block in data.blocks: - # Avoid inheriting result from earlier in the loop - result = no_result - locs = block.mgr_locs.as_array + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype + # this may not be the original dtype + assert not isinstance(result, DataFrame) + + dtype = maybe_cast_result_dtype(values.dtype, how) + result = maybe_downcast_numeric(result, dtype) + + if isinstance(values, Categorical) and isinstance(result, np.ndarray): + # If the Categorical op didn't raise, it is dtype-preserving + result = type(values)._from_sequence(result.ravel(), dtype=values.dtype) + # Note this will have result.dtype == dtype from above + + elif isinstance(result, np.ndarray) and result.ndim == 1: + # We went through a SeriesGroupByPath and need to reshape + # GH#32223 includes case with IntegerArray values + result = result.reshape(1, -1) + # test_groupby_duplicate_columns gets here with + # result.dtype == int64, values.dtype=object, how="min" + + return result + + def py_fallback(bvalues: ArrayLike) -> ArrayLike: + # if self.grouper.aggregate fails, we fall back to a pure-python + # solution + + # We get here with a) EADtypes and b) object dtype + obj: FrameOrSeriesUnion + + # call our grouper again with only this block + if isinstance(bvalues, ExtensionArray): + # TODO(EA2D): special case not needed with 2D EAs + obj = Series(bvalues) + else: + obj = DataFrame(bvalues.T) + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] + + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will done by later self._reindex_output() + # Doing it here creates an error. See GH#34951 + sgb = get_groupby(obj, self.grouper, observed=True) + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. 
+ result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values + return result + + def blk_func(bvalues: ArrayLike) -> ArrayLike: + try: - result, _ = self.grouper.aggregate( - block.values, how, axis=1, min_count=min_count + result = self.grouper._cython_operation( + "aggregate", bvalues, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -1048,108 +1099,21 @@ def _cython_agg_blocks( # we cannot perform the operation # in an alternate way, exclude the block assert how == "ohlc" - deleted_items.append(locs) - continue + raise - # call our grouper again with only this block - obj = self.obj[data.items[locs]] - if obj.shape[1] == 1: - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - obj = obj.iloc[:, 0] + result = py_fallback(bvalues) - # Create SeriesGroupBy with observed=True so that it does - # not try to add missing categories if grouping over multiple - # Categoricals. This will done by later self._reindex_output() - # Doing it here creates an error. See GH#34951 - s = get_groupby(obj, self.grouper, observed=True) - try: - result = s.aggregate(lambda x: alt(x, axis=self.axis)) - except TypeError: - # we may have an exception in trying to aggregate - # continue and exclude the block - deleted_items.append(locs) - continue - else: - result = cast(DataFrame, result) - # unwrap DataFrame to get array - if len(result._mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. To keep the code-path for the typical non-split case - # clean, we choose to clean up this mess later on. - split_items.append(locs) - split_frames.append(result) - continue - - assert len(result._mgr.blocks) == 1 - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) + return cast_agg_result(result, bvalues, how) - assert not isinstance(result, DataFrame) + # TypeError -> we may have an exception in trying to aggregate + # continue and exclude the block + # NotImplementedError -> "ohlc" with wrong dtype + new_mgr = data.apply(blk_func, ignore_failures=True) - if result is not no_result: - # see if we can cast the block to the desired dtype - # this may not be the original dtype - dtype = maybe_cast_result_dtype(block.dtype, how) - result = maybe_downcast_numeric(result, dtype) - - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical - # and result is ndarray[object] - # TODO(EA2D): special casing not needed with 2D EAs - assert result.ndim == 1 or result.shape[0] == 1 - try: - # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype - ) - except (ValueError, TypeError): - # reshape to be valid for non-Extension Block - result = result.reshape(1, -1) - - agg_block: Block = block.make_block(result) - - new_items.append(locs) - agg_blocks.append(agg_block) - - if not (agg_blocks or split_frames): + if not len(new_mgr): raise DataError("No numeric types to aggregate") - if split_items: - # Clean up the mess left over from split blocks. 
- for locs, result in zip(split_items, split_frames): - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - new_items.append(np.array([loc], dtype=locs.dtype)) - agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0]) - - # reset the locs in the blocks to correspond to our - # current ordering - indexer = np.concatenate(new_items) - agg_items = data.items.take(np.sort(indexer)) - - if deleted_items: - - # we need to adjust the indexer to account for the - # items we have removed - # really should be done in internals :< - - deleted = np.concatenate(deleted_items) - ai = np.arange(len(data)) - mask = np.zeros(len(data)) - mask[deleted] = 1 - indexer = (ai - mask.cumsum())[indexer] - - offset = 0 - for blk in agg_blocks: - loc = len(blk.mgr_locs) - blk.mgr_locs = indexer[offset : (offset + loc)] - offset += loc - - return agg_blocks, agg_items + return new_mgr def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: @@ -1158,7 +1122,7 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: axis = self.axis obj = self._obj_with_exclusions - result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {} + result: Dict[Label, Union[NDFrame, np.ndarray]] = {} if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) @@ -1181,7 +1145,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: data = obj[item] colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) - cast = self._transform_should_cast(func) try: result[item] = colg.aggregate(func, *args, **kwargs) @@ -1194,10 +1157,6 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: cannot_agg.append(item) continue - else: - if cast: - result[item] = maybe_cast_result(result[item], data) - result_columns = obj.columns if cannot_agg: result_columns = result_columns.drop(cannot_agg) @@ -1208,208 +1167,141 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - # GH12824 first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] + key_index = self.grouper.result_index if self.as_index else None - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) + if isinstance(first_not_none, (np.ndarray, Index)): + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? 
we used to do this + # after raising AttributeError above + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) + elif not isinstance(first_not_none, Series): + # values are not series or array-like but scalars + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns + if self.as_index: + return self.obj._constructor_sliced(values, index=key_index) + else: + result = DataFrame(values, index=key_index, columns=[self._selection]) + self._insert_inaxis_grouper_inplace(result) + return result + else: + # values are Series + return self._wrap_applied_output_series( + keys, values, not_indexed_same, first_not_none, key_index + ) - # reorder the values - values = [values[i] for i in indexer] + def _wrap_applied_output_series( + self, + keys, + values: List[Series], + not_indexed_same: bool, + first_not_none, + key_index, + ) -> FrameOrSeriesUnion: + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = first_not_none._construct_axes_dict() + backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) + values = [x if (x is not None) else backup for x in values] + + all_indexed_same = all_indexes_same(x.index for x in values) + + # GH3596 + # provide a reduction (Frame -> Series) if groups are + # unique + if self.squeeze: + applied_index = self._selected_obj._get_axis(self.axis) + singular_series = len(values) == 1 and applied_index.nlevels == 1 + + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: + # still a series + # path added as of GH 5545 + elif all_indexed_same: + from pandas.core.reshape.concat import concat - key_index = Index(keys, name=key_names[0]) + return concat(values) - # don't use the key indexer - if not self.as_index: - key_index = None + if not all_indexed_same: + # GH 8467 + return self._concat_objects(keys, values, not_indexed_same=True) - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + # Combine values + # vstack+constructor is faster than concat and handles MI-columns + stacked_values = np.vstack([np.asarray(v) for v in values]) - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) - - values = [x if (x is not None) else backup for x in values] - - v = values[0] - - if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: - if isinstance(v, Series): - applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) - singular_series = len(values) == 1 and applied_index.nlevels == 1 - - # GH3596 - # provide a reduction (Frame -> Series) if groups are - # unique - if self.squeeze: - # assign the name to this series - if 
singular_series: - values[0].name = keys[0] - - # GH2893 - # we have series in the values array, we want to - # produce a series: - # if any of the sub-series are not indexed the same - # OR we don't have a multi-index and we have only a - # single values - return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same - ) - - # still a series - # path added as of GH 5545 - elif all_indexed_same: - from pandas.core.reshape.concat import concat - - return concat(values) - - if not all_indexed_same: - # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) - - if self.axis == 0 and isinstance(v, ABCSeries): - # GH6124 if the list of Series have a consistent name, - # then propagate that name to the result. - index = v.index.copy() - if index.name is None: - # Only propagate the series name to the result - # if all series have a consistent name. If the - # series do not have a consistent name, do - # nothing. - names = {v.name for v in values} - if len(names) == 1: - index.name = list(names)[0] - - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index - ) - elif not self.as_index: - # We add grouping column below, so create a frame here - result = DataFrame( - values, index=key_index, columns=[self._selection] - ) - else: - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? 
we used to do this - # after raising AttributeError above - return self.obj._constructor_sliced( - values, index=key_index, name=self._selection_name - ) + if self.axis == 0: + index = key_index + columns = first_not_none.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = first_not_none.index + columns = key_index + stacked_values = stacked_values.T - # if we have date/time like in the original, then coerce dates - # as we are stacking can easily have object dtypes here - so = self._selected_obj - if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): - result = _recast_datetimelike_result(result) - else: - result = result._convert(datetime=True) + result = self.obj._constructor(stacked_values, index=index, columns=columns) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + # if we have date/time like in the original, then coerce dates + # as we are stacking can easily have object dtypes here + so = self._selected_obj + if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): + result = result._convert(datetime=True) + else: + result = result._convert(datetime=True) - return self._reindex_output(result) + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) - # values are not series or array-like but scalars - else: - # self._selection_name not passed through to Series as the - # result should not take the name of original selection - # of columns - return self.obj._constructor_sliced(values, index=key_index) + return self._reindex_output(result) - def _transform_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _transform_general(self, func, *args, **kwargs): from pandas.core.reshape.concat import concat applied = [] obj = self._obj_with_exclusions gen = self.grouper.get_iterator(obj, axis=self.axis) - if engine == "numba": - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_transform" - ) - else: - fast_path, slow_path = self._define_paths(func, *args, **kwargs) + fast_path, slow_path = self._define_paths(func, *args, **kwargs) for name, group in gen: object.__setattr__(group, "name", name) - if engine == "numba": - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - # Return the result as a DataFrame for concatenation later - res = self.obj._constructor( - res, index=group.index, columns=group.columns - ) - else: - # Try slow path and fast path. - try: - path, res = self._choose_path(fast_path, slow_path, group) - except TypeError: - return self._transform_item_by_item(obj, fast_path) - except ValueError as err: - msg = "transform must return a scalar value for each group" - raise ValueError(msg) from err + # Try slow path and fast path. 
+ try: + path, res = self._choose_path(fast_path, slow_path, group) + except TypeError: + return self._transform_item_by_item(obj, fast_path) + except ValueError as err: + msg = "transform must return a scalar value for each group" + raise ValueError(msg) from err if isinstance(res, Series): @@ -1443,25 +1335,28 @@ def _transform_general( @Substitution(klass="DataFrame") @Appender(_transform_template) - def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + + if maybe_use_numba(engine): + with group_selection_context(self): + data = self._selected_obj + result = self._transform_with_numba( + data, func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor(result, index=data.index, columns=data.columns) # optimized transforms func = self._get_cython_func(func) or func if not isinstance(func, str): - return self._transform_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._transform_general(func, *args, **kwargs) elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) - elif func in base.cythonized_kernels: + elif func in base.cythonized_kernels or func in base.transformation_kernels: # cythonized transformation or canned "reduction+broadcast" return getattr(self, func)(*args, **kwargs) - elif func in base.transformation_kernels: - return getattr(self, func)(*args, **kwargs) - # GH 30918 # Use _transform_fast only when we know func is an aggregation if func in base.reduction_kernels: @@ -1477,9 +1372,7 @@ def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): ): return self._transform_fast(result) - return self._transform_general( - func, engine=engine, engine_kwargs=engine_kwargs, *args, **kwargs - ) + return self._transform_general(func, *args, **kwargs) def _transform_fast(self, result: DataFrame) -> DataFrame: """ @@ -1487,13 +1380,13 @@ def _transform_fast(self, result: DataFrame) -> DataFrame: """ obj = self._obj_with_exclusions - # for each col, reshape to to size of original frame - # by take operation + # for each col, reshape to size of original frame by take operation ids, _, ngroup = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) - output = [] - for i, _ in enumerate(result.columns): - output.append(algorithms.take_1d(result.iloc[:, i].values, ids)) + output = [ + algorithms.take_1d(result.iloc[:, i].values, ids) + for i, _ in enumerate(result.columns) + ] return self.obj._constructor._from_arrays( output, columns=result.columns, index=obj.index @@ -1522,7 +1415,7 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFram except AssertionError: raise except Exception: - # GH#29631 For user-defined function, we cant predict what may be + # GH#29631 For user-defined function, we can't predict what may be # raised; see test_transform.test_transform_fastpath_raises return path, res @@ -1552,7 +1445,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: else: inds.append(i) - if len(output) == 0: + if not output: raise TypeError("Transform function invalid for data types") columns = obj.columns @@ -1626,6 +1519,9 @@ def filter(self, func, dropna=True, *args, **kwargs): return self._apply_filter(indices, dropna) def __getitem__(self, key): + if self.axis == 1: + # GH 37725 + raise ValueError("Cannot subset columns 
when using axis=1") # per GH 23566 if isinstance(key, tuple) and len(key) > 1: # if len == 1, then it becomes a SeriesGroupBy and this is actually @@ -1646,7 +1542,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -1657,22 +1553,37 @@ def _gotitem(self, key, ndim: int, subset=None): return DataFrameGroupBy( subset, self.grouper, - selection=key, + axis=self.axis, + level=self.level, grouper=self.grouper, exclusions=self.exclusions, + selection=key, as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, observed=self.observed, + mutated=self.mutated, + dropna=self.dropna, ) elif ndim == 1: if subset is None: subset = self.obj[key] return SeriesGroupBy( - subset, selection=key, grouper=self.grouper, observed=self.observed + subset, + level=self.level, + grouper=self.grouper, + selection=key, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + dropna=self.dropna, ) raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj) -> DataFrame: + def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1689,27 +1600,23 @@ def _get_data_to_aggregate(self) -> BlockManager: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result): + def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: # zip in reverse so we can always insert at loc 0 - izip = zip( - *map( - reversed, - ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings], - ), - ) - ) columns = result.columns - for name, lev, in_axis in izip: + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): # GH #28549 # When using .apply(-), name will be in columns already if in_axis and name not in columns: result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. 
@@ -1724,8 +1631,8 @@ def _wrap_aggregated_output( DataFrame """ indexed_output = {key.position: val for key, val in output.items()} - name = self._obj_with_exclusions._get_axis(1 - self.axis).name - columns = Index([key.label for key in output], name=name) + columns = Index([key.label for key in output]) + columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names) result = self.obj._constructor(indexed_output) result.columns = columns @@ -1734,8 +1641,7 @@ def _wrap_aggregated_output( self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: - index = self.grouper.result_index - result.index = index + result.index = self.grouper.result_index if self.axis == 1: result = result.T @@ -1758,15 +1664,21 @@ def _wrap_transformed_output( DataFrame """ indexed_output = {key.position: val for key, val in output.items()} - columns = Index(key.label for key in output) - result = self.obj._constructor(indexed_output) - result.columns = columns + + if self.axis == 1: + result = result.T + result.columns = self.obj.columns + else: + columns = Index(key.label for key in output) + columns.name = self.obj.columns.name + result.columns = columns + result.index = self.obj.index return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks(self, blocks: Sequence["Block"], items: Index) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1793,7 +1705,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func): + def _apply_to_column_groupbys(self, func) -> DataFrame: from pandas.core.reshape.concat import concat return concat( @@ -1802,7 +1714,7 @@ def _apply_to_column_groupbys(self, func): axis=1, ) - def count(self): + def count(self) -> DataFrame: """ Compute count of group, excluding missing values. @@ -1815,18 +1727,28 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - # TODO(2DEA): reshape would not be necessary with 2D EAs - vals = ((mask & ~isna(blk.values).reshape(blk.shape)) for blk in data.blocks) - locs = (blk.mgr_locs for blk in data.blocks) + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(2DEA): reshape would not be necessary with 2D EAs + if bvalues.ndim == 1: + # EA + masked = mask & ~isna(bvalues).reshape(1, -1) + else: + masked = mask & ~isna(bvalues) - counted = ( - lib.count_level_2d(x, labels=ids, max_bin=ngroups, axis=1) for x in vals - ) - blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] + counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + return counted + + new_mgr = data.apply(hfunc) - return self._wrap_agged_blocks(blocks, items=data.items) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _wrap_agged_blocks() returns. GH 35028 + with com.temp_setattr(self, "observed", True): + result = self._wrap_agged_blocks(new_mgr.blocks, items=data.items) + + return self._reindex_output(result, fill_value=0) - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> DataFrame: """ Return DataFrame with counts of unique elements in each position. 
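
The count() rewrite above funnels per-block masks through data.apply(hfunc) and then reindexes with fill_value=0, so unobserved categorical groups report 0 rather than the NaN the old reindexing produced (GH 35028). A small check of that behavior:

    import pandas as pd

    cat = pd.Categorical(["a", "a"], categories=["a", "b"])
    df = pd.DataFrame({"key": cat, "val": [1, 2]})

    # With this change the unobserved category "b" counts as 0, not NaN.
    print(df.groupby("key").count())
    #      val
    # key
    # a      2
    # b      0
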
@@ -1892,6 +1814,7 @@ def nunique(self, dropna: bool = True): ], axis=1, ) + results = cast(DataFrame, results) if axis_number == 1: results = results.T @@ -1903,40 +1826,46 @@ def nunique(self, dropna: bool = True): self._insert_inaxis_grouper_inplace(results) return results - boxplot = boxplot_frame_groupby - - -def _recast_datetimelike_result(result: DataFrame) -> DataFrame: - """ - If we have date/time like in the original, then coerce dates - as we are stacking can easily have object dtypes here. - - Parameters - ---------- - result : DataFrame + @Appender(DataFrame.idxmax.__doc__) + def idxmax(self, axis=0, skipna: bool = True): + axis = DataFrame._get_axis_number(axis) + numeric_only = None if axis == 0 else False + + def func(df): + # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 + res = df._reduce( + nanops.nanargmax, + "argmax", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + ) + indices = res._values + index = df._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return df._constructor_sliced(result, index=res.index) + + return self._python_apply_general(func, self._obj_with_exclusions) + + @Appender(DataFrame.idxmin.__doc__) + def idxmin(self, axis=0, skipna: bool = True): + axis = DataFrame._get_axis_number(axis) + numeric_only = None if axis == 0 else False + + def func(df): + # NB: here we use numeric_only=None, in DataFrame it is False GH#38217 + res = df._reduce( + nanops.nanargmin, + "argmin", + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + ) + indices = res._values + index = df._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return df._constructor_sliced(result, index=res.index) - Returns - ------- - DataFrame + return self._python_apply_general(func, self._obj_with_exclusions) - Notes - ----- - - Assumes Groupby._selected_obj has ndim==2 and at least one - datetimelike column - """ - result = result.copy() - - obj_cols = [ - idx - for idx in range(len(result.columns)) - if is_object_dtype(result.dtypes.iloc[idx]) - ] - - # See GH#26285 - for n in obj_cols: - converted = maybe_convert_objects( - result.iloc[:, n].values, convert_numeric=False - ) - - result.iloc[:, n] = converted - return result + boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..947f18901775b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -11,7 +11,7 @@ class providing the base-class of operations. import datetime from functools import partial, wraps import inspect -import re +from textwrap import dedent import types from typing import ( Callable, @@ -20,10 +20,12 @@ class providing the base-class of operations. Generic, Hashable, Iterable, + Iterator, List, Mapping, Optional, Sequence, + Set, Tuple, Type, TypeVar, @@ -34,14 +36,22 @@ class providing the base-class of operations. 
from pandas._config.config import option_context -from pandas._libs import Timestamp +from pandas._libs import Timestamp, lib import pandas._libs.groupby as libgroupby -from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar +from pandas._typing import ( + F, + FrameOrSeries, + FrameOrSeriesUnion, + IndexLabel, + Label, + Scalar, + final, +) from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc -from pandas.core.dtypes.cast import maybe_cast_result +from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( ensure_float, is_bool_dtype, @@ -51,6 +61,7 @@ class providing the base-class of operations. is_numeric_dtype, is_object_dtype, is_scalar, + is_timedelta64_dtype, ) from pandas.core.dtypes.missing import isna, notna @@ -61,20 +72,22 @@ class providing the base-class of operations. import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base, ops +from pandas.core.groupby import base, numba_, ops from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE _common_see_also = """ See Also -------- - Series.%(name)s - DataFrame.%(name)s + Series.%(name)s : Apply a function %(name)s to a Series. + DataFrame.%(name)s : Apply a function %(name)s + to each row or column of a DataFrame. """ -_apply_docs = dict( - template=""" +_apply_docs = { + "template": """ Apply function `func` group-wise and combine the results together. The function passed to `apply` must take a {input} as its first @@ -111,7 +124,7 @@ class providing the base-class of operations. Series.apply : Apply a function to a Series. DataFrame.apply : Apply a function to each row or column of a DataFrame. """, - dataframe_examples=""" + "dataframe_examples": """ >>> df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4,6, 5]}) @@ -151,7 +164,7 @@ class providing the base-class of operations. b 2 dtype: int64 """, - series_examples=""" + "series_examples": """ >>> s = pd.Series([0, 1, 2], index='a a b'.split()) >>> g = s.groupby(s.index) @@ -190,7 +203,7 @@ class providing the base-class of operations. -------- {examples} """, -) +} _groupby_agg_method_template = """ Compute {fname} of group values. @@ -214,8 +227,6 @@ class providing the base-class of operations. Apply a function `func` with arguments to this %(klass)s object and return the function's result. -%(versionadded)s - Use `.pipe` when you want to improve readability by chaining together functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing @@ -285,10 +296,11 @@ class providing the base-class of operations. .. versionchanged:: 1.1.0 *args - Positional arguments to pass to func -engine : str, default 'cython' + Positional arguments to pass to func. +engine : str, default None * ``'cython'`` : Runs the function through C-extensions from cython. * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` .. versionadded:: 1.1.0 engine_kwargs : dict, default None @@ -309,9 +321,12 @@ class providing the base-class of operations. 
 See Also
 --------
-%(klass)s.groupby.apply
-%(klass)s.groupby.aggregate
-%(klass)s.transform
+%(klass)s.groupby.apply : Apply function func group-wise
+    and combine the results together.
+%(klass)s.groupby.aggregate : Aggregate using one or more
+    operations over the specified axis.
+%(klass)s.transform : Transforms the Series on each group
+    based on the given function.

 Notes
 -----
@@ -382,7 +397,8 @@ class providing the base-class of operations.
     - dict of axis labels -> functions, function names or list of such.

     Can also accept a Numba JIT function with
-    ``engine='numba'`` specified.
+    ``engine='numba'`` specified. Only passing a single function is supported
+    with this engine.

     If the ``'numba'`` engine is chosen, the function must be a user
     defined function with ``values`` and ``index`` as the
@@ -392,10 +408,11 @@ class providing the base-class of operations.

 *args
-    Positional arguments to pass to func
-engine : str, default 'cython'
+    Positional arguments to pass to func.
+engine : str, default None
     * ``'cython'`` : Runs the function through C-extensions from cython.
     * ``'numba'`` : Runs the function through JIT compiled code from numba.
+    * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``

     .. versionadded:: 1.1.0
 engine_kwargs : dict, default None
@@ -416,9 +433,12 @@ class providing the base-class of operations.

 See Also
 --------
-{klass}.groupby.apply
-{klass}.groupby.transform
-{klass}.aggregate
+{klass}.groupby.apply : Apply function func group-wise
+    and combine the results together.
+{klass}.groupby.transform : Transforms the Series on each group
+    based on the given function.
+{klass}.aggregate : Aggregate using one or more
+    operations over the specified axis.

 Notes
 -----
@@ -429,6 +449,7 @@ class providing the base-class of operations.
 """


+@final
 class GroupByPlot(PandasObject):
     """
     Class implementing the .plot attribute for groupby objects.
@@ -455,13 +476,15 @@ def f(self):


 @contextmanager
-def _group_selection_context(groupby):
+def group_selection_context(groupby: "BaseGroupBy") -> Iterator["BaseGroupBy"]:
     """
-    Set / reset the _group_selection_context.
+    Set / reset the group_selection_context.
""" groupby._set_group_selection() - yield groupby - groupby._reset_group_selection() + try: + yield groupby + finally: + groupby._reset_group_selection() _KeysArgType = Union[ @@ -473,19 +496,34 @@ def _group_selection_context(groupby): ] -class _GroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): - _group_selection = None +class BaseGroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): + _group_selection: Optional[IndexLabel] = None _apply_allowlist: FrozenSet[str] = frozenset() + _hidden_attrs = PandasObject._hidden_attrs | { + "as_index", + "axis", + "dropna", + "exclusions", + "grouper", + "group_keys", + "keys", + "level", + "mutated", + "obj", + "observed", + "sort", + "squeeze", + } def __init__( self, obj: FrameOrSeries, keys: Optional[_KeysArgType] = None, axis: int = 0, - level=None, - grouper: "Optional[ops.BaseGrouper]" = None, - exclusions=None, - selection=None, + level: Optional[IndexLabel] = None, + grouper: Optional["ops.BaseGrouper"] = None, + exclusions: Optional[Set[Label]] = None, + selection: Optional[IndexLabel] = None, as_index: bool = True, sort: bool = True, group_keys: bool = True, @@ -533,35 +571,40 @@ def __init__( self.obj = obj self.axis = obj._get_axis_number(axis) self.grouper = grouper - self.exclusions = set(exclusions) if exclusions else set() + self.exclusions = exclusions or set() + @final def __len__(self) -> int: return len(self.groups) + @final def __repr__(self) -> str: # TODO: Better repr for GroupBy object return object.__repr__(self) - def _assure_grouper(self): + def _assure_grouper(self) -> None: """ We create the grouper on instantiation sub-classes may have a different policy. """ pass + @final @property - def groups(self): + def groups(self) -> Dict[Hashable, np.ndarray]: """ Dict {group name -> group labels}. """ self._assure_grouper() return self.grouper.groups + @final @property - def ngroups(self): + def ngroups(self) -> int: self._assure_grouper() return self.grouper.ngroups + @final @property def indices(self): """ @@ -570,6 +613,7 @@ def indices(self): self._assure_grouper() return self.grouper.indices + @final def _get_indices(self, names): """ Safe get multiple indices, translate keys for @@ -620,12 +664,14 @@ def get_converter(s): return [self.indices.get(name, []) for name in names] + @final def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ return self._get_indices([name])[0] + @final @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy @@ -637,7 +683,8 @@ def _selected_obj(self): else: return self.obj[self._selection] - def _reset_group_selection(self): + @final + def _reset_group_selection(self) -> None: """ Clear group based selection. @@ -649,7 +696,8 @@ def _reset_group_selection(self): self._group_selection = None self._reset_cache("_selected_obj") - def _set_group_selection(self): + @final + def _set_group_selection(self) -> None: """ Create group based selection. 
@@ -674,7 +722,10 @@ def _set_group_selection(self):
         self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
         self._reset_cache("_selected_obj")

-    def _set_result_index_ordered(self, result):
+    @final
+    def _set_result_index_ordered(
+        self, result: "OutputFrameOrSeries"
+    ) -> "OutputFrameOrSeries":
         # set the result index on the passed values object and
         # return the new object, xref 8046
@@ -688,7 +739,8 @@ def _set_result_index_ordered(self, result):
         result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
         return result

-    def _dir_additions(self):
+    @final
+    def _dir_additions(self) -> Set[str]:
         return self.obj._dir_additions() | self._apply_allowlist

     def __getattr__(self, attr: str):
@@ -703,24 +755,25 @@ def __getattr__(self, attr: str):

     @Substitution(
         klass="GroupBy",
-        versionadded=".. versionadded:: 0.21.0",
-        examples="""\
->>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
->>> df
-   A  B
-0  a  1
-1  b  2
-2  a  3
-3  b  4
-
-To get the difference between each groups maximum and minimum value in one
-pass, you can do
-
->>> df.groupby('A').pipe(lambda x: x.max() - x.min())
-   B
-A
-a  2
-b  2""",
+        examples=dedent(
+            """\
+        >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
+        >>> df
+           A  B
+        0  a  1
+        1  b  2
+        2  a  3
+        3  b  4
+
+        To get the difference between each group's maximum and minimum value in one
+        pass, you can do
+
+        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
+           B
+        A
+        a  2
+        b  2"""
+        ),
     )
     @Appender(_pipe_template)
     def pipe(self, func, *args, **kwargs):
@@ -728,16 +781,16 @@ def pipe(self, func, *args, **kwargs):

     plot = property(GroupByPlot)

-    def _make_wrapper(self, name):
+    @final
+    def _make_wrapper(self, name: str) -> Callable:
         assert name in self._apply_allowlist

-        self._set_group_selection()
-
-        # need to setup the selection
-        # as are not passed directly but in the grouper
-        f = getattr(self._obj_with_exclusions, name)
-        if not isinstance(f, types.MethodType):
-            return self.apply(lambda self: getattr(self, name))
+        with group_selection_context(self):
+            # need to set up the selection,
+            # as it is not passed directly but via the grouper
+            f = getattr(self._obj_with_exclusions, name)
+            if not isinstance(f, types.MethodType):
+                return self.apply(lambda self: getattr(self, name))

         f = getattr(type(self._obj_with_exclusions), name)
         sig = inspect.signature(f)
@@ -761,27 +814,12 @@ def curried(x):
         if name in base.plotting_methods:
             return self.apply(curried)

-        try:
-            return self._python_apply_general(curried, self._obj_with_exclusions)
-        except TypeError as err:
-            if not re.search(
-                "reduction operation '.*' not allowed for this dtype", str(err)
-            ):
-                # We don't have a cython implementation
-                # TODO: is the above comment accurate?
-                raise
-
-            if self.obj.ndim == 1:
-                # this can be called recursively, so need to raise ValueError
-                raise ValueError
-
-            # GH#3688 try to operate item-by-item
-            result = self._aggregate_item_by_item(name, *args, **kwargs)
-            return result
+        return self._python_apply_general(curried, self._obj_with_exclusions)

         wrapper.__name__ = name
         return wrapper

+    @final
     def get_group(self, name, obj=None):
         """
         Construct DataFrame from group with provided name.
@@ -808,7 +846,7 @@ def get_group(self, name, obj=None):

         return obj._take_with_is_copy(inds, axis=self.axis)

-    def __iter__(self):
+    def __iter__(self) -> Iterator[Tuple[Label, FrameOrSeries]]:
         """
         Groupby iterator.

@@ -863,11 +901,12 @@ def f(g):
                 # fails on *some* columns, e.g.
a numeric operation # on a string grouper column - with _group_selection_context(self): + with group_selection_context(self): return self._python_apply_general(f, self._selected_obj) return result + @final def _python_apply_general( self, f: F, data: FrameOrSeriesUnion ) -> FrameOrSeriesUnion: @@ -898,6 +937,7 @@ def _iterate_slices(self) -> Iterable[Series]: def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) + @final def _cumcount_array(self, ascending: bool = True): """ Parameters @@ -930,24 +970,12 @@ def _cumcount_array(self, ascending: bool = True): rev[sorter] = np.arange(count, dtype=np.intp) return out[rev].astype(np.int64, copy=False) - def _transform_should_cast(self, func_nm: str) -> bool: - """ - Parameters - ---------- - func_nm: str - The name of the aggregation function being performed - - Returns - ------- - bool - Whether transform should attempt to cast the result of aggregation - """ - filled_series = self.grouper.size().fillna(0) - assert filled_series is not None - return filled_series.gt(0).any() and func_nm not in base.cython_cast_blocklist - - def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): + @final + def _cython_transform( + self, how: str, numeric_only: bool = True, axis: int = 0, **kwargs + ): output: Dict[base.OutputKey, np.ndarray] = {} + for idx, obj in enumerate(self._iterate_slices()): name = obj.name is_numeric = is_numeric_dtype(obj.dtype) @@ -955,22 +983,23 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): continue try: - result, _ = self.grouper.transform(obj.values, how, **kwargs) + result = self.grouper._cython_operation( + "transform", obj._values, how, axis, **kwargs + ) except NotImplementedError: continue - if self._transform_should_cast(how): - result = maybe_cast_result(result, obj, how=how) - key = base.OutputKey(label=name, position=idx) output[key] = result - if len(output) == 0: + if not output: raise DataError("No numeric types to aggregate") return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index] + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): @@ -979,6 +1008,7 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + @final def _agg_general( self, numeric_only: bool = True, @@ -987,28 +1017,32 @@ def _agg_general( alias: str, npfunc: Callable, ): - self._set_group_selection() - - # try a cython aggregation if we can - try: - return self._cython_agg_general( - how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, - ) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes + with group_selection_context(self): + # try a cython aggregation if we can + result = None + try: + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) + except DataError: pass - else: - raise + except NotImplementedError as err: + if "function is not implemented for this dtype" in str( + err + ) 
or "category dtype not supported" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + else: + raise - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + # apply a non-cython aggregation + if result is None: + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1025,34 +1059,105 @@ def _cython_agg_general( if numeric_only and not is_numeric: continue - result, agg_names = self.grouper.aggregate( - obj._values, how, min_count=min_count + result = self.grouper._cython_operation( + "aggregate", obj._values, how, axis=0, min_count=min_count ) - if agg_names: + if how == "ohlc": # e.g. ohlc + agg_names = ["open", "high", "low", "close"] assert len(agg_names) == result.shape[1] for result_column, result_name in zip(result.T, agg_names): key = base.OutputKey(label=result_name, position=idx) - output[key] = maybe_cast_result(result_column, obj, how=how) + output[key] = result_column idx += 1 else: assert result.ndim == 1 key = base.OutputKey(label=name, position=idx) - output[key] = maybe_cast_result(result, obj, how=how) + output[key] = result idx += 1 - if len(output) == 0: + if not output: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) - def _python_agg_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + @final + def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby transform routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." + ) + group_keys = self.grouper._get_group_keys() + labels, _, n_groups = self.grouper.group_info + sorted_index = get_group_index_sorter(labels, n_groups) + sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + starts, ends = lib.generate_slices(sorted_labels, n_groups) + + numba_transform_func = numba_.generate_numba_transform_func( + tuple(args), kwargs, func, engine_kwargs + ) + result = numba_transform_func( + sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + ) + + cache_key = (func, "groupby_transform") + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_transform_func + + # result values needs to be resorted to their original positions since we + # evaluated the data sorted by group + return result.take(np.argsort(sorted_index), axis=0) + + @final + def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby aggregation routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." 
+ ) + group_keys = self.grouper._get_group_keys() + labels, _, n_groups = self.grouper.group_info + sorted_index = get_group_index_sorter(labels, n_groups) + sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + starts, ends = lib.generate_slices(sorted_labels, n_groups) + + numba_agg_func = numba_.generate_numba_agg_func( + tuple(args), kwargs, func, engine_kwargs + ) + result = numba_agg_func( + sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + ) + + cache_key = (func, "groupby_agg") + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_agg_func + + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(group_keys, names=self.grouper.names) + else: + index = Index(group_keys, name=self.grouper.names[0]) + return result, index + + @final + def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) - if engine != "numba": - f = lambda x: func(x, *args, **kwargs) + f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate output dict output: Dict[base.OutputKey, np.ndarray] = {} @@ -1063,43 +1168,36 @@ def _python_agg_general( # agg_series below assumes ngroups > 0 continue - if engine == "numba": - result, counts = self.grouper.agg_series( - obj, - func, - *args, - engine=engine, - engine_kwargs=engine_kwargs, - **kwargs, - ) - else: - try: - # if this function is invalid for this dtype, we will ignore it. - result, counts = self.grouper.agg_series(obj, f) - except TypeError: - continue + try: + # if this function is invalid for this dtype, we will ignore it. + result, counts = self.grouper.agg_series(obj, f) + except TypeError: + continue assert result is not None key = base.OutputKey(label=name, position=idx) - output[key] = maybe_cast_result(result, obj, numeric_only=True) - - if len(output) == 0: - return self._python_apply_general(f, self._selected_obj) - if self.grouper._filter_empty_groups: + if is_numeric_dtype(obj.dtype): + result = maybe_downcast_numeric(result, obj.dtype) - mask = counts.ravel() > 0 - for key, result in output.items(): + if self.grouper._filter_empty_groups: + mask = counts.ravel() > 0 # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): values = ensure_float(values) - output[key] = maybe_cast_result(values[mask], result) + result = maybe_downcast_numeric(values[mask], result.dtype) - return self._wrap_aggregated_output(output) + output[key] = result + + if not output: + return self._python_apply_general(f, self._selected_obj) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) + + @final def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -1113,19 +1211,19 @@ def reset_identity(values): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self._selected_obj._get_axis(self.axis) + ax = self.filter(lambda x: True).axes[self.axis] # this is a very unfortunate situation # we can't use reindex to restore the original order # when the ax has duplicates # so we resort to this # GH 14776, 30667 - if ax.has_duplicates: + if ax.has_duplicates and not result.axes[self.axis].equals(ax): indexer, _ = result.index.get_indexer_non_unique(ax.values) indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) else: - result = result.reindex(ax, axis=self.axis) + result = 
result.reindex(ax, axis=self.axis, copy=False) elif self.group_keys: @@ -1161,6 +1259,7 @@ def reset_identity(values): return result + @final def _apply_filter(self, indices, dropna): if len(indices) == 0: indices = np.array([], dtype="int64") @@ -1182,7 +1281,7 @@ def _apply_filter(self, indices, dropna): OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) -class GroupBy(_GroupBy[FrameOrSeries]): +class GroupBy(BaseGroupBy[FrameOrSeries]): """ Class for grouping and aggregating relational data. @@ -1250,6 +1349,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): more """ + @final @property def _obj_1d_constructor(self) -> Type["Series"]: # GH28330 preserve subclassed Series/DataFrames @@ -1258,6 +1358,7 @@ def _obj_1d_constructor(self) -> Type["Series"]: assert isinstance(self.obj, Series) return self.obj._constructor + @final def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. @@ -1287,6 +1388,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: skipna=skipna, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def any(self, skipna: bool = True): @@ -1300,10 +1402,13 @@ def any(self, skipna: bool = True): Returns ------- - bool + Series or DataFrame + DataFrame or Series of boolean values, where a value is True if any element + is True within its respective group, False otherwise. """ return self._bool_agg("any", skipna) + @final @Substitution(name="groupby") @Appender(_common_see_also) def all(self, skipna: bool = True): @@ -1317,7 +1422,9 @@ def all(self, skipna: bool = True): Returns ------- - bool + Series or DataFrame + DataFrame or Series of boolean values, where a value is True if all elements + are True within its respective group, False otherwise. 
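The expanded `any`/`all` docstrings above spell out that these reductions return one boolean row per group rather than a single bool. A small usage sketch of the documented behavior:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [True, False, False]})

# One boolean result per group: group "a" contains a True value, group
# "b" does not, so any() gives a -> True, b -> False.
print(df.groupby("key").any())

# all() requires every value in the group to be truthy:
# a -> False (one False present), b -> False.
print(df.groupby("key").all())
```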
""" return self._bool_agg("all", skipna) @@ -1335,6 +1442,7 @@ def count(self): # defined here for API doc raise NotImplementedError + @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def mean(self, numeric_only: bool = True): @@ -1391,6 +1499,7 @@ def mean(self, numeric_only: bool = True): numeric_only=numeric_only, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def median(self, numeric_only=True): @@ -1416,6 +1525,7 @@ def median(self, numeric_only=True): numeric_only=numeric_only, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def std(self, ddof: int = 1): @@ -1445,6 +1555,7 @@ def std(self, ddof: int = 1): ddof=ddof, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def var(self, ddof: int = 1): @@ -1469,9 +1580,10 @@ def var(self, ddof: int = 1): ) else: func = lambda x: x.var(ddof=ddof) - with _group_selection_context(self): + with group_selection_context(self): return self._python_agg_general(func) + @final @Substitution(name="groupby") @Appender(_common_see_also) def sem(self, ddof: int = 1): @@ -1497,13 +1609,12 @@ def sem(self, ddof: int = 1): cols = result.columns.get_indexer_for( result.columns.difference(self.exclusions).unique() ) - # TODO(GH-22046) - setting with iloc broken if labels are not unique - # .values to remove labels - result.iloc[:, cols] = ( - result.iloc[:, cols].values / np.sqrt(self.count().iloc[:, cols]).values + result.iloc[:, cols] = result.iloc[:, cols] / np.sqrt( + self.count().iloc[:, cols] ) return result + @final @Substitution(name="groupby") @Appender(_common_see_also) def size(self) -> FrameOrSeriesUnion: @@ -1529,40 +1640,54 @@ def size(self) -> FrameOrSeriesUnion: return self._reindex_output(result, fill_value=0) + @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): - return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum - ) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="add", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) + + @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) def prod(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) + @final @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) def min(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) + @final @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) def max(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) + @final @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): - """Helper function for first item that isn't NA. 
- """ - x = x.array[notna(x.array)] - if len(x) == 0: + """Helper function for first item that isn't NA.""" + arr = x.array[notna(x.array)] + if not len(arr): return np.nan - return x[0] + return arr[0] if isinstance(obj, DataFrame): return obj.apply(first, axis=axis) @@ -1578,16 +1703,16 @@ def first(x: Series): npfunc=first_compat, ) + @final @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): - """Helper function for last item that isn't NA. - """ - x = x.array[notna(x.array)] - if len(x) == 0: + """Helper function for last item that isn't NA.""" + arr = x.array[notna(x.array)] + if not len(arr): return np.nan - return x[-1] + return arr[-1] if isinstance(obj, DataFrame): return obj.apply(last, axis=axis) @@ -1603,6 +1728,7 @@ def last(x: Series): npfunc=last_compat, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def ohlc(self) -> DataFrame: @@ -1618,14 +1744,16 @@ def ohlc(self) -> DataFrame: """ return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc")) + @final @doc(DataFrame.describe) def describe(self, **kwargs): - with _group_selection_context(self): + with group_selection_context(self): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T return result.unstack() + @final def resample(self, rule, *args, **kwargs): """ Provide resampling when using a TimeGrouper. @@ -1727,6 +1855,7 @@ def resample(self, rule, *args, **kwargs): return get_resampler_for_grouping(self, rule, *args, **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def rolling(self, *args, **kwargs): @@ -1737,6 +1866,7 @@ def rolling(self, *args, **kwargs): return RollingGroupby(self, *args, **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def expanding(self, *args, **kwargs): @@ -1748,6 +1878,18 @@ def expanding(self, *args, **kwargs): return ExpandingGroupby(self, *args, **kwargs) + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def ewm(self, *args, **kwargs): + """ + Return an ewm grouper, providing ewm functionality per group. + """ + from pandas.core.window import ExponentialMovingWindowGroupby + + return ExponentialMovingWindowGroupby(self, *args, **kwargs) + + @final def _fill(self, direction, limit=None): """ Shared function for `pad` and `backfill` to call Cython method. @@ -1768,8 +1910,8 @@ def _fill(self, direction, limit=None): See Also -------- - pad - backfill + pad : Returns Series with minimum number of char in object. + backfill : Backward fill the missing values in the dataset. """ # Need int value for Cython if limit is None: @@ -1783,8 +1925,10 @@ def _fill(self, direction, limit=None): result_is_index=True, direction=direction, limit=limit, + dropna=self.dropna, ) + @final @Substitution(name="groupby") def pad(self, limit=None): """ @@ -1802,15 +1946,16 @@ def pad(self, limit=None): See Also -------- - Series.pad - DataFrame.pad - Series.fillna - DataFrame.fillna + Series.pad: Returns Series with minimum number of char in object. + DataFrame.pad: Object with missing values filled or None if inplace=True. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. 
""" return self._fill("ffill", limit=limit) ffill = pad + @final @Substitution(name="groupby") def backfill(self, limit=None): """ @@ -1828,15 +1973,16 @@ def backfill(self, limit=None): See Also -------- - Series.backfill - DataFrame.backfill - Series.fillna - DataFrame.fillna + Series.backfill : Backward fill the missing values in the dataset. + DataFrame.backfill: Backward fill the missing values in the dataset. + Series.fillna: Fill NaN values of a Series. + DataFrame.fillna: Fill NaN values of a DataFrame. """ return self._fill("bfill", limit=limit) bfill = backfill + @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame: @@ -1925,29 +2071,31 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - self._set_group_selection() + with group_selection_context(self): - mask_left = np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) - mask = mask_left | mask_right + mask_left = np.in1d(self._cumcount_array(), nth_array) + mask_right = np.in1d( + self._cumcount_array(ascending=False) + 1, -nth_array + ) + mask = mask_left | mask_right - ids, _, _ = self.grouper.group_info + ids, _, _ = self.grouper.group_info - # Drop NA values in grouping - mask = mask & (ids != -1) + # Drop NA values in grouping + mask = mask & (ids != -1) - out = self._selected_obj[mask] - if not self.as_index: - return out + out = self._selected_obj[mask] + if not self.as_index: + return out - result_index = self.grouper.result_index - out.index = result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) - out = self._reindex_output(out) - return out.sort_index() if self.sort else out + out = self._reindex_output(out) + return out.sort_index() if self.sort else out # dropna is truthy if isinstance(n, valid_containers): @@ -2008,6 +2156,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra return result + @final def quantile(self, q=0.5, interpolation: str = "linear"): """ Return group values at the given quantile, a la numpy.percentile. @@ -2060,6 +2209,9 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: elif is_datetime64_dtype(vals.dtype): inference = "datetime64[ns]" vals = np.asarray(vals).astype(float) + elif is_timedelta64_dtype(vals.dtype): + inference = "timedelta64[ns]" + vals = np.asarray(vals).astype(float) return vals, inference @@ -2102,30 +2254,38 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: ) for qi in q ] - result = concat(results, axis=0, keys=q) + result = concat(results, axis=self.axis, keys=q) # fix levels to place quantiles on the inside # TODO(GH-10710): Ideally, we could write this as # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] # but this hits https://github.com/pandas-dev/pandas/issues/10710 # which doesn't reorder the list-like `q` on the inner level. 
- order = list(range(1, result.index.nlevels)) + [0] + order = list(range(1, result.axes[self.axis].nlevels)) + [0] # temporarily saves the index names - index_names = np.array(result.index.names) + index_names = np.array(result.axes[self.axis].names) # set index names to positions to avoid confusion - result.index.names = np.arange(len(index_names)) + result.axes[self.axis].names = np.arange(len(index_names)) # place quantiles on the inside - result = result.reorder_levels(order) + if isinstance(result, Series): + result = result.reorder_levels(order) + else: + result = result.reorder_levels(order, axis=self.axis) # restore the index names in order - result.index.names = index_names[order] + result.axes[self.axis].names = index_names[order] # reorder rows to keep things sorted - indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten() - return result.take(indices) + indices = ( + np.arange(result.shape[self.axis]) + .reshape([len(q), self.ngroups]) + .T.flatten() + ) + return result.take(indices, axis=self.axis) + @final @Substitution(name="groupby") def ngroup(self, ascending: bool = True): """ @@ -2186,13 +2346,14 @@ def ngroup(self, ascending: bool = True): 5 0 dtype: int64 """ - with _group_selection_context(self): + with group_selection_context(self): index = self._selected_obj.index result = self._obj_1d_constructor(self.grouper.group_info[0], index) if not ascending: result = self.ngroups - 1 - result return result + @final @Substitution(name="groupby") def cumcount(self, ascending: bool = True): """ @@ -2247,11 +2408,12 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - with _group_selection_context(self): - index = self._selected_obj.index + with group_selection_context(self): + index = self._selected_obj._get_axis(self.axis) cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) + @final @Substitution(name="groupby") @Appender(_common_see_also) def rank( @@ -2301,6 +2463,7 @@ def rank( axis=axis, ) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cumprod(self, axis=0, *args, **kwargs): @@ -2317,6 +2480,7 @@ def cumprod(self, axis=0, *args, **kwargs): return self._cython_transform("cumprod", **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cumsum(self, axis=0, *args, **kwargs): @@ -2333,6 +2497,7 @@ def cumsum(self, axis=0, *args, **kwargs): return self._cython_transform("cumsum", **kwargs) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cummin(self, axis=0, **kwargs): @@ -2348,6 +2513,7 @@ def cummin(self, axis=0, **kwargs): return self._cython_transform("cummin", numeric_only=False) + @final @Substitution(name="groupby") @Appender(_common_see_also) def cummax(self, axis=0, **kwargs): @@ -2363,6 +2529,7 @@ def cummax(self, axis=0, **kwargs): return self._cython_transform("cummax", numeric_only=False) + @final def _get_cythonized_result( self, how: str, @@ -2400,7 +2567,7 @@ def _get_cythonized_result( signature needs_2d : bool, default False Whether the values and result of the Cython call signature - are at least 2-dimensional. + are 2-dimensional. 
         min_count : int, default None
             When not None, min_count for the Cython call
         needs_mask : bool, default False
@@ -2416,7 +2583,9 @@ def _get_cythonized_result(
             Function should return a tuple where the first element is the
             values to be passed to Cython and the second element is an optional
             type which the values should be converted to after being returned
-            by the Cython operation. Raises if `needs_values` is False.
+            by the Cython operation. This function is also responsible for
+            raising a TypeError if the values have an invalid type. Raises
+            if `needs_values` is False.
         post_processing : function, default None
             Function to be applied to result of Cython function. Should accept
             an array of values as the first argument and type inferences as its
@@ -2431,9 +2600,8 @@ def _get_cythonized_result(
         """
         if result_is_index and aggregate:
             raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
-        if post_processing:
-            if not callable(post_processing):
-                raise ValueError("'post_processing' must be a callable!")
+        if post_processing and not callable(post_processing):
+            raise ValueError("'post_processing' must be a callable!")
         if pre_processing:
             if not callable(pre_processing):
                 raise ValueError("'pre_processing' must be a callable!")
@@ -2448,6 +2616,7 @@ def _get_cythonized_result(
         output: Dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)

+        error_msg = ""
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
             values = obj._values
@@ -2474,10 +2643,14 @@ def _get_cythonized_result(
             if needs_values:
                 vals = values
                 if pre_processing:
-                    vals, inferences = pre_processing(vals)
+                    try:
+                        vals, inferences = pre_processing(vals)
+                    except TypeError as e:
+                        error_msg = str(e)
+                        continue
+                vals = vals.astype(cython_dtype, copy=False)
                 if needs_2d:
                     vals = vals.reshape((-1, 1))
-                vals = vals.astype(cython_dtype, copy=False)
                 func = partial(func, vals)

             func = partial(func, labels)
@@ -2506,11 +2679,16 @@ def _get_cythonized_result(
             key = base.OutputKey(label=name, position=idx)
             output[key] = result

+        # error_msg is "" on a frame/series with no rows or columns
+        if not output and error_msg != "":
+            raise TypeError(error_msg)
+
         if aggregate:
-            return self._wrap_aggregated_output(output)
+            return self._wrap_aggregated_output(output, index=self.grouper.result_index)
         else:
             return self._wrap_transformed_output(output)

+    @final
     @Substitution(name="groupby")
     def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """
@@ -2554,6 +2732,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
             periods=periods,
         )

+    @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
     def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
@@ -2579,10 +2758,11 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0
             fill_method = "pad"
             limit = 0
         filled = getattr(self, fill_method)(limit=limit)
-        fill_grp = filled.groupby(self.grouper.codes)
-        shifted = fill_grp.shift(periods=periods, freq=freq)
+        fill_grp = filled.groupby(self.grouper.codes, axis=self.axis)
+        shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
         return (filled / shifted) - 1

+    @final
     @Substitution(name="groupby")
     @Substitution(see_also=_common_see_also)
     def head(self, n=5):
@@ -2615,8 +2795,12 @@ def head(self, n=5):
         """
         self._reset_group_selection()
         mask = self._cumcount_array() < n
-        return self._selected_obj[mask]
+        if self.axis == 0:
+            return self._selected_obj[mask]
+        else:
+            return self._selected_obj.iloc[:, mask]

+    @final
@Substitution(name="groupby") @Substitution(see_also=_common_see_also) def tail(self, n=5): @@ -2649,8 +2833,12 @@ def tail(self, n=5): """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n - return self._selected_obj[mask] + if self.axis == 0: + return self._selected_obj[mask] + else: + return self._selected_obj.iloc[:, mask] + @final def _reindex_output( self, output: OutputFrameOrSeries, fill_value: Scalar = np.NaN ) -> OutputFrameOrSeries: @@ -2737,6 +2925,7 @@ def _reindex_output( return output.reset_index(drop=True) + @final def sample( self, n: Optional[int] = None, diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 67003dffb90bb..d814a7cee436e 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,12 +2,12 @@ Provide user facing operators for doing the split part of the split-apply-combine paradigm. """ -from typing import Dict, Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Set, Tuple import warnings import numpy as np -from pandas._typing import FrameOrSeries +from pandas._typing import FrameOrSeries, Label, final from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -18,7 +18,6 @@ is_scalar, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ABCSeries import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray @@ -99,6 +98,13 @@ class Grouper: .. versionadded:: 1.1.0 + dropna : bool, default True + If True, and if group keys contain NA values, NA values together with + row/column will be dropped. If False, NA values will also be treated as + the key in groups. + + .. versionadded:: 1.2.0 + Returns ------- A specification for a groupby instruction @@ -237,7 +243,6 @@ def __new__(cls, *args, **kwargs): # core/groupby/grouper.py::Grouper # raising these warnings from TimeGrouper directly would fail the test: # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base - # hacky way to set the stacklevel: if cls is TimeGrouper it means # that the call comes from a pandas internal call of resample, # otherwise it comes from pd.Grouper @@ -284,6 +289,7 @@ def __init__( self._grouper = None self.dropna = dropna + @final @property def ax(self): return self.grouper @@ -301,7 +307,10 @@ def _get_grouper(self, obj, validate: bool = True): a tuple of binner, grouper, obj (possibly sorted) """ self._set_grouper(obj) - self.grouper, _, self.obj = get_grouper( + # pandas\core\groupby\grouper.py:310: error: Value of type variable + # "FrameOrSeries" of "get_grouper" cannot be "Optional[Any]" + # [type-var] + self.grouper, _, self.obj = get_grouper( # type: ignore[type-var] self.obj, [self.key], axis=self.axis, @@ -312,6 +321,7 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj + @final def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper @@ -336,10 +346,10 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): if self.key is not None: key = self.key # The 'on' is already defined - if getattr(self.grouper, "name", None) == key and isinstance( - obj, ABCSeries - ): - ax = self._grouper.take(obj.index) + if getattr(self.grouper, "name", None) == key and isinstance(obj, Series): + # pandas\core\groupby\grouper.py:348: error: Item "None" of + # "Optional[Any]" has no attribute "take" [union-attr] + ax = 
self._grouper.take(obj.index) # type: ignore[union-attr] else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") @@ -371,10 +381,14 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): self.grouper = ax return self.grouper + @final @property def groups(self): - return self.grouper.groups + # pandas\core\groupby\grouper.py:382: error: Item "None" of + # "Optional[Any]" has no attribute "groups" [union-attr] + return self.grouper.groups # type: ignore[union-attr] + @final def __repr__(self) -> str: attrs_list = ( f"{attr_name}={repr(getattr(self, attr_name))}" @@ -386,6 +400,7 @@ def __repr__(self) -> str: return f"{cls_name}({attrs})" +@final class Grouping: """ Holds the grouping information for a single key @@ -394,7 +409,7 @@ class Grouping: ---------- index : Index grouper : - obj Union[DataFrame, Series]: + obj : DataFrame or Series name : Label level : observed : bool, default False @@ -557,8 +572,13 @@ def indices(self): if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices - values = Categorical(self.grouper) - return values._reverse_indexer() + # Return a dictionary of {group label: [indices belonging to the group label]} + # respecting whether sort was specified + codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) + return { + category: np.flatnonzero(codes == i) + for i, category in enumerate(Index(uniques)) + } @property def codes(self) -> np.ndarray: @@ -569,7 +589,9 @@ def codes(self) -> np.ndarray: @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: - return recode_from_groupby(self.all_grouper, self.sort, self.group_index) + group_idx = self.group_index + assert isinstance(group_idx, CategoricalIndex) # set in __init__ + return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index @property @@ -580,18 +602,25 @@ def group_index(self) -> Index: return self._group_index def _make_codes(self) -> None: - if self._codes is None or self._group_index is None: - # we have a list of groupers - if isinstance(self.grouper, ops.BaseGrouper): - codes = self.grouper.codes_info - uniques = self.grouper.result_index + if self._codes is not None and self._group_index is not None: + return + + # we have a list of groupers + if isinstance(self.grouper, ops.BaseGrouper): + codes = self.grouper.codes_info + uniques = self.grouper.result_index + else: + # GH35667, replace dropna=False with na_sentinel=None + if not self.dropna: + na_sentinel = None else: - codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, dropna=self.dropna - ) - uniques = Index(uniques, name=self.name) - self._codes = codes - self._group_index = uniques + na_sentinel = -1 + codes, uniques = algorithms.factorize( + self.grouper, sort=self.sort, na_sentinel=na_sentinel + ) + uniques = Index(uniques, name=self.name) + self._codes = codes + self._group_index = uniques @cache_readonly def groups(self) -> Dict[Hashable, np.ndarray]: @@ -608,7 +637,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": +) -> Tuple["ops.BaseGrouper", Set[Label], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. 
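The `Grouping.indices` rewrite above swaps `Categorical._reverse_indexer` for a `factorize` pass so that the label-to-positions mapping respects the `sort` flag. The same mapping, sketched with public APIs (`keys` is an arbitrary example array):

```python
import numpy as np
import pandas as pd

keys = np.array(["b", "a", "b", "c", "a"])

# factorize(sort=True) mirrors sort=self.sort in the diff; flatnonzero
# collects the positions belonging to each group label.
codes, uniques = pd.factorize(keys, sort=True)
indices = {
    label: np.flatnonzero(codes == i) for i, label in enumerate(pd.Index(uniques))
}
print(indices)  # {'a': array([1, 4]), 'b': array([0, 2]), 'c': array([3])}
```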
@@ -684,13 +713,13 @@ def get_grouper( if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: - return grouper, [], obj + return grouper, set(), obj else: - return grouper, [key.key], obj + return grouper, {key.key}, obj # already have a BaseGrouper, just return it elif isinstance(key, ops.BaseGrouper): - return key, [], obj + return key, set(), obj if not isinstance(key, list): keys = [key] @@ -733,7 +762,7 @@ def get_grouper( levels = [level] * len(keys) groupings: List[Grouping] = [] - exclusions: List[Hashable] = [] + exclusions: Set[Label] = set() # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: @@ -754,30 +783,30 @@ def is_in_obj(gpr) -> bool: return False try: return gpr is obj[gpr.name] - except (KeyError, IndexError, ValueError): - # TODO: ValueError: Given date string not likely a datetime. - # should be KeyError? + except (KeyError, IndexError): + # IndexError reached in e.g. test_skip_group_keys when we pass + # lambda here return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name - exclusions.append(name) + exclusions.add(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] - exclusions.append(name) + exclusions.add(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions - exclusions.append(gpr.key) + exclusions.add(gpr.key) in_axis, name = False, None else: in_axis, name = False, None @@ -814,7 +843,9 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + grouper = ops.BaseGrouper( + group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna + ) return grouper, exclusions, obj diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py new file mode 100644 index 0000000000000..76f50f1387196 --- /dev/null +++ b/pandas/core/groupby/numba_.py @@ -0,0 +1,178 @@ +"""Common utilities for Numba operations with groupby ops""" +import inspect +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + NumbaUtilError, + get_jit_arguments, + jit_user_function, +) + + +def validate_udf(func: Callable) -> None: + """ + Validate user defined function for ops when using Numba with groupby ops. + + The first signature arguments should include: + + def f(values, index, ...): + ... 
+
+    Parameters
+    ----------
+    func : function
+        user defined function
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    NumbaUtilError
+    """
+    udf_signature = list(inspect.signature(func).parameters.keys())
+    expected_args = ["values", "index"]
+    min_number_args = len(expected_args)
+    if (
+        len(udf_signature) < min_number_args
+        or udf_signature[:min_number_args] != expected_args
+    ):
+        raise NumbaUtilError(
+            f"The first {min_number_args} arguments to {func.__name__} must be "
+            f"{expected_args}"
+        )
+
+
+def generate_numba_agg_func(
+    args: Tuple,
+    kwargs: Dict[str, Any],
+    func: Callable[..., Scalar],
+    engine_kwargs: Optional[Dict[str, bool]],
+) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]:
+    """
+    Generate a numba jitted agg function specified by values from engine_kwargs.
+
+    1. jit the user's function
+    2. Return a groupby agg function with the jitted function inline
+
+    Configurations specified in engine_kwargs apply to both the user's
+    function _AND_ the groupby evaluation loop.
+
+    Parameters
+    ----------
+    args : tuple
+        *args to be passed into the function
+    kwargs : dict
+        **kwargs to be passed into the function
+    func : function
+        function to be applied to each group and will be JITed
+    engine_kwargs : dict
+        dictionary of arguments to be passed into numba.jit
+
+    Returns
+    -------
+    Numba function
+    """
+    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)
+
+    validate_udf(func)
+    cache_key = (func, "groupby_agg")
+    if cache_key in NUMBA_FUNC_CACHE:
+        return NUMBA_FUNC_CACHE[cache_key]
+
+    numba_func = jit_user_function(func, nopython, nogil, parallel)
+    numba = import_optional_dependency("numba")
+    if parallel:
+        loop_range = numba.prange
+    else:
+        loop_range = range
+
+    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
+    def group_agg(
+        values: np.ndarray,
+        index: np.ndarray,
+        begin: np.ndarray,
+        end: np.ndarray,
+        num_groups: int,
+        num_columns: int,
+    ) -> np.ndarray:
+        result = np.empty((num_groups, num_columns))
+        for i in loop_range(num_groups):
+            group_index = index[begin[i] : end[i]]
+            for j in loop_range(num_columns):
+                group = values[begin[i] : end[i], j]
+                result[i, j] = numba_func(group, group_index, *args)
+        return result
+
+    return group_agg
+
+
+def generate_numba_transform_func(
+    args: Tuple,
+    kwargs: Dict[str, Any],
+    func: Callable[..., np.ndarray],
+    engine_kwargs: Optional[Dict[str, bool]],
+) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]:
+    """
+    Generate a numba jitted transform function specified by values from engine_kwargs.
+
+    1. jit the user's function
+    2. Return a groupby transform function with the jitted function inline
+
+    Configurations specified in engine_kwargs apply to both the user's
+    function _AND_ the groupby evaluation loop.
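`generate_numba_agg_func` above jits a user function whose first two parameters must be `values` and `index` (enforced by `validate_udf`), then caches the kernel under `(func, "groupby_agg")`. A usage sketch of the public entry point, assuming numba is installed (`mean_udf` is a hypothetical example function):

```python
import pandas as pd

def mean_udf(values, index):
    # values: 1d ndarray with one group's data; index: its positions.
    total = 0.0
    for v in values:
        total += v
    return total / len(values)

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 2.0, 4.0]})

# The first call compiles and caches the kernel; later calls with the
# same function reuse it via NUMBA_FUNC_CACHE.
print(df.groupby("key").agg(mean_udf, engine="numba"))
```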
+
+    Parameters
+    ----------
+    args : tuple
+        *args to be passed into the function
+    kwargs : dict
+        **kwargs to be passed into the function
+    func : function
+        function to be applied to each group and will be JITed
+    engine_kwargs : dict
+        dictionary of arguments to be passed into numba.jit
+
+    Returns
+    -------
+    Numba function
+    """
+    nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs)
+
+    validate_udf(func)
+    cache_key = (func, "groupby_transform")
+    if cache_key in NUMBA_FUNC_CACHE:
+        return NUMBA_FUNC_CACHE[cache_key]
+
+    numba_func = jit_user_function(func, nopython, nogil, parallel)
+    numba = import_optional_dependency("numba")
+    if parallel:
+        loop_range = numba.prange
+    else:
+        loop_range = range
+
+    @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
+    def group_transform(
+        values: np.ndarray,
+        index: np.ndarray,
+        begin: np.ndarray,
+        end: np.ndarray,
+        num_groups: int,
+        num_columns: int,
+    ) -> np.ndarray:
+        result = np.empty((len(values), num_columns))
+        for i in loop_range(num_groups):
+            group_index = index[begin[i] : end[i]]
+            for j in loop_range(num_columns):
+                group = values[begin[i] : end[i], j]
+                result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
+        return result
+
+    return group_transform
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 74db87f46c5e2..7724e3930f7df 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -7,19 +7,34 @@
 """
 import collections
-from typing import List, Optional, Sequence, Tuple, Type
+from typing import (
+    Dict,
+    Generic,
+    Hashable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+)

 import numpy as np

 from pandas._libs import NaT, iNaT, lib
 import pandas._libs.groupby as libgroupby
 import pandas._libs.reduction as libreduction
-from pandas._typing import F, FrameOrSeries, Label
+from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape, final
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly

-from pandas.core.dtypes.cast import maybe_cast_result
+from pandas.core.dtypes.cast import (
+    maybe_cast_result,
+    maybe_cast_result_dtype,
+    maybe_downcast_to_dtype,
+)
 from pandas.core.dtypes.common import (
+    ensure_float,
     ensure_float64,
     ensure_int64,
     ensure_int_or_float,
@@ -30,6 +45,7 @@
     is_datetime64_any_dtype,
     is_datetime64tz_dtype,
     is_extension_array_dtype,
+    is_float_dtype,
     is_integer_dtype,
     is_numeric_dtype,
     is_period_dtype,
@@ -37,7 +53,7 @@
     is_timedelta64_dtype,
     needs_i8_conversion,
 )
-from pandas.core.dtypes.missing import _maybe_fill, isna
+from pandas.core.dtypes.missing import isna, maybe_fill

 import pandas.core.algorithms as algorithms
 from pandas.core.base import SelectionMixin
@@ -50,16 +66,11 @@
 from pandas.core.sorting import (
     compress_group_index,
     decons_obs_group_ids,
-    get_flattened_iterator,
+    get_flattened_list,
     get_group_index,
     get_group_index_sorter,
     get_indexer_dict,
 )
-from pandas.core.util.numba_ import (
-    NUMBA_FUNC_CACHE,
-    generate_numba_func,
-    split_for_numba,
-)


 class BaseGrouper:
@@ -87,11 +98,12 @@ class BaseGrouper:
     def __init__(
         self,
         axis: Index,
-        groupings: "Sequence[grouper.Grouping]",
+        groupings: Sequence["grouper.Grouping"],
         sort: bool = True,
         group_keys: bool = True,
         mutated: bool = False,
         indexer: Optional[np.ndarray] = None,
+        dropna: bool = True,
     ):
         assert isinstance(axis, Index), axis
@@ -102,13 +114,14 @@ def __init__(
         self.group_keys = group_keys
         self.mutated = mutated
         self.indexer = indexer
+        self.dropna = dropna

     @property
     def
groupings(self) -> List["grouper.Grouping"]: return self._groupings @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: return tuple(ping.ngroups for ping in self.groupings) def __iter__(self): @@ -118,7 +131,9 @@ def __iter__(self): def nkeys(self) -> int: return len(self.groupings) - def get_iterator(self, data: FrameOrSeries, axis: int = 0): + def get_iterator( + self, data: FrameOrSeries, axis: int = 0 + ) -> Iterator[Tuple[Label, FrameOrSeries]]: """ Groupby iterator @@ -130,9 +145,17 @@ def get_iterator(self, data: FrameOrSeries, axis: int = 0): splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() for key, (i, group) in zip(keys, splitter): - yield key, group + yield key, group.__finalize__(data, method="groupby") + @final def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": + """ + Returns + ------- + Generator yielding subsetted objects + + __finalize__ has not been called for the subsetted objects returned. + """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -145,6 +168,7 @@ def _get_grouper(self): """ return self.groupings[0].grouper + @final def _get_group_keys(self): if len(self.groupings) == 1: return self.levels[0] @@ -152,8 +176,9 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) + return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) + @final def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) @@ -210,7 +235,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): # group might be modified group_axes = group.axes res = f(group) - if not _is_indexed_like(res, group_axes): + if not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) @@ -219,12 +244,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ - if len(self.groupings) == 1: - return self.groupings[0].indices - else: - codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + codes_list = [ping.codes for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return get_indexer_dict(codes_list, keys) @property def codes(self) -> List[np.ndarray]: @@ -238,6 +260,7 @@ def levels(self) -> List[Index]: def names(self) -> List[Label]: return [ping.name for ping in self.groupings] + @final def size(self) -> Series: """ Compute group sizes. 
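`get_iterator` above yields `(group key, subsetted object)` pairs and now runs each piece through `__finalize__`; it is the machinery behind plain iteration over a GroupBy:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]})

# Each yielded `group` is a DataFrame holding one group's rows, produced
# internally by BaseGrouper.get_iterator.
for key, group in df.groupby("key"):
    print(key, len(group))
# a 2
# b 1
```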
@@ -251,7 +274,7 @@ def size(self) -> Series: return Series(out, index=self.result_index, dtype="int64") @cache_readonly - def groups(self): + def groups(self) -> Dict[Hashable, np.ndarray]: """ dict {group name -> group labels} """ if len(self.groupings) == 1: return self.groupings[0].groups @@ -260,6 +283,7 @@ def groups(self): to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby) + @final @cache_readonly def is_monotonic(self) -> bool: # return if my group orderings are monotonic @@ -273,6 +297,7 @@ def group_info(self): comp_ids = ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups + @final @cache_readonly def codes_info(self) -> np.ndarray: # return the codes of items in original grouped axis @@ -282,6 +307,7 @@ def codes_info(self) -> np.ndarray: codes = codes[sorter] return codes + @final def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: all_codes = self.codes if len(all_codes) > 1: @@ -291,6 +317,7 @@ def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index)) + @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @@ -308,11 +335,11 @@ def result_index(self) -> Index: codes = self.reconstructed_codes levels = [ping.result_index for ping in self.groupings] - result = MultiIndex( + return MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names ) - return result + @final def get_group_levels(self) -> List[Index]: if not self.compressed and len(self.groupings) == 1: return [self.groupings[0].result_index] @@ -353,8 +380,7 @@ def get_group_levels(self) -> List[Index]: _cython_arity = {"ohlc": 4} # OHLC - _name_functions = {"ohlc": ["open", "high", "low", "close"]} - + @final def _is_builtin_func(self, arg): """ if we define a builtin function for this argument, return it, @@ -362,6 +388,7 @@ def _is_builtin_func(self, arg): """ return SelectionMixin._builtin_table.get(arg, arg) + @final def _get_cython_function( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -398,6 +425,7 @@ def _get_cython_function( return func + @final def _get_cython_func_and_vals( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -432,17 +460,94 @@ def _get_cython_func_and_vals( raise return func, values - def _cython_operation( + @final + def _disallow_invalid_ops(self, values: ArrayLike, how: str): + """ + Check if we can do this operation with our cython functions. + + Raises + ------ + NotImplementedError + This is either not a valid function for this dtype, or + valid but not implemented in cython. + """ + dtype = values.dtype + + if is_categorical_dtype(dtype) or is_sparse(dtype): + # categoricals are only 1d, so we + # are not setup for dim transforming + raise NotImplementedError(f"{dtype} dtype not supported") + elif is_datetime64_any_dtype(dtype): + # we raise NotImplemented if this is an invalid operation + # entirely, e.g. adding datetimes + if how in ["add", "prod", "cumsum", "cumprod"]: + raise NotImplementedError( + f"datetime64 type does not support {how} operations" + ) + elif is_timedelta64_dtype(dtype): + if how in ["prod", "cumprod"]: + raise NotImplementedError( + f"timedelta64 type does not support {how} operations" + ) + + @final + def _ea_wrap_cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ - Returns the values of a cython operation as a Tuple of [data, names]. 
+ If we have an ExtensionArray, unwrap, call _cython_operation, and + re-wrap if appropriate. + """ + # TODO: general case implementation overrideable by EAs. + orig_values = values - Names is only useful when dealing with 2D results, like ohlc - (see self._name_functions). + if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): + # All of the functions implemented here are ordinal, so we can + # operate on the tz-naive equivalents + values = values.view("M8[ns]") + res_values = self._cython_operation( + kind, values, how, axis, min_count, **kwargs + ) + if how in ["rank"]: + # preserve float64 dtype + return res_values + + res_values = res_values.astype("i8", copy=False) + result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype) + return result + + elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): + # IntegerArray or BooleanArray + values = ensure_int_or_float(values) + res_values = self._cython_operation( + kind, values, how, axis, min_count, **kwargs + ) + dtype = maybe_cast_result_dtype(orig_values.dtype, how) + if is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + return cls._from_sequence(res_values, dtype=dtype) + return res_values + + elif is_float_dtype(values.dtype): + # FloatingArray + values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) + res_values = self._cython_operation( + kind, values, how, axis, min_count, **kwargs + ) + result = type(orig_values)._from_sequence(res_values) + return result + + raise NotImplementedError(values.dtype) + + @final + def _cython_operation( + self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs + ) -> np.ndarray: + """ + Returns the values of a cython operation. """ - assert kind in ["transform", "aggregate"] orig_values = values + assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") @@ -453,30 +558,12 @@ def _cython_operation( # can we do this operation with our cython functions # if not raise NotImplementedError + self._disallow_invalid_ops(values, how) - # we raise NotImplemented if this is an invalid operation - # entirely, e.g. adding datetimes - - # categoricals are only 1d, so we - # are not setup for dim transforming - if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): - raise NotImplementedError(f"{values.dtype} dtype not supported") - elif is_datetime64_any_dtype(values.dtype): - if how in ["add", "prod", "cumsum", "cumprod"]: - raise NotImplementedError( - f"datetime64 type does not support {how} operations" - ) - elif is_timedelta64_dtype(values.dtype): - if how in ["prod", "cumprod"]: - raise NotImplementedError( - f"timedelta64 type does not support {how} operations" - ) - - if is_datetime64tz_dtype(values.dtype): - # Cast to naive; we'll cast back at the end of the function - # TODO: possible need to reshape? - # TODO(EA2D):kludge can be avoided when 2D EA is allowed. 
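`_ea_wrap_cython_operation` unwraps an ExtensionArray into something the cython kernels understand, runs `_cython_operation`, and re-wraps the result in the matching extension type. A sketch of the user-visible effect on the nullable-integer path (hypothetical data; assumes nullable dtypes round-trip as in this change):

```python
import pandas as pd

df = pd.DataFrame(
    {"key": ["a", "a", "b"], "val": pd.array([1, 2, None], dtype="Int64")}
)
res = df.groupby("key")["val"].sum()
print(res.dtype)  # Int64: the result is re-wrapped into the extension type
```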
- values = values.view("M8[ns]") + if is_extension_array_dtype(values.dtype): + return self._ea_wrap_cython_operation( + kind, values, how, axis, min_count, **kwargs + ) is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) @@ -494,7 +581,7 @@ def _cython_operation( else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): - values = ensure_float64(values) + values = ensure_float64(ensure_float(values)) else: values = values.astype(object) @@ -529,13 +616,11 @@ def _cython_operation( codes, _, _ = self.group_info if kind == "aggregate": - result = _maybe_fill( - np.empty(out_shape, dtype=out_dtype), fill_value=np.nan - ) + result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": - result = _maybe_fill( + result = maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) @@ -557,48 +642,30 @@ def _cython_operation( if vdim == 1 and arity == 1: result = result[:, 0] - names: Optional[List[str]] = self._name_functions.get(how, None) - if swapped: result = result.swapaxes(0, axis) - if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( - orig_values.dtype - ): - # We need to use the constructors directly for these dtypes - # since numpy won't recognize them - # https://github.com/pandas-dev/pandas/issues/31471 - result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) - elif is_datetimelike and kind == "aggregate": - result = result.astype(orig_values.dtype) - - if is_extension_array_dtype(orig_values.dtype): - result = maybe_cast_result(result=result, obj=orig_values, how=how) + if how not in base.cython_cast_blocklist: + # e.g. if we are int64 and need to restore to datetime64/timedelta64 + # "rank" is the only member of cython_cast_blocklist we get here + dtype = maybe_cast_result_dtype(orig_values.dtype, how) + result = maybe_downcast_to_dtype(result, dtype) - return result, names - - def aggregate( - self, values, how: str, axis: int = 0, min_count: int = -1 - ) -> Tuple[np.ndarray, Optional[List[str]]]: - return self._cython_operation( - "aggregate", values, how, axis, min_count=min_count - ) - - def transform(self, values, how: str, axis: int = 0, **kwargs): - return self._cython_operation("transform", values, how, axis, **kwargs) + return result + @final def _aggregate( - self, result, counts, values, comp_ids, agg_func, min_count: int = -1, + self, result, counts, values, comp_ids, agg_func, min_count: int = -1 ): if agg_func is libgroupby.group_nth: # different signature from the others - # TODO: should we be using min_count instead of hard-coding it? 
- agg_func(result, counts, values, comp_ids, rank=1, min_count=-1) + agg_func(result, counts, values, comp_ids, min_count, rank=1) else: agg_func(result, counts, values, comp_ids, min_count) return result + @final def _transform( self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): @@ -608,22 +675,10 @@ def _transform( return result - def agg_series( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, - ): + def agg_series(self, obj: Series, func: F): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 - if engine == "numba": - return self._aggregate_series_pure_python( - obj, func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) @@ -642,13 +697,14 @@ def agg_series( try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "Function does not reduce" in str(err): + if "Must produce aggregated value" in str(err): # raised in libreduction pass else: raise return self._aggregate_series_pure_python(obj, func) + @final def _aggregate_series_fast(self, obj: Series, func: F): # At this point we have already checked that # - obj.index is not a MultiIndex @@ -668,53 +724,33 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, - ): - - if engine == "numba": - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_agg" - ) - + @final + def _aggregate_series_pure_python(self, obj: Series, func: F): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) - result = None + result = np.empty(ngroups, dtype="O") + initialized = False splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: - if engine == "numba": - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - else: - res = func(group, *args, **kwargs) - - if result is None: - if isinstance(res, (Series, Index, np.ndarray)): - if len(res) == 1: - # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) - # FIXME: are we potentially losing important res.index info? - res = res.item() - else: - raise ValueError("Function does not reduce") - result = np.empty(ngroups, dtype="O") + + # Each step of this loop corresponds to + # libreduction._BaseGrouper._apply_to_group + res = func(group) + res = libreduction.extract_result(res) + + if not initialized: + # We only do this validation on the first iteration + libreduction.check_result_array(res, 0) + initialized = True counts[label] = group.shape[0] result[label] = res - assert result is not None result = lib.maybe_convert_objects(result, try_float=0) - # TODO: maybe_cast_to_extension_array? 
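`agg_series` now validates the UDF's output once via `libreduction.check_result_array` instead of re-checking on every group, and `extract_result` unboxes length-1 Series and arrays. Nothing changes for a well-behaved scalar-producing reducer, e.g.:

```python
import pandas as pd

ser = pd.Series([1.0, 2.0, 3.0], index=["a", "a", "b"])
# a scalar-returning UDF passes the one-time validation; the pure-python
# fallback (_aggregate_series_pure_python) produces the same result
print(ser.groupby(level=0).agg(lambda g: g.max() - g.min()))
```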
+ result = maybe_cast_result(result, obj, numeric_only=True) return result, counts @@ -874,15 +910,7 @@ def groupings(self) -> "List[grouper.Grouping]": for lvl, name in zip(self.levels, self.names) ] - def agg_series( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, - ): + def agg_series(self, obj: Series, func: F): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result @@ -896,13 +924,13 @@ def agg_series( return grouper.get_result() -def _is_indexed_like(obj, axes) -> bool: +def _is_indexed_like(obj, axes, axis: int) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) elif isinstance(obj, DataFrame): - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) return False @@ -911,7 +939,7 @@ def _is_indexed_like(obj, axes) -> bool: # Splitting / application -class DataSplitter: +class DataSplitter(Generic[FrameOrSeries]): def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) @@ -954,7 +982,8 @@ class SeriesSplitter(DataSplitter): def _chop(self, sdata: Series, slice_obj: slice) -> Series: # fastpath equivalent to `sdata.iloc[slice_obj]` mgr = sdata._mgr.get_slice(slice_obj) - return type(sdata)(mgr, name=sdata.name, fastpath=True) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr, name=sdata.name, fastpath=True) class FrameSplitter(DataSplitter): @@ -970,7 +999,8 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: # else: # return sdata.iloc[:, slice_obj] mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) - return type(sdata)(mgr) + # __finalize__ not called here, must be applied by caller if applicable + return sdata._constructor(mgr) def get_splitter( diff --git a/pandas/core/index.py b/pandas/core/index.py index a315b9619b0e7..44f434e038a4b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -19,7 +19,7 @@ ensure_index_from_sequences, get_objs_combined_axis, ) -from pandas.core.indexes.multi import _sparsify # noqa:F401 +from pandas.core.indexes.multi import sparsify_labels # noqa:F401 # GH#30193 warnings.warn( diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index d9aa02db3e42a..da4654bbf2c10 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -79,6 +79,9 @@ def is_scalar_indexer(indexer, ndim: int) -> bool: ------- bool """ + if ndim == 1 and is_integer(indexer): + # GH37748: allow indexer to be an integer for Series + return True if isinstance(indexer, tuple): if len(indexer) == ndim: return all( @@ -105,7 +108,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: return True if arr_value.ndim == 1: if not isinstance(indexer, tuple): - indexer = tuple([indexer]) + indexer = (indexer,) return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) return False @@ -114,7 +117,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: # Indexer Validation -def check_setitem_lengths(indexer, value, values) -> None: +def check_setitem_lengths(indexer, value, values) -> bool: """ Validate that value and indexer are the same length. @@ -133,34 +136,46 @@ def check_setitem_lengths(indexer, value, values) -> None: Returns ------- - None + bool + Whether this is an empty listlike setting which is a no-op. 
Raises ------ ValueError When the indexer is an ndarray or list and the lengths don't match. """ - # boolean with truth values == len of the value is ok too + no_op = False + if isinstance(indexer, (np.ndarray, list)): - if is_list_like(value) and len(indexer) != len(value): - if not ( - isinstance(indexer, np.ndarray) - and indexer.dtype == np.bool_ - and len(indexer[indexer]) == len(value) - ): - raise ValueError( - "cannot set using a list-like indexer " - "with a different length than the value" - ) + # We can ignore other listlikes because they are either + # a) not necessarily 1-D indexers, e.g. tuple + # b) boolean indexers e.g. BoolArray + if is_list_like(value): + if len(indexer) != len(value): + # boolean with truth values == len of the value is ok too + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) + if not len(indexer): + no_op = True elif isinstance(indexer, slice): - # slice - if is_list_like(value) and len(values): + if is_list_like(value): if len(value) != length_of_indexer(indexer, values): raise ValueError( "cannot set using a slice indexer with a " "different length than the value" ) + if not len(value): + no_op = True + + return no_op def validate_indices(indices: np.ndarray, n: int) -> None: diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 881d5ce1fbaab..c97778f98387e 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -24,10 +24,15 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 + from pandas import Series class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): + _hidden_attrs = PandasObject._hidden_attrs | { + "orig", + "name", + } + def __init__(self, data: "Series", orig): if not isinstance(data, ABCSeries): raise TypeError( @@ -78,7 +83,7 @@ def _delegate_property_get(self, name): else: index = self._parent.index # return the result as a Series, which is by definition a copy - result = Series(result, index=index, name=self.name) + result = Series(result, index=index, name=self.name).__finalize__(self._parent) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( @@ -106,7 +111,9 @@ def _delegate_method(self, name, *args, **kwargs): if not is_list_like(result): return result - result = Series(result, index=self._parent.index, name=self.name) + result = Series(result, index=self._parent.index, name=self.name).__finalize__( + self._parent + ) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( @@ -234,8 +241,10 @@ def isocalendar(self): See Also -------- - Timestamp.isocalendar - datetime.date.isocalendar + Timestamp.isocalendar : Function return a 3-tuple containing ISO year, + week number, and weekday for the given Timestamp object. + datetime.date.isocalendar : Return a named tuple object with + three components: year, week and weekday. Examples -------- @@ -324,7 +333,8 @@ def to_pytimedelta(self) -> np.ndarray: See Also -------- - datetime.timedelta + datetime.timedelta : A duration expressing the difference + between two date, time, or datetime. 
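`check_setitem_lengths` now reports empty list-like setters back to the caller as a no-op instead of letting them fall through to the block-setting machinery. A minimal illustration of the intended behavior (hypothetical data; the point is that values and dtype are left untouched):

```python
import pandas as pd

ser = pd.Series([1, 2, 3])
ser.loc[[]] = []  # empty indexer and empty value: recognized as a no-op
print(ser.dtype, ser.tolist())  # int64 [1, 2, 3]
```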
Examples -------- @@ -371,7 +381,11 @@ def components(self): 3 0 0 0 3 0 0 0 4 0 0 0 4 0 0 0 """ - return self._get_values().components.set_index(self._parent.index) + return ( + self._get_values() + .components.set_index(self._parent.index) + .__finalize__(self._parent) + ) @property def freq(self): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 9849742abcfca..18981a2190552 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -4,12 +4,12 @@ from pandas._libs import NaT, lib from pandas.errors import InvalidIndexError -import pandas.core.common as com from pandas.core.indexes.base import ( Index, _new_Index, ensure_index, ensure_index_from_sequences, + get_unanimous_names, ) from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex @@ -57,7 +57,7 @@ "ensure_index_from_sequences", "get_objs_combined_axis", "union_indexes", - "get_consensus_names", + "get_unanimous_names", "all_indexes_same", ] @@ -214,23 +214,16 @@ def conv(i): return result.union_many(indexes[1:]) else: for other in indexes[1:]: - # GH 35092. Index.union expects sort=None instead of sort=True - # to signify that sort=True isn't fully implemented and - # legacy implementation sometimes might not sort (see GH 24959) - # In this case we currently sort in _get_combined_index - if sort: - sort = None - result = result.union(other, sort=sort) + result = result.union(other) return result elif kind == "array": index = indexes[0] - for other in indexes[1:]: - if not index.equals(other): - return _unique_indices(indexes) + if not all(index.equals(other) for other in indexes[1:]): + index = _unique_indices(indexes) - name = get_consensus_names(indexes)[0] + name = get_unanimous_names(*indexes)[0] if name != index.name: - index = index._shallow_copy(name=name) + index = index.rename(name) return index else: # kind='list' return _unique_indices(indexes) @@ -274,45 +267,22 @@ def _sanitize_and_check(indexes): return indexes, "array" -def get_consensus_names(indexes): - """ - Give a consensus 'names' to indexes. - - If there's exactly one non-empty 'names', return this, - otherwise, return empty. - - Parameters - ---------- - indexes : list of Index objects - - Returns - ------- - list - A list representing the consensus 'names' found. - """ - # find the non-none names, need to tupleify to make - # the set hashable, then reverse on return - consensus_names = {tuple(i.names) for i in indexes if com.any_not_none(*i.names)} - if len(consensus_names) == 1: - return list(list(consensus_names)[0]) - return [None] * indexes[0].nlevels - - def all_indexes_same(indexes): """ Determine if all indexes contain the same elements. Parameters ---------- - indexes : list of Index objects + indexes : iterable of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. 
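The accessor hunks above finalize delegated results from the parent Series, so `.dt` outputs keep metadata as well. A short sketch (hypothetical data; assumes a pandas build with this change):

```python
import pandas as pd

ser = pd.Series(pd.date_range("2020-01-01", periods=3))
ser.attrs["unit"] = "event-times"
out = ser.dt.year  # delegated results are now finalized from the parent
print(out.attrs)   # {'unit': 'event-times'}
```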
""" - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f12a2e4c27ea..11b7acc0a9deb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from copy import copy as copy_func from datetime import datetime +from itertools import zip_longest import operator -from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -9,8 +9,14 @@ FrozenSet, Hashable, List, + NewType, Optional, + Sequence, + Set, + Tuple, + TypeVar, Union, + cast, ) import warnings @@ -19,17 +25,15 @@ from pandas._libs import algos as libalgos, index as libindex, lib import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array, no_default -from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp -from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import DtypeObj, Label -from pandas.compat import set_function_name +from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, Shape, final from pandas.compat.numpy import function as nv -from pandas.errors import InvalidIndexError -from pandas.util._decorators import Appender, Substitution, cache_readonly, doc +from pandas.errors import DuplicateLabelError, InvalidIndexError +from pandas.util._decorators import Appender, cache_readonly, doc -from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.cast import ( + find_common_type, maybe_cast_to_integer_array, validate_numeric_casting, ) @@ -37,7 +41,6 @@ ensure_int64, ensure_object, ensure_platform_int, - is_bool, is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, @@ -57,35 +60,34 @@ is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, + needs_i8_conversion, pandas_dtype, + validate_all_hashable, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( - ABCCategorical, - ABCDataFrame, ABCDatetimeIndex, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, - ABCRangeIndex, ABCSeries, ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.core import ops +from pandas.core import missing, ops from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.frozen import FrozenList -import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -97,74 +99,29 @@ ) if TYPE_CHECKING: - from pandas import Series + from pandas import MultiIndex, RangeIndex, Series __all__ = ["Index"] _unsortable_types = frozenset(("mixed", "mixed-integer")) -_index_doc_kwargs = dict( - klass="Index", - inplace="", - target_klass="Index", - raises_section="", 
- unique="Index", - duplicated="np.ndarray", -) -_index_shared_docs = dict() +_index_doc_kwargs = { + "klass": "Index", + "inplace": "", + "target_klass": "Index", + "raises_section": "", + "unique": "Index", + "duplicated": "np.ndarray", +} +_index_shared_docs = {} str_t = str -def _make_comparison_op(op, cls): - def cmp_method(self, other): - if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): - if other.ndim > 0 and len(self) != len(other): - raise ValueError("Lengths must match to compare") - - if is_object_dtype(self.dtype) and isinstance(other, ABCCategorical): - left = type(other)(self._values, dtype=other.dtype) - return op(left, other) - elif is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): - # e.g. PeriodArray - with np.errstate(all="ignore"): - result = op(self._values, other) - - elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex): - # don't pass MultiIndex - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) - - else: - with np.errstate(all="ignore"): - result = op(self._values, np.asarray(other)) - - if is_bool_dtype(result): - return result - return ops.invalid_comparison(self, other, op) - - name = f"__{op.__name__}__" - return set_function_name(cmp_method, name, cls) - - -def _make_arithmetic_op(op, cls): - def index_arithmetic_method(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame, ABCTimedeltaIndex)): - return NotImplemented - - from pandas import Series - - result = op(Series(self), other) - if isinstance(result, tuple): - return (Index(result[0]), Index(result[1])) - return Index(result) - - name = f"__{op.__name__}__" - return set_function_name(index_arithmetic_method, name, cls) +_o_dtype = np.dtype(object) -_o_dtype = np.dtype(object) -_Identity = object +_Identity = NewType("_Identity", object) def _new_Index(cls, d): @@ -187,9 +144,12 @@ def _new_Index(cls, d): return cls.__new__(cls, **d) +_IndexT = TypeVar("_IndexT", bound="Index") + + class Index(IndexOpsMixin, PandasObject): """ - Immutable ndarray implementing an ordered, sliceable set. The basic object + Immutable sequence used for indexing and alignment. The basic object storing axis labels for all pandas objects. Parameters @@ -233,9 +193,9 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations: FrozenSet[str] = ( - PandasObject._deprecations - | IndexOpsMixin._deprecations + _hidden_attrs: FrozenSet[str] = ( + PandasObject._hidden_attrs + | IndexOpsMixin._hidden_attrs | frozenset(["contains", "set_value"]) ) @@ -260,7 +220,7 @@ def _outer_indexer(self, left, right): _typ = "index" _data: Union[ExtensionArray, np.ndarray] - _id = None + _id: Optional[_Identity] = None _name: Label = None # MultiIndex.levels previously allowed setting the index name. We # don't allow this anymore, and raise if it happens rather than @@ -270,6 +230,7 @@ def _outer_indexer(self, left, right): _attributes = ["name"] _is_numeric_dtype = False _can_hold_na = True + _can_hold_strings = True # would we like our indexing holder to defer to us _defer_to_indexing = False @@ -454,6 +415,11 @@ def asi8(self): ndarray An ndarray with int64 dtype. 
""" + warnings.warn( + "Index.asi8 is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) return None @classmethod @@ -475,16 +441,66 @@ def _simple_new(cls, values, name: Label = None): result._index_data = values result._name = name result._cache = {} + result._reset_identity() - return result._reset_identity() + return result @cache_readonly def _constructor(self): return type(self) + @final + def _maybe_check_unique(self): + """ + Check that an Index has no duplicates. + + This is typically only called via + `NDFrame.flags.allows_duplicate_labels.setter` when it's set to + True (duplicates aren't allowed). + + Raises + ------ + DuplicateLabelError + When the index is not unique. + """ + if not self.is_unique: + msg = """Index has duplicates.""" + duplicates = self._format_duplicate_message() + msg += f"\n{duplicates}" + + raise DuplicateLabelError(msg) + + @final + def _format_duplicate_message(self): + """ + Construct the DataFrame for a DuplicateLabelError. + + This returns a DataFrame indicating the labels and positions + of duplicates in an index. This should only be called when it's + already known that duplicates are present. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx._format_duplicate_message() + positions + label + a [0, 2] + """ + from pandas import Series + + duplicates = self[self.duplicated(keep="first")].unique() + assert len(duplicates) + + out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + if self.nlevels == 1: + out = out.rename_axis("label") + return out.to_frame(name="positions") + # -------------------------------------------------------------------- # Index Internals Methods + @final def _get_attributes_dict(self): """ Return an attributes dict for my class. @@ -505,14 +521,15 @@ def _shallow_copy(self, values=None, name: Label = no_default): name : Label, defaults to self.name """ name = self.name if name is no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._values - result = self._simple_new(values, name=name) - result._cache = cache + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._values, name=name) + result._cache = self._cache return result + @final def is_(self, other) -> bool: """ More flexible, faster check like ``is`` but that works through views. @@ -534,16 +551,23 @@ def is_(self, other) -> bool: -------- Index.identical : Works like ``Index.is_`` but also checks metadata. """ - # use something other than None to be clearer - return self._id is getattr(other, "_id", Ellipsis) and self._id is not None + if self is other: + return True + elif not hasattr(other, "_id"): + return False + elif self._id is None or other._id is None: + return False + else: + return self._id is other._id - def _reset_identity(self): + @final + def _reset_identity(self) -> None: """ Initializes or resets ``_id`` attribute with new object. """ - self._id = _Identity() - return self + self._id = _Identity(object()) + @final def _cleanup(self): self._engine.clear_mapping() @@ -556,6 +580,19 @@ def _engine(self): target_values = self._get_engine_target() return self._engine_type(lambda: target_values, len(self)) + @cache_readonly + def _dir_additions_for_owner(self) -> Set[str_t]: + """ + Add the string-like labels to the owner dataframe/series dir output. + + If this is a MultiIndex, it's first level values are used. 
+ """ + return { + c + for c in self.unique(level=0)[:100] + if isinstance(c, str) and c.isidentifier() + } + # -------------------------------------------------------------------- # Array-Like Methods @@ -574,7 +611,7 @@ def __array__(self, dtype=None) -> np.ndarray: def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc. + Gets called after a ufunc and other functions. """ result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: @@ -590,6 +627,7 @@ def dtype(self): """ return self._data.dtype + @final def ravel(self, order="C"): """ Return an ndarray of the flattened values of the underlying data. @@ -601,8 +639,14 @@ def ravel(self, order="C"): See Also -------- - numpy.ndarray.ravel + numpy.ndarray.ravel : Return a flattened array. """ + warnings.warn( + "Index.ravel returning ndarray is deprecated; in a future version " + "this will return a view on self.", + FutureWarning, + stacklevel=2, + ) values = self._get_engine_target() return values.ravel(order=order) @@ -623,7 +667,7 @@ def astype(self, dtype, copy=True): Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is - impossible, a ValueError exception is raised. + impossible, a TypeError exception is raised. Parameters ---------- @@ -691,52 +735,45 @@ def astype(self, dtype, copy=True): See Also -------- - numpy.ndarray.take + numpy.ndarray.take: Return an array formed from the + elements of a at the given indices. """ @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if kwargs: - nv.validate_take(tuple(), kwargs) + nv.validate_take((), kwargs) indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable( - self._values, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value, - ) - else: - if allow_fill and fill_value is not None: - cls_name = type(self).__name__ - raise ValueError( - f"Unable to fill values because {cls_name} cannot contain NA" - ) - taken = self._values.take(indices) + allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + + # Note: we discard fill_value and use self._na_value, only relevant + # in the case where allow_fill is True and fill_value is not None + taken = algos.take( + self._values, indices, allow_fill=allow_fill, fill_value=self._na_value + ) return self._shallow_copy(taken) - def _assert_take_fillable( - self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan - ): + def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: """ - Internal method to handle NA filling of take. + We only use pandas-style take when allow_fill is True _and_ + fill_value is not None. 
""" - indices = ensure_platform_int(indices) - - # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: - if (indices < -1).any(): + # only fill if we are passing a non-None fill_value + if self._can_hold_na: + if (indices < -1).any(): + raise ValueError( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + else: + cls_name = type(self).__name__ raise ValueError( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" + f"Unable to fill values because {cls_name} cannot contain NA" ) - taken = algos.take( - values, indices, allow_fill=allow_fill, fill_value=na_value - ) else: - taken = values.take(indices) - return taken + allow_fill = False + return allow_fill _index_shared_docs[ "repeat" @@ -780,13 +817,19 @@ def _assert_take_fillable( @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): repeats = ensure_platform_int(repeats) - nv.validate_repeat(tuple(), dict(axis=axis)) + nv.validate_repeat((), {"axis": axis}) return self._shallow_copy(self._values.repeat(repeats)) # -------------------------------------------------------------------- # Copying Methods - def copy(self, name=None, deep=False, dtype=None, names=None): + def copy( + self: _IndexT, + name: Optional[Label] = None, + deep: bool = False, + dtype: Optional[Dtype] = None, + names: Optional[Sequence[Label]] = None, + ) -> _IndexT: """ Make a copy of this object. @@ -799,6 +842,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): deep : bool, default False dtype : numpy dtype or pandas type, optional Set dtype for new object. + + .. deprecated:: 1.2.0 + use ``astype`` method instead. names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. @@ -812,21 +858,27 @@ def copy(self, name=None, deep=False, dtype=None, names=None): In most cases, there should be no functional difference from using ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ + name = self._validate_names(name=name, names=names, deep=deep)[0] if deep: - new_index = self._shallow_copy(self._data.copy()) + new_index = self._shallow_copy(self._data.copy(), name=name) else: - new_index = self._shallow_copy() - - names = self._validate_names(name=name, names=names, deep=deep) - new_index = new_index.set_names(names) + new_index = self._shallow_copy(name=name) if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) new_index = new_index.astype(dtype) return new_index + @final def __copy__(self, **kwargs): return self.copy(**kwargs) + @final def __deepcopy__(self, memo=None): """ Parameters @@ -885,7 +937,8 @@ def _format_data(self, name=None) -> str_t: if self.inferred_type == "string": is_justify = False elif self.inferred_type == "categorical": - if is_object_dtype(self.categories): # type: ignore + # error: "Index" has no attribute "categories" + if is_object_dtype(self.categories): # type: ignore[attr-defined] is_justify = False return format_object_summary( @@ -902,7 +955,12 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.values - def format(self, name: bool = False, formatter=None, **kwargs): + def format( + self, + name: bool = False, + formatter: Optional[Callable] = None, + na_rep: str_t = "NaN", + ) -> List[str_t]: """ Render a string representation of the Index. 
""" @@ -917,9 +975,11 @@ def format(self, name: bool = False, formatter=None, **kwargs): if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, **kwargs) + return self._format_with_header(header, na_rep=na_rep) - def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: + def _format_with_header( + self, header: List[str_t], na_rep: str_t = "NaN" + ) -> List[str_t]: from pandas.io.formats.format import format_array values = self._values @@ -933,9 +993,9 @@ def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: # could have nans mask = isna(values) if mask.any(): - result = np.array(result) - result[mask] = na_rep - result = result.tolist() # type: ignore + result_arr = np.array(result) + result_arr[mask] = na_rep + result = result_arr.tolist() else: result = trim_front(format_array(values, None, justify="left")) return header + result @@ -944,6 +1004,8 @@ def to_native_types(self, slicer=None, **kwargs): """ Format specified values of `self` and return them. + .. deprecated:: 1.2.0 + Parameters ---------- slicer : int, array-like @@ -965,6 +1027,12 @@ def to_native_types(self, slicer=None, **kwargs): numpy.ndarray Formatted values. """ + warnings.warn( + "The 'to_native_types' method is deprecated and will be removed in " + "a future version. Use 'astype(str)' instead.", + FutureWarning, + stacklevel=2, + ) values = self if slicer is not None: values = values[slicer] @@ -1179,7 +1247,8 @@ def name(self, value): maybe_extract_name(value, None, type(self)) self._name = value - def _validate_names(self, name=None, names=None, deep: bool = False): + @final + def _validate_names(self, name=None, names=None, deep: bool = False) -> List[Label]: """ Handles the quirks of having a singular 'name' parameter for general Index and plural 'names' parameter for MultiIndex. @@ -1189,15 +1258,25 @@ def _validate_names(self, name=None, names=None, deep: bool = False): if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") elif names is None and name is None: - return deepcopy(self.names) if deep else self.names + new_names = deepcopy(self.names) if deep else self.names elif names is not None: if not is_list_like(names): raise TypeError("Must pass list-like as `names`.") - return names + new_names = names + elif not is_list_like(name): + new_names = [name] else: - if not is_list_like(name): - return [name] - return name + new_names = name + + if len(new_names) != len(self.names): + raise ValueError( + f"Length of new names must be {len(self.names)}, got {len(new_names)}" + ) + + # All items in 'new_names' need to be hashable + validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name") + + return new_names def _get_names(self): return FrozenList((self.name,)) @@ -1225,13 +1304,13 @@ def _set_names(self, values, level=None): # GH 20527 # All items in 'name' need to be hashable: - for name in values: - if not is_hashable(name): - raise TypeError(f"{type(self).__name__}.name must be a hashable type") + validate_all_hashable(*values, error_name=f"{type(self).__name__}.name") + self._name = values[0] names = property(fset=_set_names, fget=_get_names) + @final def set_names(self, names, level=None, inplace: bool = False): """ Set Index or MultiIndex name. @@ -1251,8 +1330,8 @@ def set_names(self, names, level=None, inplace: bool = False): Returns ------- - Index - The same type as the caller or None if inplace is True. 
+ Index or None + The same type as the caller or None if ``inplace=True``. See Also -------- @@ -1327,8 +1406,8 @@ def rename(self, name, inplace=False): Returns ------- - Index - The same type as the caller or None if inplace is True. + Index or None + The same type as the caller or None if ``inplace=True``. See Also -------- @@ -1377,6 +1456,7 @@ def _sort_levels_monotonic(self): """ return self + @final def _validate_index_level(self, level): """ Validate index level. @@ -1406,7 +1486,7 @@ def _get_level_number(self, level) -> int: def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - For internal compatibility with with the Index API. + For internal compatibility with the Index API. Sort the Index. This is for compat with MultiIndex @@ -1421,6 +1501,20 @@ def sortlevel(self, level=None, ascending=True, sort_remaining=None): ------- Index """ + if not isinstance(ascending, (list, bool)): + raise TypeError( + "ascending must be a single bool value or " + "a list of bool values of length 1" + ) + + if isinstance(ascending, list): + if len(ascending) != 1: + raise TypeError("ascending must be a list of bool values of length 1") + ascending = ascending[0] + + if not isinstance(ascending, bool): + raise TypeError("ascending must be a bool value") + return self.sort_values(return_indexer=True, ascending=ascending) def _get_level_values(self, level): @@ -1464,6 +1558,7 @@ def _get_level_values(self): get_level_values = _get_level_values + @final def droplevel(self, level=0): """ Return index with requested level(s) removed. @@ -1471,8 +1566,6 @@ def droplevel(self, level=0): If resulting index has only 1 level left, the result will be of Index type, not MultiIndex. - .. versionadded:: 0.23.1 (support for non-MultiIndex) - Parameters ---------- level : int, str, or list-like, default 0 @@ -1482,20 +1575,55 @@ def droplevel(self, level=0): Returns ------- Index or MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + + >>> mi.droplevel() + MultiIndex([(3, 5), + (4, 6)], + names=['y', 'z']) + + >>> mi.droplevel(2) + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.droplevel('z') + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.droplevel(['x', 'y']) + Int64Index([5, 6], dtype='int64', name='z') """ if not isinstance(level, (tuple, list)): level = [level] levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - if len(level) == 0: + return self._drop_level_numbers(levnums) + + def _drop_level_numbers(self, levnums: List[int]): + """ + Drop MultiIndex levels by level _number_, not name. + """ + + if len(levnums) == 0: return self - if len(level) >= self.nlevels: + if len(levnums) >= self.nlevels: raise ValueError( - f"Cannot remove {len(level)} levels from an index with {self.nlevels} " - "levels: at least one level must be left." + f"Cannot remove {len(levnums)} levels from an index with " + f"{self.nlevels} levels: at least one level must be left."
) # The two checks above guarantee that here self is a MultiIndex + self = cast("MultiIndex", self) new_levels = list(self.levels) new_codes = list(self.codes) @@ -1557,6 +1685,7 @@ def _get_grouper_for_level(self, mapper, level=None): # -------------------------------------------------------------------- # Introspection Methods + @final @property def is_monotonic(self) -> bool: """ @@ -1671,6 +1800,7 @@ def has_duplicates(self) -> bool: """ return not self.is_unique + @final def is_boolean(self) -> bool: """ Check if the Index only consists of booleans. @@ -1706,6 +1836,7 @@ def is_boolean(self) -> bool: """ return self.inferred_type in ["boolean"] + @final def is_integer(self) -> bool: """ Check if the Index only consists of integers. @@ -1741,6 +1872,7 @@ def is_integer(self) -> bool: """ return self.inferred_type in ["integer"] + @final def is_floating(self) -> bool: """ Check if the Index is a floating type. @@ -1784,6 +1916,7 @@ def is_floating(self) -> bool: """ return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] + @final def is_numeric(self) -> bool: """ Check if the Index only consists of numeric data. @@ -1827,6 +1960,7 @@ def is_numeric(self) -> bool: """ return self.inferred_type in ["integer", "floating"] + @final def is_object(self) -> bool: """ Check if the Index is of the object dtype. @@ -1867,6 +2001,7 @@ def is_object(self) -> bool: """ return is_object_dtype(self.dtype) + @final def is_categorical(self) -> bool: """ Check if the Index holds categorical data. @@ -1910,6 +2045,7 @@ def is_categorical(self) -> bool: """ return self.inferred_type in ["categorical"] + @final def is_interval(self) -> bool: """ Check if the Index holds Interval objects. @@ -1943,6 +2079,7 @@ def is_interval(self) -> bool: """ return self.inferred_type in ["interval"] + @final def is_mixed(self) -> bool: """ Check if the Index holds data with mixed data types. @@ -1980,6 +2117,7 @@ def is_mixed(self) -> bool: ) return self.inferred_type in ["mixed"] + @final def holds_integer(self) -> bool: """ Whether the type is an integer type. @@ -1994,17 +2132,30 @@ def inferred_type(self) -> str_t: return lib.infer_dtype(self._values, skipna=False) @cache_readonly - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: """ Whether or not the index values only consist of dates. """ return is_datetime_array(ensure_object(self._values)) + @cache_readonly + def is_all_dates(self): + """ + Whether or not the index values only consist of dates. + """ + warnings.warn( + "Index.is_all_dates is deprecated, will be removed in a future version. " + "check index.inferred_type instead", + FutureWarning, + stacklevel=2, + ) + return self._is_all_dates + # -------------------------------------------------------------------- # Pickle Methods def __reduce__(self): - d = dict(data=self._data) + d = {"data": self._data} d.update(self._get_attributes_dict()) return _new_Index, (type(self), d), None @@ -2028,6 +2179,7 @@ def _isnan(self): return values @cache_readonly + @final def _nan_idxs(self): if self._can_hold_na: return self._isnan.nonzero()[0] @@ -2044,6 +2196,7 @@ def hasnans(self) -> bool: else: return False + @final def isna(self): """ Detect missing values. @@ -2101,6 +2254,7 @@ def isna(self): isnull = isna + @final def notna(self): """ Detect existing (non-missing) values. @@ -2170,7 +2324,7 @@ def fillna(self, value=None, downcast=None): DataFrame.fillna : Fill NaN values of a DataFrame. Series.fillna : Fill NaN Values of a Series. 
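`Index.is_all_dates` is deprecated in favor of inspecting `inferred_type`; the private `_is_all_dates` keeps the old fast path for internal callers. A sketch of the deprecation (assumes pandas >= 1.2):

```python
import warnings

import pandas as pd

idx = pd.Index(pd.to_datetime(["2020-01-01"]))
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    idx.is_all_dates                     # deprecated accessor
    print(caught[-1].category.__name__)  # FutureWarning
print(idx.inferred_type)                 # the suggested replacement
```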
""" - self._assert_can_do_op(value) + value = self._require_scalar(value) if self.hasnans: result = self.putmask(self._isnan, value) if downcast is None: @@ -2214,22 +2368,25 @@ def unique(self, level=None): level : int or str, optional, default None Only return values from specified level (for MultiIndex). - .. versionadded:: 0.23.0 - Returns ------- Index without duplicates See Also -------- - unique - Series.unique + unique : Numpy array of unique values in that column. + Series.unique : Return unique values of Series object. """ if level is not None: self._validate_index_level(level) + + if self.is_unique: + return self._shallow_copy() + result = super().unique() return self._shallow_copy(result) + @final def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed. @@ -2276,6 +2433,9 @@ def drop_duplicates(self, keep="first"): >>> idx.drop_duplicates(keep=False) Index(['cow', 'beetle', 'hippo'], dtype='object') """ + if self.is_unique: + return self._shallow_copy() + return super().drop_duplicates(keep=keep) def duplicated(self, keep="first"): @@ -2332,6 +2492,9 @@ def duplicated(self, keep="first"): >>> idx.duplicated(keep=False) array([ True, False, True, False, True]) """ + if self.is_unique: + # fastpath available bc we are immutable + return np.zeros(len(self), dtype=bool) return super().duplicated(keep=keep) def _get_unique_index(self, dropna: bool = False): @@ -2358,52 +2521,54 @@ def _get_unique_index(self, dropna: bool = False): else: values = self._values - if dropna: - try: - if self.hasnans: - values = values[~isna(values)] - except NotImplementedError: - pass + if dropna and not isinstance(self, ABCMultiIndex): + # isna not defined for MultiIndex + if self.hasnans: + values = values[~isna(values)] return self._shallow_copy(values) # -------------------------------------------------------------------- # Arithmetic & Logical Methods - def __add__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - from pandas import Series - - return Index(Series(self) + other) - - def __radd__(self, other): - from pandas import Series - - return Index(other + Series(self)) - def __iadd__(self, other): # alias for __add__ return self + other - def __sub__(self, other): - return Index(np.array(self) - other) - - def __rsub__(self, other): - # wrap Series to ensure we pin name correctly - from pandas import Series - - return Index(other - Series(self)) - + @final def __and__(self, other): + warnings.warn( + "Index.__and__ operating as a set operation is deprecated, " + "in the future this will be a logical operation matching " + "Series.__and__. Use index.intersection(other) instead", + FutureWarning, + stacklevel=2, + ) return self.intersection(other) + @final def __or__(self, other): + warnings.warn( + "Index.__or__ operating as a set operation is deprecated, " + "in the future this will be a logical operation matching " + "Series.__or__. Use index.union(other) instead", + FutureWarning, + stacklevel=2, + ) return self.union(other) + @final def __xor__(self, other): + warnings.warn( + "Index.__xor__ operating as a set operation is deprecated, " + "in the future this will be a logical operation matching " + "Series.__xor__. Use index.symmetric_difference(other) instead", + FutureWarning, + stacklevel=2, + ) return self.symmetric_difference(other) + @final def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. 
" @@ -2415,6 +2580,7 @@ def __nonzero__(self): # -------------------------------------------------------------------- # Set Operation Methods + @final def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, @@ -2423,9 +2589,10 @@ def _get_reconciled_name_object(self, other): """ name = get_op_result_name(self, other) if self.name != name: - return self._shallow_copy(name=name) + return self.rename(name) return self + @final def _union_incompatible_dtypes(self, other, sort): """ Casts this and other index to object dtype to allow the formation @@ -2450,7 +2617,7 @@ def _union_incompatible_dtypes(self, other, sort): other = Index(other).astype(object, copy=False) return Index.union(this, other, sort=sort).astype(object, copy=False) - def _is_compatible_with_other(self, other) -> bool: + def _can_union_without_object_cast(self, other) -> bool: """ Check whether this and the other dtype are compatible with each other. Meaning a union can be formed between them without needing to be cast @@ -2466,6 +2633,7 @@ def _is_compatible_with_other(self, other) -> bool: """ return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype) + @final def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError( @@ -2526,11 +2694,14 @@ def union(self, other, sort=None): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) + other, result_name = self._convert_can_do_setop(other) - if not self._is_compatible_with_other(other): + if not self._can_union_without_object_cast(other): return self._union_incompatible_dtypes(other, sort=sort) - return self._union(other, sort=sort) + result = self._union(other, sort=sort) + + return self._wrap_setop_result(other, result) def _union(self, other, sort): """ @@ -2552,10 +2723,10 @@ def _union(self, other, sort): Index """ if not len(other) or self.equals(other): - return self._get_reconciled_name_object(other) + return self if not len(self): - return other._get_reconciled_name_object(self) + return other # TODO(EA): setops-refactor, clean all this up lvals = self._values @@ -2597,12 +2768,24 @@ def _union(self, other, sort): stacklevel=3, ) - # for subclasses - return self._wrap_setop_result(other, result) + return result + @final def _wrap_setop_result(self, other, result): + if isinstance(self, (ABCDatetimeIndex, ABCTimedeltaIndex)) and isinstance( + result, np.ndarray + ): + result = type(self._data)._simple_new(result, dtype=self.dtype) + elif is_categorical_dtype(self.dtype) and isinstance(result, np.ndarray): + result = Categorical(result, dtype=self.dtype) + name = get_op_result_name(self, other) - return self._shallow_copy(result, name=name) + if isinstance(result, Index): + if result.name != name: + return result.rename(name) + return result + else: + return self._shallow_copy(result, name=name) # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): @@ -2641,16 +2824,26 @@ def intersection(self, other, sort=False): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other = ensure_index(other) + other, _ = self._convert_can_do_setop(other) if self.equals(other): + if self.has_duplicates: + return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype("O") - other = other.astype("O") + dtype = find_common_type([self.dtype, other.dtype]) + this = self.astype(dtype, copy=False) + 
other = other.astype(dtype, copy=False) return this.intersection(other, sort=sort) + result = self._intersection(other, sort=sort) + return self._wrap_setop_result(other, result) + + def _intersection(self, other, sort=False): + """ + intersection specialized to the case with matching dtypes. + """ # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values @@ -2661,7 +2854,7 @@ def intersection(self, other, sort=False): except TypeError: pass else: - return self._wrap_setop_result(other, result) + return algos.unique1d(result) try: indexer = Index(rvals).get_indexer(lvals) @@ -2672,15 +2865,15 @@ def intersection(self, other, sort=False): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - taken = other.take(indexer) - res_name = get_op_result_name(self, other) + result = other.take(indexer).unique()._values if sort is None: - taken = algos.safe_sort(taken.values) - return self._shallow_copy(taken, name=res_name) + result = algos.safe_sort(result) - taken.name = res_name - return taken + # Intersection has to be unique + assert Index(result).is_unique + + return result def difference(self, other, sort=None): """ @@ -2722,12 +2915,15 @@ def difference(self, other, sort=None): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) + other, result_name = self._convert_can_do_setop(other) if self.equals(other): - # pass an empty np.ndarray with the appropriate dtype - return self._shallow_copy(self._data[:0]) + return self[:0].rename(result_name) - other, result_name = self._convert_can_do_setop(other) + result = self._difference(other, sort=sort) + return self._wrap_setop_result(other, result) + + def _difference(self, other, sort): this = self._get_unique_index() @@ -2735,14 +2931,14 @@ def difference(self, other, sort=None): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - the_diff = this.values.take(label_diff) + the_diff = this._values.take(label_diff) if sort is None: try: the_diff = algos.safe_sort(the_diff) except TypeError: pass - return this._shallow_copy(the_diff, name=result_name) + return the_diff def symmetric_difference(self, other, result_name=None, sort=None): """ @@ -2852,7 +3048,7 @@ def get_loc(self, key, method=None, tolerance=None): distances are broken by preferring the larger index value. tolerance : int or float, optional Maximum distance from index value for inexact matches. The value of - the index at the matching location most satisfy the equation + the index at the matching location must satisfy the equation ``abs(index[loc] - key) <= tolerance``. Returns @@ -2917,7 +3113,7 @@ def get_loc(self, key, method=None, tolerance=None): inexact matches. tolerance : optional Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most + matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Tolerance may be a scalar value, which applies the same tolerance @@ -2947,10 +3143,9 @@ def get_loc(self, key, method=None, tolerance=None): def get_indexer( self, target, method=None, limit=None, tolerance=None ) -> np.ndarray: + method = missing.clean_reindex_fill_method(method) target = ensure_index(target) - if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, target) # Treat boolean labels passed to a numeric index as not found. 
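`intersection` now reconciles mismatched dtypes through `find_common_type` instead of falling back to object, and `_intersection` guarantees a unique result. A sketch of the dtype behavior (hypothetical data; assumes this change):

```python
import pandas as pd

left = pd.Index([1, 2, 3])    # int64
right = pd.Index([2.0, 3.5])  # float64
inter = left.intersection(right)
print(inter, inter.dtype)     # Float64Index([2.0], ...), not object
```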
Without # this fix False and True would be treated as 0 and 1 respectively. @@ -2964,6 +3159,14 @@ def get_indexer( ptarget, method=method, limit=limit, tolerance=tolerance ) + return self._get_indexer(target, method, limit, tolerance) + + def _get_indexer( + self, target: "Index", method=None, limit=None, tolerance=None + ) -> np.ndarray: + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) + if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) @@ -3003,6 +3206,7 @@ def _convert_tolerance(self, tolerance, target): raise ValueError("list-like tolerance size must match target index size") return tolerance + @final def _get_fill_indexer( self, target: "Index", method: str_t, limit=None, tolerance=None ) -> np.ndarray: @@ -3018,10 +3222,11 @@ def _get_fill_indexer( indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) - if tolerance is not None: + if tolerance is not None and len(self): indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer + @final def _get_fill_indexer_searchsorted( self, target: "Index", method: str_t, limit=None ) -> np.ndarray: @@ -3055,18 +3260,28 @@ def _get_fill_indexer_searchsorted( indexer[indexer == len(self)] = -1 return indexer + @final def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: """ Get the indexer for the nearest index labels; requires an index with values that can be subtracted from each other (e.g., not strings or tuples). """ + if not len(self): + return self._get_fill_indexer(target, "pad") + left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) target_values = target._values - left_distances = np.abs(self._values[left_indexer] - target_values) - right_distances = np.abs(self._values[right_indexer] - target_values) + # error: Unsupported left operand type for - ("ExtensionArray") + left_distances = np.abs( + self._values[left_indexer] - target_values # type: ignore[operator] + ) + # error: Unsupported left operand type for - ("ExtensionArray") + right_distances = np.abs( + self._values[right_indexer] - target_values # type: ignore[operator] + ) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3078,13 +3293,15 @@ def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer + @final def _filter_indexer_tolerance( self, target: Union["Index", np.ndarray, ExtensionArray], indexer: np.ndarray, tolerance, ) -> np.ndarray: - distance = abs(self._values[indexer] - target) + # error: Unsupported left operand type for - ("ExtensionArray") + distance = abs(self._values[indexer] - target) # type: ignore[operator] indexer = np.where(distance <= tolerance, indexer, -1) return indexer @@ -3101,6 +3318,7 @@ def _get_partial_string_timestamp_match_key(self, key): # GH#10331 return key + @final def _validate_positional_slice(self, key: slice): """ For positional indexing, a slice must have either int or None @@ -3195,7 +3413,7 @@ def _convert_listlike_indexer(self, keyarr): Return tuple-safe keys. 
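`get_indexer` is now a thin wrapper that normalizes `method` and special-cases boolean targets before delegating to `_get_indexer`; the nearest/tolerance semantics themselves are unchanged. For reference:

```python
import pandas as pd

idx = pd.Index([0, 10, 20])
# 'nearest' breaks ties toward the larger index value; tolerance filters
print(idx.get_indexer([1, 14], method="nearest"))               # [0 1]
print(idx.get_indexer([1, 14], method="nearest", tolerance=3))  # [ 0 -1]
```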
""" if isinstance(keyarr, Index): - keyarr = self._convert_index_indexer(keyarr) + pass else: keyarr = self._convert_arr_indexer(keyarr) @@ -3218,21 +3436,6 @@ def _convert_arr_indexer(self, keyarr): keyarr = com.asarray_tuplesafe(keyarr) return keyarr - def _convert_index_indexer(self, keyarr): - """ - Convert an Index indexer to the appropriate dtype. - - Parameters - ---------- - keyarr : Index (or sub-class) - Indexer to convert. - - Returns - ------- - converted_keyarr : Index (or sub-class) - """ - return keyarr - def _convert_list_indexer(self, keyarr): """ Convert a list-like indexer to the appropriate dtype. @@ -3249,11 +3452,12 @@ def _convert_list_indexer(self, keyarr): """ return None - def _invalid_indexer(self, form: str_t, key): + @final + def _invalid_indexer(self, form: str_t, key) -> TypeError: """ Consistent invalid indexer message. """ - raise TypeError( + return TypeError( f"cannot do {form} indexing on {type(self).__name__} with these " f"indexers [{key}] of type {type(key).__name__}" ) @@ -3261,6 +3465,7 @@ def _invalid_indexer(self, form: str_t, key): # -------------------------------------------------------------------- # Reindex Methods + @final def _can_reindex(self, indexer): """ Check if we are allowing reindexing with this particular indexer. @@ -3274,7 +3479,7 @@ def _can_reindex(self, indexer): ValueError if its a duplicate axis """ # trying to reindex on an axis with duplicates - if not self.is_unique and len(indexer): + if not self._index_as_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None, tolerance=None): @@ -3300,11 +3505,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - if isinstance(self, ABCRangeIndex): - values = range(0) - else: - values = self._data[:0] # appropriately-dtyped empty array - target = self._simple_new(values, name=self.name) + target = self[:0] else: target = ensure_index(target) @@ -3318,8 +3519,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): if self.equals(target): indexer = None else: - # check is_overlapping for IntervalIndex compat - if self.is_unique and not getattr(self, "is_overlapping", False): + if self._index_as_unique: indexer = self.get_indexer( target, method=method, limit=limit, tolerance=tolerance ) @@ -3355,6 +3555,10 @@ def _reindex_non_unique(self, target): """ target = ensure_index(target) + if len(target) == 0: + # GH#13691 + return self[:0], np.array([], dtype=np.intp), None + indexer, missing = self.get_indexer_non_unique(target) check = indexer != -1 new_labels = self.take(indexer[check]) @@ -3369,7 +3573,7 @@ def _reindex_non_unique(self, target): cur_labels = self.take(indexer[check]).values cur_indexer = ensure_int64(length[check]) - new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels = np.empty((len(indexer),), dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels @@ -3392,7 +3596,10 @@ def _reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 - new_index = Index(new_labels, name=self.name) + if isinstance(self, ABCMultiIndex): + new_index = type(self).from_tuples(new_labels, names=self.names) + else: + new_index = Index(new_labels, name=self.name) return new_index, indexer, new_indexer # 
-------------------------------------------------------------------- @@ -3520,27 +3727,29 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) else: return join_index + @final def _join_multi(self, other, how, return_indexers=True): from pandas.core.indexes.multi import MultiIndex - from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + from pandas.core.reshape.merge import restore_dropped_levels_multijoin # figure out join names - self_names = set(com.not_none(*self.names)) - other_names = set(com.not_none(*other.names)) + self_names_list = list(com.not_none(*self.names)) + other_names_list = list(com.not_none(*other.names)) + self_names_order = self_names_list.index + other_names_order = other_names_list.index + self_names = set(self_names_list) + other_names = set(other_names_list) overlap = self_names & other_names # need at least 1 in common if not overlap: raise ValueError("cannot join with no overlapping index names") - self_is_mi = isinstance(self, ABCMultiIndex) - other_is_mi = isinstance(other, ABCMultiIndex) - - if self_is_mi and other_is_mi: + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): # Drop the non-matching levels from left and right respectively - ldrop_names = list(self_names - overlap) - rdrop_names = list(other_names - overlap) + ldrop_names = sorted(self_names - overlap, key=self_names_order) + rdrop_names = sorted(other_names - overlap, key=other_names_order) # if only the order differs if not len(ldrop_names + rdrop_names): @@ -3561,7 +3770,7 @@ def _join_multi(self, other, how, return_indexers=True): # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names - levels, codes, names = _restore_dropped_levels_multijoin( + levels, codes, names = restore_dropped_levels_multijoin( self, other, dropped_names, join_idx, lidx, ridx ) @@ -3582,7 +3791,7 @@ def _join_multi(self, other, how, return_indexers=True): # Case where only one index is multi # make the indices into mi's that match flip_order = False - if self_is_mi: + if isinstance(self, MultiIndex): self, other = other, self flip_order = True # flip if join method is right or left @@ -3598,8 +3807,9 @@ def _join_multi(self, other, how, return_indexers=True): return result[0], result[2], result[1] return result + @final def _join_non_unique(self, other, how="left", return_indexers=False): - from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.merge import get_join_indexers # We only get here if dtypes match assert self.dtype == other.dtype @@ -3607,7 +3817,7 @@ def _join_non_unique(self, other, how="left", return_indexers=False): lvalues = self._get_engine_target() rvalues = other._get_engine_target() - left_idx, right_idx = _get_join_indexers( + left_idx, right_idx = get_join_indexers( [lvalues], [rvalues], how=how, sort=True ) @@ -3625,6 +3835,7 @@ def _join_non_unique(self, other, how="left", return_indexers=False): else: return join_index + @final def _join_level( self, other, level, how="left", return_indexers=False, keep_order=True ): @@ -3672,6 +3883,8 @@ def _get_leaf_sorter(labels): left, right = right, left how = {"right": "left", "left": "right"}.get(how, how) + assert isinstance(left, MultiIndex) + level = left._get_level_number(level) old_level = left.levels[level] @@ -3695,9 +3908,9 @@ def _get_leaf_sorter(labels): else: left_lev_indexer = ensure_int64(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) - + old_codes = left.codes[level] 
new_lev_codes = algos.take_nd( - rev_indexer, left.codes[level], allow_fill=False + rev_indexer, old_codes[old_codes != -1], allow_fill=False ) new_codes = list(left.codes) @@ -3766,6 +3979,7 @@ def _get_leaf_sorter(labels): else: return join_index + @final def _join_monotonic(self, other, how="left", return_indexers=False): # We only get here with matching dtypes assert other.dtype == self.dtype @@ -3814,9 +4028,16 @@ def _join_monotonic(self, other, how="left", return_indexers=False): else: return join_index - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return Index(joined, name=name) + def _wrap_joined_index( + self: _IndexT, joined: np.ndarray, other: _IndexT + ) -> _IndexT: + assert other.dtype == self.dtype + + if isinstance(self, ABCMultiIndex): + name = self.names if self.names == other.names else None + else: + name = get_op_result_name(self, other) + return self._constructor(joined, name=name) # -------------------------------------------------------------------- # Uncategorized Methods @@ -3860,7 +4081,7 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: This is an ndarray or ExtensionArray. - ``_values`` are consistent between``Series`` and ``Index``. + ``_values`` are consistent between ``Series`` and ``Index``. It may differ from the public '.values' method. @@ -3875,7 +4096,7 @@ def _values(self) -> Union[ExtensionArray, np.ndarray]: See Also -------- - values + values : Values """ return self._data @@ -3928,25 +4149,19 @@ def where(self, cond, other=None): if other is None: other = self._na_value - dtype = self.dtype values = self.values - if is_bool(other) or is_bool_dtype(other): - - # bools force casting - values = values.astype(object) - dtype = None + try: + self._validate_fill_value(other) + except (ValueError, TypeError): + return self.astype(object).where(cond, other) values = np.where(cond, values, other) - if self._is_numeric_dtype and np.any(isna(values)): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index(values, dtype=dtype, name=self.name) + return Index(values, name=self.name) # construction helpers + @final @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor @@ -3956,6 +4171,7 @@ def _scalar_data_error(cls, data): f"kind, {repr(data)} was passed" ) + @final @classmethod def _string_data_error(cls, data): raise TypeError( @@ -3963,6 +4179,7 @@ def _string_data_error(cls, data): "to explicitly cast to a numeric type" ) + @final def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -3980,24 +4197,22 @@ def _coerce_scalar_to_index(self, item): return Index([item], dtype=dtype, **self._get_attributes_dict()) - def _to_safe_for_reshape(self): + def _validate_fill_value(self, value): """ - Convert to object if we are a categorical. - """ - return self - - def _convert_for_op(self, value): - """ - Convert value to be insertable to ndarray. + Check if the value can be inserted into our array, and convert + it to an appropriate native type if necessary. """ return value - def _assert_can_do_op(self, value): + @final + def _require_scalar(self, value): """ - Check value is valid for scalar op. + Check that this is a scalar value that we can use for setitem-like + operations without changing dtype. 
""" if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") + return value @property def _has_complex_internals(self) -> bool: @@ -4013,7 +4228,7 @@ def _is_memory_usage_qualified(self) -> bool: """ return self.is_object() - def is_type_compatible(self, kind) -> bool: + def is_type_compatible(self, kind: str_t) -> bool: """ Whether the index type is compatible with the provided type. """ @@ -4060,9 +4275,11 @@ def __contains__(self, key: Any) -> bool: except (OverflowError, TypeError, ValueError): return False + @final def __hash__(self): raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + @final def __setitem__(self, key, value): raise TypeError("Index does not support mutable operations") @@ -4103,6 +4320,7 @@ def __getitem__(self, key): else: return result + @final def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ Faster check for ``name in self`` when we know `name` is a Python @@ -4144,13 +4362,13 @@ def append(self, other): return self._concat(to_concat, name) - def _concat(self, to_concat, name): + def _concat(self, to_concat: List["Index"], name: Label) -> "Index": """ Concatenate multiple Index objects. """ - to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] + to_concat_vals = [x._values for x in to_concat] - result = _concat.concat_compat(to_concat) + result = concat_compat(to_concat_vals) return Index(result, name=name) def putmask(self, mask, value): @@ -4163,15 +4381,12 @@ def putmask(self, mask, value): See Also -------- - numpy.ndarray.putmask + numpy.ndarray.putmask : Changes elements of an array + based on conditional and input values. """ - values = self.values.copy() + values = self._values.copy() try: - np.putmask(values, mask, self._convert_for_op(value)) - if is_period_dtype(self.dtype): - # .values cast to object, so we need to cast back - values = type(self)(values)._data - return self._shallow_copy(values) + converted = self._validate_fill_value(value) except (ValueError, TypeError) as err: if is_object_dtype(self): raise err @@ -4179,7 +4394,10 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other: Any) -> bool: + np.putmask(values, mask, converted) + return self._shallow_copy(values) + + def equals(self, other: object) -> bool: """ Determine if two Index object are equal. @@ -4244,20 +4462,20 @@ def equals(self, other: Any) -> bool: if not isinstance(other, Index): return False - if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): - # if other is not object, use other's logic for coercion - return other.equals(self) - - if isinstance(other, ABCMultiIndex): - # d-level MultiIndex can equal d-tuple Index - return other.equals(self) - - if is_extension_array_dtype(other.dtype): - # All EA-backed Index subclasses override equals + # If other is a subclass of self and defines its own equals method, we + # dispatch to the subclass method. For instance for a MultiIndex, + # a d-level MultiIndex can equal d-tuple Index. + # Note: All EA-backed Index subclasses override equals + if ( + isinstance(other, type(self)) + and type(other) is not type(self) + and other.equals is not self.equals + ): return other.equals(self) return array_equivalent(self._values, other._values) + @final def identical(self, other) -> bool: """ Similar to equals, but checks that object attributes and types are also equal. 
@@ -4271,14 +4489,13 @@ def identical(self, other) -> bool: return ( self.equals(other) and all( - ( - getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables - ) + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables ) and type(self) == type(other) ) + @final def asof(self, label): """ Return the label from the index, or, if not present, the previous one. @@ -4344,7 +4561,7 @@ def asof(self, label): loc = loc.indices(len(self))[-1] return self[loc] - def asof_locs(self, where, mask): + def asof_locs(self, where: "Index", mask) -> np.ndarray: """ Return the locations (indices) of labels in the index. @@ -4372,18 +4589,24 @@ def asof_locs(self, where, mask): which correspond to the return values of the `asof` function for every element in `where`. """ - locs = self.values[mask].searchsorted(where.values, side="right") + locs = self._values[mask].searchsorted(where._values, side="right") locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) - first = mask.argmax() - result[(locs == 0) & (where.values < self.values[first])] = -1 + # TODO: overload return type of ExtensionArray.__getitem__ + first_value = cast(Any, self._values[mask.argmax()]) + result[(locs == 0) & (where._values < first_value)] = -1 return result + @final def sort_values( - self, return_indexer=False, ascending=True, key: Optional[Callable] = None + self, + return_indexer: bool = False, + ascending: bool = True, + na_position: str_t = "last", + key: Optional[Callable] = None, ): """ Return a sorted copy of the index. @@ -4397,6 +4620,12 @@ def sort_values( Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 1.2.0 + key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -4437,9 +4666,16 @@ def sort_values( """ idx = ensure_key_mapped(self, key) - _as = idx.argsort() - if not ascending: - _as = _as[::-1] + # GH 35584. Sort missing values according to na_position kwarg + # ignore na_position for MultiIndex + if not isinstance(self, ABCMultiIndex): + _as = nargsort( + items=idx, ascending=ascending, na_position=na_position, key=key + ) + else: + _as = idx.argsort() + if not ascending: + _as = _as[::-1] sorted_index = self.take(_as) @@ -4448,6 +4684,7 @@ def sort_values( else: return sorted_index + @final def sort(self, *args, **kwargs): """ Use sort_values instead. 
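An illustrative sketch of the ``na_position`` keyword that ``sort_values`` gains above (GH 35584); this usage example is an editorial addition, assuming a pandas build that includes the change:

    import numpy as np
    import pandas as pd

    idx = pd.Index([10.0, np.nan, 1.0])
    # default na_position="last" keeps missing values at the end
    idx.sort_values()                     # Float64Index([1.0, 10.0, nan], dtype='float64')
    # na_position="first" moves them to the front
    idx.sort_values(na_position="first")  # Float64Index([nan, 1.0, 10.0], dtype='float64')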
@@ -4510,7 +4747,10 @@ def shift(self, periods=1, freq=None): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError(f"Not supported for type {type(self).__name__}") + raise NotImplementedError( + f"This method is only implemented for DatetimeIndex, PeriodIndex and " + f"TimedeltaIndex; Got type {type(self).__name__}" + ) def argsort(self, *args, **kwargs) -> np.ndarray: """ @@ -4547,13 +4787,15 @@ def argsort(self, *args, **kwargs) -> np.ndarray: >>> idx[order] Index(['a', 'b', 'c', 'd'], dtype='object') """ - result = self.asi8 - - if result is None: - result = np.array(self) + if needs_i8_conversion(self.dtype): + # TODO: these do not match the underlying EA argsort methods GH#37863 + return self.asi8.argsort(*args, **kwargs) - return result.argsort(*args, **kwargs) + # This works for either ndarray or EA, is overridden + # by RangeIndex, MultiIndex + return self._data.argsort(*args, **kwargs) + @final def get_value(self, series: "Series", key): """ Fast lookup of value from 1-dimensional ndarray. @@ -4620,6 +4862,7 @@ def _get_values_for_loc(self, series: "Series", loc, key): return series.iloc[loc] + @final def set_value(self, arr, key, value): """ Fast lookup of value from 1-dimensional ndarray. @@ -4667,13 +4910,36 @@ def set_value(self, arr, key, value): @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) + + if target.is_boolean() and self.is_numeric(): + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. + # (GH #16877) + return self._get_indexer_non_comparable(target, method=None, unique=False) + pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if not self._is_comparable_dtype(target.dtype): - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches + if not self._should_compare(target): + return self._get_indexer_non_comparable(target, method=None, unique=False) + + if not is_dtype_equal(self.dtype, target.dtype): + # TODO: if object, could use infer_dtype to pre-empt costly + # conversion if still non-comparable? + dtype = find_common_type([self.dtype, target.dtype]) + if ( + dtype.kind in ["i", "u"] + and is_categorical_dtype(target.dtype) + and target.hasnans + ): + # FIXME: find_common_type incorrect with Categorical GH#38240 + # FIXME: some cases where float64 cast can be lossy? + dtype = np.dtype(np.float64) + + this = self.astype(dtype, copy=False) + that = target.astype(dtype, copy=False) + return this.get_indexer_non_unique(that) if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) @@ -4683,6 +4949,7 @@ def get_indexer_non_unique(self, target): indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), missing + @final def get_indexer_for(self, target, **kwargs): """ Guaranteed return of an indexer even when non-unique. @@ -4695,11 +4962,59 @@ def get_indexer_non_unique(self, target): numpy.ndarray List of indices. 
""" - if self.is_unique: + if self._index_as_unique: return self.get_indexer(target, **kwargs) - indexer, _ = self.get_indexer_non_unique(target, **kwargs) + indexer, _ = self.get_indexer_non_unique(target) return indexer + def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = True): + """ + Called from get_indexer or get_indexer_non_unique when the target + is of a non-comparable dtype. + + For get_indexer lookups with method=None, get_indexer is an _equality_ + check, so non-comparable dtypes mean we will always have no matches. + + For get_indexer lookups with a method, get_indexer is an _inequality_ + check, so non-comparable dtypes mean we will always raise TypeError. + + Parameters + ---------- + target : Index + method : str or None + unique : bool, default True + * True if called from get_indexer. + * False if called from get_indexer_non_unique. + + Raises + ------ + TypeError + If doing an inequality check, i.e. method is not None. + """ + if method is not None: + other = unpack_nested_dtype(target) + raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") + + no_matches = -1 * np.ones(target.shape, dtype=np.intp) + if unique: + # This is for get_indexer + return no_matches + else: + # This is for get_indexer_non_unique + missing = np.arange(len(target), dtype=np.intp) + return no_matches, missing + + @property + def _index_as_unique(self): + """ + Whether we should treat this as unique for the sake of + get_indexer vs get_indexer_non_unique. + + For IntervalIndex compat. + """ + return self.is_unique + + @final def _maybe_promote(self, other: "Index"): """ When dealing with an object-dtype Index and a non-object Index, see @@ -4724,12 +5039,21 @@ def _maybe_promote(self, other: "Index"): return self, other + def _should_compare(self, other: "Index") -> bool: + """ + Check if `self == other` can ever have non-False entries. + """ + other = unpack_nested_dtype(other) + dtype = other.dtype + return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ return True + @final def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. @@ -4799,6 +5123,7 @@ def map(self, mapper, na_action=None): return Index(new_values, **attributes) # TODO: De-duplicate with map, xref GH#32349 + @final def _transform_index(self, func, level=None) -> "Index": """ Apply function to all values found in index. @@ -4902,14 +5227,20 @@ def isin(self, values, level=None): """ if level is not None: self._validate_index_level(level) - return algos.isin(self, values) + return algos.isin(self._values, values) - def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = True): + def _get_string_slice(self, key: str_t): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError - def slice_indexer(self, start=None, end=None, step=None, kind=None): + def slice_indexer( + self, + start: Optional[Label] = None, + end: Optional[Label] = None, + step: Optional[int] = None, + kind: Optional[str_t] = None, + ) -> slice: """ Compute the slice indexer for input labels and step. 
@@ -4968,6 +5299,7 @@ def _maybe_cast_indexer(self, key): return com.cast_scalar_indexer(key) return key + @final def _validate_indexer(self, form: str_t, key, kind: str_t): """ If we are positional indexer, validate that we have appropriate @@ -4980,7 +5312,7 @@ def _validate_indexer(self, form: str_t, key, kind: str_t): elif is_integer(key): pass else: - self._invalid_indexer(form, key) + raise self._invalid_indexer(form, key) def _maybe_cast_slice_bound(self, label, side: str_t, kind): """ @@ -5007,14 +5339,9 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind): # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes - # reject them - if is_float(label): - self._invalid_indexer("slice", label) - - # we are trying to find integer bounds on a non-integer based index - # this is rejected (generally .loc gets you here) - elif is_integer(label): - self._invalid_indexer("slice", label) + # reject them, if index does not contain label + if (is_float(label) or is_integer(label)) and label not in self.values: + raise self._invalid_indexer("slice", label) return label @@ -5080,7 +5407,9 @@ def get_slice_bound(self, label, side: str_t, kind) -> int: if is_bool_dtype(slc): slc = lib.maybe_booleans_to_slice(slc.view("u1")) else: - slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self)) + slc = lib.maybe_indices_to_slice( + slc.astype(np.intp, copy=False), len(self) + ) if isinstance(slc, np.ndarray): raise KeyError( f"Cannot get {side} slice bound for non-unique " @@ -5261,7 +5590,7 @@ def drop(self, labels, errors: str_t = "raise"): """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) - indexer = self.get_indexer(labels) + indexer = self.get_indexer_for(labels) mask = indexer == -1 if mask.any(): if errors != "ignore": @@ -5272,120 +5601,135 @@ def drop(self, labels, errors: str_t = "raise"): # -------------------------------------------------------------------- # Generated Arithmetic, Comparison, and Unary Methods - @classmethod - def _add_comparison_methods(cls): + def _cmp_method(self, other, op): """ - Add in comparison methods. + Wrapper used to dispatch comparison operations. """ - cls.__eq__ = _make_comparison_op(operator.eq, cls) - cls.__ne__ = _make_comparison_op(operator.ne, cls) - cls.__lt__ = _make_comparison_op(operator.lt, cls) - cls.__gt__ = _make_comparison_op(operator.gt, cls) - cls.__le__ = _make_comparison_op(operator.le, cls) - cls.__ge__ = _make_comparison_op(operator.ge, cls) + if self.is_(other): + # fastpath + if op in {operator.eq, operator.le, operator.ge}: + arr = np.ones(len(self), dtype=bool) + if self._can_hold_na and not isinstance(self, ABCMultiIndex): + # TODO: should set MultiIndex._can_hold_na = False? + arr[self.isna()] = False + return arr + elif op in {operator.ne, operator.lt, operator.gt}: + return np.zeros(len(self), dtype=bool) - @classmethod - def _add_numeric_methods_add_sub_disabled(cls): - """ - Add in the numeric add/sub methods to disable. 
- """ - cls.__add__ = make_invalid_op("__add__") - cls.__radd__ = make_invalid_op("__radd__") - cls.__iadd__ = make_invalid_op("__iadd__") - cls.__sub__ = make_invalid_op("__sub__") - cls.__rsub__ = make_invalid_op("__rsub__") - cls.__isub__ = make_invalid_op("__isub__") + if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): + if len(self) != len(other): + raise ValueError("Lengths must match to compare") - @classmethod - def _add_numeric_methods_disabled(cls): - """ - Add in numeric methods to disable other than add/sub. - """ - cls.__pow__ = make_invalid_op("__pow__") - cls.__rpow__ = make_invalid_op("__rpow__") - cls.__mul__ = make_invalid_op("__mul__") - cls.__rmul__ = make_invalid_op("__rmul__") - cls.__floordiv__ = make_invalid_op("__floordiv__") - cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") - cls.__truediv__ = make_invalid_op("__truediv__") - cls.__rtruediv__ = make_invalid_op("__rtruediv__") - cls.__mod__ = make_invalid_op("__mod__") - cls.__divmod__ = make_invalid_op("__divmod__") - cls.__neg__ = make_invalid_op("__neg__") - cls.__pos__ = make_invalid_op("__pos__") - cls.__abs__ = make_invalid_op("__abs__") - cls.__inv__ = make_invalid_op("__inv__") + if not isinstance(other, ABCMultiIndex): + other = extract_array(other, extract_numpy=True) + else: + other = np.asarray(other) - @classmethod - def _add_numeric_methods_binary(cls): + if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): + # e.g. PeriodArray, Categorical + with np.errstate(all="ignore"): + result = op(self._values, other) + + elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex): + # don't pass MultiIndex + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) + + elif is_interval_dtype(self.dtype): + with np.errstate(all="ignore"): + result = op(self._values, np.asarray(other)) + + else: + with np.errstate(all="ignore"): + result = ops.comparison_op(self._values, other, op) + + return result + + def _arith_method(self, other, op): """ - Add in numeric methods. + Wrapper used to dispatch arithmetic operations. """ - cls.__add__ = _make_arithmetic_op(operator.add, cls) - cls.__radd__ = _make_arithmetic_op(ops.radd, cls) - cls.__sub__ = _make_arithmetic_op(operator.sub, cls) - cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls) - cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls) - cls.__pow__ = _make_arithmetic_op(operator.pow, cls) - cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) - cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) + from pandas import Series - # TODO: rmod? rdivmod? - cls.__mod__ = _make_arithmetic_op(operator.mod, cls) - cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) - cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) - cls.__divmod__ = _make_arithmetic_op(divmod, cls) - cls.__mul__ = _make_arithmetic_op(operator.mul, cls) - cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) + result = op(Series(self), other) + if isinstance(result, tuple): + return (Index(result[0]), Index(result[1])) + return Index(result) - @classmethod - def _add_numeric_methods_unary(cls): - """ - Add in numeric unary methods. + def _unary_method(self, op): + result = op(self._values) + return Index(result, name=self.name) + + def __abs__(self): + return self._unary_method(operator.abs) + + def __neg__(self): + return self._unary_method(operator.neg) + + def __pos__(self): + return self._unary_method(operator.pos) + + def __inv__(self): + # TODO: why not operator.inv? 
+ # TODO: __inv__ vs __invert__? + return self._unary_method(lambda x: -x) + + def any(self, *args, **kwargs): """ + Return whether any element is Truthy. - def _make_evaluate_unary(op, opstr: str_t): - def _evaluate_numeric_unary(self): + Parameters + ---------- + *args + These parameters will be passed to numpy.any. + **kwargs + These parameters will be passed to numpy.any. - attrs = self._get_attributes_dict() - return Index(op(self.values), **attrs) + Returns + ------- + any : bool or array_like (if axis is specified) + A single element array_like may be converted to bool. - _evaluate_numeric_unary.__name__ = opstr - return _evaluate_numeric_unary + See Also + -------- + Index.all : Return whether all elements are True. + Series.all : Return whether all elements are True. - cls.__neg__ = _make_evaluate_unary(operator.neg, "__neg__") - cls.__pos__ = _make_evaluate_unary(operator.pos, "__pos__") - cls.__abs__ = _make_evaluate_unary(np.abs, "__abs__") - cls.__inv__ = _make_evaluate_unary(lambda x: -x, "__inv__") + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to True because these are not equal to zero. - @classmethod - def _add_numeric_methods(cls): - cls._add_numeric_methods_unary() - cls._add_numeric_methods_binary() + Examples + -------- + >>> index = pd.Index([0, 1, 2]) + >>> index.any() + True - @classmethod - def _add_logical_methods(cls): + >>> index = pd.Index([0, 0, 0]) + >>> index.any() + False """ - Add in logical methods. + # FIXME: docstr inaccurate, args/kwargs not passed + self._maybe_disable_logical_methods("any") + return np.any(self.values) + + def all(self): """ - _doc = """ - %(desc)s + Return whether all elements are Truthy. Parameters ---------- *args - These parameters will be passed to numpy.%(outname)s. + These parameters will be passed to numpy.all. **kwargs - These parameters will be passed to numpy.%(outname)s. + These parameters will be passed to numpy.all. Returns ------- - %(outname)s : bool or array_like (if axis is specified) - A single element array_like may be converted to bool.""" - - _index_shared_docs["index_all"] = dedent( - """ + all : bool or array_like (if axis is specified) + A single element array_like may be converted to bool. See Also -------- @@ -5424,68 +5768,28 @@ def _add_logical_methods(cls): >>> pd.Index([0, 0, 0]).any() False """ - ) - - _index_shared_docs["index_any"] = dedent( - """ - - See Also - -------- - Index.all : Return whether all elements are True. - Series.all : Return whether all elements are True. - - Notes - ----- - Not a Number (NaN), positive infinity and negative infinity - evaluate to True because these are not equal to zero. 
- - Examples - -------- - >>> index = pd.Index([0, 1, 2]) - >>> index.any() - True - - >>> index = pd.Index([0, 0, 0]) - >>> index.any() - False - """ - ) + # FIXME: docstr inaccurate, args/kwargs not passed - def _make_logical_function(name: str_t, desc: str_t, f): - @Substitution(outname=name, desc=desc) - @Appender(_index_shared_docs["index_" + name]) - @Appender(_doc) - def logical_func(self, *args, **kwargs): - result = f(self.values) - if ( - isinstance(result, (np.ndarray, ABCSeries, Index)) - and result.ndim == 0 - ): - # return NumPy type - return result.dtype.type(result.item()) - else: # pragma: no cover - return result - - logical_func.__name__ = name - return logical_func - - cls.all = _make_logical_function( - "all", "Return whether all elements are True.", np.all - ) - cls.any = _make_logical_function( - "any", "Return whether any element is True.", np.any - ) + self._maybe_disable_logical_methods("all") + return np.all(self.values) - @classmethod - def _add_logical_methods_disabled(cls): + @final + def _maybe_disable_logical_methods(self, opname: str_t): """ - Add in logical methods to disable. + raise if this Index subclass does not support any or all. """ - cls.all = make_invalid_op("all") - cls.any = make_invalid_op("any") + if ( + isinstance(self, ABCMultiIndex) + or needs_i8_conversion(self.dtype) + or is_interval_dtype(self.dtype) + or is_categorical_dtype(self.dtype) + or is_float_dtype(self.dtype) + ): + # This call will raise + make_invalid_op(opname)(self) @property - def shape(self): + def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. """ @@ -5495,11 +5799,6 @@ def shape(self): return self._values.shape -Index._add_numeric_methods_disabled() -Index._add_logical_methods() -Index._add_comparison_methods() - - def ensure_index_from_sequences(sequences, names=None): """ Construct an index from sequences of data. @@ -5540,7 +5839,9 @@ def ensure_index_from_sequences(sequences, names=None): return MultiIndex.from_arrays(sequences, names=names) -def ensure_index(index_like, copy: bool = False): +def ensure_index( + index_like: Union[AnyArrayLike, Sequence], copy: bool = False +) -> Index: """ Ensure that we have an index from some index-like object. 
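A sketch of the ``ensure_index`` helper whose signature is annotated above (an internal API in ``pandas.core.indexes.base``); the example is an editorial addition:

    import pandas as pd
    from pandas.core.indexes.base import ensure_index

    ensure_index(["a", "b"])                # Index(['a', 'b'], dtype='object')
    # a sequence of array-likes becomes a MultiIndex
    ensure_index([["a", "a"], ["b", "c"]])  # MultiIndex([('a', 'b'), ('a', 'c')])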
@@ -5576,7 +5877,18 @@ def ensure_index(index_like, copy: bool = False): index_like = index_like.copy() return index_like if hasattr(index_like, "name"): - return Index(index_like, name=index_like.name, copy=copy) + # https://github.com/python/mypy/issues/1424 + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # Sequence[Any]]" has no attribute "name" [union-attr] + # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" + # has no attribute "name" [union-attr] + # error: "Sequence[Any]" has no attribute "name" [attr-defined] + # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no + # attribute "name" [union-attr] + # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no + # attribute "name" [union-attr] + name = index_like.name # type: ignore[union-attr, attr-defined] + return Index(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) @@ -5594,6 +5906,13 @@ def ensure_index(index_like, copy: bool = False): return MultiIndex.from_arrays(converted) else: + if isinstance(converted, np.ndarray) and converted.dtype == np.int64: + # Check for overflows if we should actually be uint64 + # xref GH#35481 + alt = np.asarray(index_like) + if alt.dtype == np.uint64: + converted = alt + index_like = converted else: # clean_index_list does the equivalent of copying @@ -5631,7 +5950,7 @@ def _validate_join_method(method: str): raise ValueError(f"do not recognize join method {method}") -def default_index(n): +def default_index(n: int) -> "RangeIndex": from pandas.core.indexes.range import RangeIndex return RangeIndex(0, n, name=None) @@ -5723,9 +6042,9 @@ def _maybe_cast_data_without_dtype(subarr): """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, TimedeltaArray, ) @@ -5856,3 +6175,43 @@ def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs): return index.astype(object) return klass(data, dtype=dtype, copy=copy, name=name, **kwargs) + + +def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]: + """ + Return common name if all indices agree, otherwise None (level-by-level). + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + list + A list representing the unanimous 'names' found. + """ + name_tups = [tuple(i.names) for i in indexes] + name_sets = [{*ns} for ns in zip_longest(*name_tups)] + names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) + return names + + +def unpack_nested_dtype(other: Index) -> Index: + """ + When checking if our dtype is comparable with another, we need + to unpack CategoricalDtype to look at its categories.dtype. + + Parameters + ---------- + other : Index + + Returns + ------- + Index + """ + dtype = other.dtype + if is_categorical_dtype(dtype): + # If there is ever a SparseIndex, this could get dispatched + # here too. 
+ return dtype.categories + return other diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b0b008de69a94..6c9f839f4b8b2 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,4 +1,4 @@ -from typing import Any, List +from typing import Any, List, Optional import warnings import numpy as np @@ -6,35 +6,27 @@ from pandas._config import get_option from pandas._libs import index as libindex -from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default -from pandas._typing import Label +from pandas._typing import ArrayLike, Label from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, is_categorical_dtype, - is_interval_dtype, - is_list_like, is_scalar, - pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor -from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories -import pandas.core.common as com +from pandas.core.arrays.categorical import Categorical, contains from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name -from pandas.core.indexes.extension import ExtensionIndex, inherit_names -import pandas.core.missing as missing -from pandas.core.ops import get_op_result_name +from pandas.core.indexes.extension import NDArrayBackedExtensionIndex, inherit_names _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) +_index_doc_kwargs.update({"target_klass": "CategoricalIndex"}) @inherit_names( @@ -68,7 +60,7 @@ typ="method", overwrite=True, ) -class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): +class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. 
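A sketch of the new ``get_unanimous_names`` helper added above to ``pandas.core.indexes.base`` (internal API); the example is an editorial addition:

    import pandas as pd
    from pandas.core.indexes.base import get_unanimous_names

    a = pd.Index([1, 2], name="x")
    b = pd.Index([3, 4], name="x")
    c = pd.Index([5, 6], name="y")
    # names are compared level by level and collapse to None on disagreement
    get_unanimous_names(a, b)  # ('x',)
    get_unanimous_names(a, c)  # (None,)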
@@ -163,9 +155,14 @@ class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): _typ = "categoricalindex" + @property + def _can_hold_strings(self): + return self.categories._can_hold_strings + codes: np.ndarray categories: Index _data: Categorical + _values: Categorical @property def _engine_type(self): @@ -212,29 +209,6 @@ def __new__( return cls._simple_new(data, name=name) - def _create_from_codes(self, codes, dtype=None, name=None): - """ - *this is an internal non-public method* - - create the correct categorical from codes - - Parameters - ---------- - codes : new codes - dtype: CategoricalDtype, defaults to existing - name : optional name attribute, defaults to existing - - Returns - ------- - CategoricalIndex - """ - if dtype is None: - dtype = self.dtype - if name is None: - name = self.name - cat = Categorical.from_codes(codes, dtype=dtype) - return CategoricalIndex(cat, name=name) - @classmethod def _simple_new(cls, values: Categorical, name: Label = None): assert isinstance(values, Categorical), type(values) @@ -245,52 +219,74 @@ def _simple_new(cls, values: Categorical, name: Label = None): result._cache = {} result._reset_identity() - result._no_setting_name = False return result # -------------------------------------------------------------------- + # error: Argument 1 of "_shallow_copy" is incompatible with supertype + # "ExtensionIndex"; supertype defines the argument type as + # "Optional[ExtensionArray]" [override] @doc(Index._shallow_copy) - def _shallow_copy(self, values=None, name: Label = no_default): + def _shallow_copy( # type:ignore[override] + self, + values: Optional[Categorical] = None, + name: Label = no_default, + ): name = self.name if name is no_default else name if values is not None: + # In tests we only get here with Categorical objects that + # have matching .ordered, and values.categories a subset of + # our own. However we do _not_ have a dtype match in general. values = Categorical(values, dtype=self.dtype) return super()._shallow_copy(values=values, name=name) - def _is_dtype_compat(self, other) -> bool: + def _is_dtype_compat(self, other) -> Categorical: """ *this is an internal non-public method* provide a comparison between the dtype of self and other (coercing if needed) + Parameters + ---------- + other : Index + + Returns + ------- + Categorical + Raises ------ TypeError if the dtypes are not compatible """ if is_categorical_dtype(other): - if isinstance(other, CategoricalIndex): - other = other._values - if not other.is_dtype_equal(self): + other = extract_array(other) + if not other._categories_match_up_to_permutation(self): raise TypeError( "categories must match existing categories when appending" ) else: values = other - if not is_list_like(values): - values = [values] + cat = Categorical(other, dtype=self.dtype) other = CategoricalIndex(cat) if not other.isin(values).all(): raise TypeError( "cannot append a non-category item to a CategoricalIndex" ) + other = other._values + + if not ((other == values) | (isna(other) & isna(values))).all(): + # GH#37667 see test_equals_non_category + raise TypeError( + "categories must match existing categories when appending" + ) return other - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determine if two CategoricalIndex objects contain the same elements. 
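A sketch of the behavior ``_is_dtype_compat`` enforces above: values outside the categories raise ``TypeError``, which ``equals`` (below) catches and converts to ``False``. The example is an editorial addition:

    import pandas as pd

    ci = pd.CategoricalIndex(["a", "b", "a"], categories=["a", "b"])
    ci.equals(pd.Index(["a", "b", "a"]))  # True: same elements
    ci.equals(pd.Index(["a", "b", "c"]))  # False: "c" is not among the categories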
@@ -308,13 +304,10 @@ def equals(self, other) -> bool: try: other = self._is_dtype_compat(other) - if isinstance(other, type(self)): - other = other._data - return self._data.equals(other) except (TypeError, ValueError): - pass + return False - return False + return self._data.equals(other) # -------------------------------------------------------------------- # Rendering Methods @@ -337,7 +330,9 @@ def _format_attrs(self): "categories", ibase.default_pprint(self.categories, max_seq_items=max_categories), ), - ("ordered", self.ordered), + # pandas\core\indexes\category.py:315: error: "CategoricalIndex" + # has no attribute "ordered" [attr-defined] + ("ordered", self.ordered), # type: ignore[attr-defined] ] if self.name is not None: attrs.append(("name", ibase.default_pprint(self.name))) @@ -347,13 +342,13 @@ def _format_attrs(self): attrs.append(("length", len(self))) return attrs - def _format_with_header(self, header, na_rep="NaN") -> List[str]: - from pandas.io.formats.format import format_array + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: + from pandas.io.formats.printing import pprint_thing - formatted_values = format_array( - self._values, formatter=None, na_rep=na_rep, justify="left" - ) - result = ibase.trim_front(formatted_values) + result = [ + pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep + for x in self._values + ] return header + result # -------------------------------------------------------------------- @@ -367,11 +362,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. @@ -382,30 +372,14 @@ def __contains__(self, key: Any) -> bool: @doc(Index.astype) def astype(self, dtype, copy=True): - if dtype is not None: - dtype = pandas_dtype(dtype) - - if is_interval_dtype(dtype): - from pandas import IntervalIndex - - return IntervalIndex(np.array(self)) - elif is_categorical_dtype(dtype): - # GH 18630 - dtype = self.dtype.update_dtype(dtype) - if dtype == self.dtype: - return self.copy() if copy else self - - return Index.astype(self, dtype=dtype, copy=copy) - - @cache_readonly - def _isnan(self): - """ return if each value is nan""" - return self._data.codes == -1 + res_data = self._data.astype(dtype, copy=copy) + return Index(res_data, name=self.name) @doc(Index.fillna) def fillna(self, value, downcast=None): - self._assert_can_do_op(value) - return CategoricalIndex(self._data.fillna(value), name=self.name) + value = self._require_scalar(value) + cat = self._data.fillna(value) + return type(self)._simple_new(cat, name=self.name) @cache_readonly def _engine(self): @@ -424,32 +398,6 @@ def unique(self, level=None): # of result, not self. return type(self)._simple_new(result, name=self.name) - @doc(Index.duplicated) - def duplicated(self, keep="first"): - codes = self.codes.astype("i8") - return duplicated_int64(codes, keep) - - def _to_safe_for_reshape(self): - """ convert to object if we are a categorical """ - return self.astype("object") - - def _maybe_cast_indexer(self, key): - code = self.categories.get_loc(key) - code = self.codes.dtype.type(code) - return code - - @doc(Index.where) - def where(self, cond, other=None): - # TODO: Investigate an alternative implementation with - # 1. 
copy the underlying Categorical - # 2. setitem with `cond` and `other` - # 3. Rebuild CategoricalIndex. - if other is None: - other = self._na_value - values = np.where(cond, self._values, other) - cat = Categorical(values, dtype=self.dtype) - return type(self)._simple_new(cat, name=self.name) - def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -501,7 +449,8 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): codes = new_target.codes.copy() codes[indexer == -1] = cats[missing] - new_target = self._create_from_codes(codes) + cat = self._data._from_backing_data(codes) + new_target = type(self)._simple_new(cat, name=self.name) # we always want to return an Index type here # to be consistent with .reindex for other index types (e.g. they don't @@ -510,7 +459,8 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) if is_categorical_dtype(target): - new_target = target._shallow_copy(new_target, name=self.name) + new_target = Categorical(new_target, dtype=target.dtype) + new_target = type(self)._simple_new(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -533,59 +483,52 @@ def _reindex_non_unique(self, target): if not (cats == -1).any(): # .reindex returns normal Index. Revert to CategoricalIndex if # all targets are included in my categories - new_target = self._shallow_copy(new_target) + new_target = Categorical(new_target, dtype=self.dtype) + new_target = type(self)._simple_new(new_target, name=self.name) return new_target, indexer, new_indexer + # -------------------------------------------------------------------- + # Indexing Methods + + def _maybe_cast_indexer(self, key) -> int: + return self._data._unbox_scalar(key) + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - method = missing.clean_reindex_fill_method(method) - target = ibase.ensure_index(target) + def _get_indexer( + self, target: "Index", method=None, limit=None, tolerance=None + ) -> np.ndarray: + + self._check_indexing_method(method) if self.is_unique and self.equals(target): return np.arange(len(self), dtype="intp") - if method == "pad" or method == "backfill": - raise NotImplementedError( - "method='pad' and method='backfill' not " - "implemented yet for CategoricalIndex" - ) - elif method == "nearest": - raise NotImplementedError( - "method='nearest' not implemented yet for CategoricalIndex" - ) - - if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): - if self._values.equals(target._values): - # we have the same codes - codes = target.codes - else: - codes = recode_for_categories( - target.codes, target.categories, self._values.categories - ) - else: - if isinstance(target, CategoricalIndex): - code_indexer = self.categories.get_indexer(target.categories) - codes = take_1d(code_indexer, target.codes, fill_value=-1) - else: - codes = self.categories.get_indexer(target) - - indexer, _ = self._engine.get_indexer_non_unique(codes) - return ensure_platform_int(indexer) + return self._get_indexer_non_unique(target._values)[0] @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) + return 
self._get_indexer_non_unique(target._values) - if isinstance(target, CategoricalIndex): - # Indexing on codes is more efficient if categories are the same: - if target.categories is self.categories: - target = target.codes - indexer, missing = self._engine.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing - target = target._values + def _get_indexer_non_unique(self, values: ArrayLike): + """ + get_indexer_non_unique but after unwrapping the target Index object. + """ + # Note: we use engine.get_indexer_non_unique for get_indexer in addition + # to get_indexer_non_unique because, even if `target` is unique, any + # non-category entries in it will be encoded as -1 so `codes` may + # not be unique. + + if isinstance(values, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + cat = self._data._encode_with_my_categories(values) + codes = cat._codes + else: + codes = self.categories.get_indexer(values) - codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing @@ -595,29 +538,23 @@ def _convert_list_indexer(self, keyarr): # the categories if self.categories._defer_to_indexing: + # See tests.indexing.interval.test_interval:test_loc_getitem_frame indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - indexer = self.categories.get_indexer(np.asarray(keyarr)) - if (indexer == -1).any(): - raise KeyError( - "a list-indexer must only include values that are in the categories" - ) + return self.get_indexer_for(keyarr) - return self.get_indexer(keyarr) - - @doc(Index._convert_arr_indexer) - def _convert_arr_indexer(self, keyarr): - keyarr = com.asarray_tuplesafe(keyarr) + @doc(Index._maybe_cast_slice_bound) + def _maybe_cast_slice_bound(self, label, side: str, kind): + if kind == "loc": + return label - if self.categories._defer_to_indexing: - return keyarr + return super()._maybe_cast_slice_bound(label, side, kind) - return self._shallow_copy(keyarr) + # -------------------------------------------------------------------- - @doc(Index._convert_index_indexer) - def _convert_index_indexer(self, keyarr): - return self._shallow_copy(keyarr) + def _is_comparable_dtype(self, dtype): + return self.categories._is_comparable_dtype(dtype) def take_nd(self, *args, **kwargs): """Alias for `take`""" @@ -628,13 +565,6 @@ def take_nd(self, *args, **kwargs): ) return self.take(*args, **kwargs) - @doc(Index._maybe_cast_slice_bound) - def _maybe_cast_slice_bound(self, label, side, kind): - if kind == "loc": - return label - - return super()._maybe_cast_slice_bound(label, side, kind) def map(self, mapper): """ Map values using input correspondence (a dict, Series, or function). @@ -705,53 +635,19 @@ def map(self, mapper): mapped = self._values.map(mapper) return Index(mapped, name=self.name) - def delete(self, loc): - """ - Make new Index with passed location(-s) deleted - - Returns - ------- - new_index : Index - """ - return self._create_from_codes(np.delete(self.codes, loc)) - - def insert(self, loc: int, item): - """ - Make new Index inserting new item at location. 
Follows - Python list.append semantics for negative values - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - new_index : Index - - Raises - ------ - ValueError if the item is not in the categories - - """ - code = self.categories.get_indexer([item]) - if (code == -1) and not (is_scalar(item) and isna(item)): - raise TypeError( - "cannot insert an item into a CategoricalIndex " - "that is not already an existing category" - ) - - codes = self.codes - codes = np.concatenate((codes[:loc], code, codes[loc:])) - return self._create_from_codes(codes) - - def _concat(self, to_concat, name): + def _concat(self, to_concat: List["Index"], name: Label) -> Index: # if calling index is category, don't check dtype of others - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - result = self._create_from_codes(codes, name=name) - # if name is None, _create_from_codes sets self.name - result.name = name - return result + try: + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + except TypeError: + # not all to_concat elements are among our categories (or NA) + from pandas.core.dtypes.concat import concat_compat + + res = concat_compat(to_concat) + return Index(res, name=name) + else: + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=name) def _delegate_method(self, name: str, *args, **kwargs): """ method delegation to the ._values """ @@ -762,14 +658,3 @@ def _delegate_method(self, name: str, *args, **kwargs): if is_scalar(res): return res return CategoricalIndex(res, name=self.name) - - def _wrap_joined_index( - self, joined: np.ndarray, other: "CategoricalIndex" - ) -> "CategoricalIndex": - name = get_op_result_name(self, other) - return self._create_from_codes(joined, name=name) - - -CategoricalIndex._add_numeric_methods_add_sub_disabled() -CategoricalIndex._add_numeric_methods_disabled() -CategoricalIndex._add_logical_methods_disabled() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7be6aa50fa16b..f0d4d36531e0d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,21 +2,19 @@ Base and utility classes for tseries type pandas objects. 
""" from datetime import datetime -from typing import Any, List, Optional, TypeVar, Union, cast +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, TypeVar, Union, cast import numpy as np from pandas._libs import NaT, Timedelta, iNaT, join as libjoin, lib -from pandas._libs.tslibs import BaseOffset, Resolution, Tick, timezones -from pandas._libs.tslibs.parsing import DateParseError -from pandas._typing import Label +from pandas._libs.tslibs import BaseOffset, Resolution, Tick +from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv -from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( - ensure_int64, is_bool_dtype, + is_categorical_dtype, is_dtype_equal, is_integer, is_list_like, @@ -24,26 +22,24 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCSeries -from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin -from pandas.core.base import IndexOpsMixin import pandas.core.common as com -from pandas.core.construction import array as pd_array, extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.extension import ( - ExtensionIndex, + NDArrayBackedExtensionIndex, inherit_names, make_wrapped_arith_op, ) from pandas.core.indexes.numeric import Int64Index -from pandas.core.ops import get_op_result_name -from pandas.core.sorting import ensure_key_mapped from pandas.core.tools.timedeltas import to_timedelta +if TYPE_CHECKING: + from pandas import CategoricalIndex + _index_doc_kwargs = dict(ibase._index_doc_kwargs) _T = TypeVar("_T", bound="DatetimeIndexOpsMixin") @@ -54,18 +50,25 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): Create the join wrapper methods. 
""" - @staticmethod # type: ignore + # error: 'staticmethod' used with a non-method + @staticmethod # type: ignore[misc] def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + # Note: these only get called with left.dtype == right.dtype + if isinstance( + left, (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin) + ): left = left.view("i8") - if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + if isinstance( + right, + (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin), + ): right = right.view("i8") results = joinf(left, right) if with_indexers: # dtype should be timedelta64[ns] for TimedeltaIndex # and datetime64[ns] for DatetimeIndex - dtype = left.dtype.base + dtype = cast(np.dtype, left.dtype).base join_index, left_indexer, right_indexer = results join_index = join_index.view(dtype) @@ -76,43 +79,65 @@ def wrapper(left, right): @inherit_names( - ["inferred_freq", "_isnan", "_resolution_obj", "resolution"], + ["inferred_freq", "_resolution_obj", "resolution"], DatetimeLikeArrayMixin, cache=True, ) -@inherit_names( - ["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin, -) -class DatetimeIndexOpsMixin(ExtensionIndex): +@inherit_names(["mean", "asi8", "freq", "freqstr"], DatetimeLikeArrayMixin) +class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. """ + _can_hold_strings = False _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] + _data_cls: Union[Type[DatetimeArray], Type[TimedeltaArray], Type[PeriodArray]] freq: Optional[BaseOffset] freqstr: Optional[str] _resolution_obj: Resolution _bool_ops: List[str] = [] _field_ops: List[str] = [] - hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore + # error: "Callable[[Any], Any]" has no attribute "fget" + hasnans = cache_readonly( + DatetimeLikeArrayMixin._hasnans.fget # type: ignore[attr-defined] + ) _hasnans = hasnans # for index / array -agnostic code + @classmethod + def _simple_new( + cls, + values: Union[DatetimeArray, TimedeltaArray, PeriodArray], + name: Label = None, + ): + assert isinstance(values, cls._data_cls), type(values) + + result = object.__new__(cls) + result._data = values + result._name = name + result._cache = {} + + # For groupby perf. See note in indexes/base about _index_data + result._index_data = values._data + + result._reset_identity() + return result + @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: return True # ------------------------------------------------------------------------ # Abstract data attributes @property - def values(self): + def values(self) -> np.ndarray: # Note: PeriodArray overrides this to return an ndarray of objects. return self._data._data def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc. + Gets called after a ufunc and other functions. """ result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result): @@ -126,24 +151,35 @@ def __array_wrap__(self, result, context=None): # ------------------------------------------------------------------------ - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. 
""" if self.is_(other): return True - if not isinstance(other, ABCIndexClass): + if not isinstance(other, Index): + return False + elif other.dtype.kind in ["f", "i", "u", "c"]: return False elif not isinstance(other, type(self)): - try: - other = type(self)(other) - except (ValueError, TypeError, OverflowError): - # e.g. - # ValueError -> cannot parse str entry, or OutOfBoundsDatetime - # TypeError -> trying to convert IntervalIndex to DatetimeIndex - # OverflowError -> Index([very_large_timedeltas]) - return False + should_try = False + inferrable = self._data._infer_matches + if other.dtype == object: + should_try = other.inferred_type in inferrable + elif is_categorical_dtype(other.dtype): + other = cast("CategoricalIndex", other) + should_try = other.categories.inferred_type in inferrable + + if should_try: + try: + other = type(self)(other) + except (ValueError, TypeError, OverflowError): + # e.g. + # ValueError -> cannot parse str entry, or OutOfBoundsDatetime + # TypeError -> trying to convert IntervalIndex to DatetimeIndex + # OverflowError -> Index([very_large_timedeltas]) + return False if not is_dtype_equal(self.dtype, other.dtype): # have different timezone @@ -162,46 +198,20 @@ def __contains__(self, key: Any) -> bool: is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res)) ) - def sort_values(self, return_indexer=False, ascending=True, key=None): - """ - Return sorted copy of Index. - """ - idx = ensure_key_mapped(self, key) - - _as = idx.argsort() - if not ascending: - _as = _as[::-1] - sorted_index = self.take(_as) - - if return_indexer: - return sorted_index, _as - else: - return sorted_index - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = ensure_int64(indices) + nv.validate_take((), kwargs) + indices = np.asarray(indices, dtype=np.intp) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) - if isinstance(maybe_slice, slice): - return self[maybe_slice] - return ExtensionIndex.take( + result = NDArrayBackedExtensionIndex.take( self, indices, axis, allow_fill, fill_value, **kwargs ) - - @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") - def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, str): - raise TypeError( - "searchsorted requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) - if isinstance(value, Index): - value = value._data - - return self._data.searchsorted(value, side=side, sorter=sorter) + if isinstance(maybe_slice, slice): + freq = self._data._get_getitem_freq(maybe_slice) + result._data._freq = freq + return result _can_hold_na = True @@ -238,23 +248,23 @@ def min(self, axis=None, skipna=True, *args, **kwargs): return self._na_value i8 = self.asi8 - try: + + if len(i8) and self.is_monotonic_increasing: # quick check - if len(i8) and self.is_monotonic: - if i8[0] != iNaT: - return self._box_func(i8[0]) - - if self.hasnans: - if skipna: - min_stamp = self[~self._isnan].asi8.min() - else: - return self._na_value - else: - min_stamp = i8.min() - return self._box_func(min_stamp) - except ValueError: + if i8[0] != iNaT: + return self._data._box_func(i8[0]) + + if self.hasnans: + if not skipna: + return self._na_value + i8 = i8[~self._isnan] + + if not len(i8): return self._na_value + min_stamp = i8.min() + return self._data._box_func(min_stamp) + def argmin(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the minimum 
values along an axis. @@ -295,23 +305,23 @@ def max(self, axis=None, skipna=True, *args, **kwargs): return self._na_value i8 = self.asi8 - try: + + if len(i8) and self.is_monotonic: # quick check - if len(i8) and self.is_monotonic: - if i8[-1] != iNaT: - return self._box_func(i8[-1]) - - if self.hasnans: - if skipna: - max_stamp = self[~self._isnan].asi8.max() - else: - return self._na_value - else: - max_stamp = i8.max() - return self._box_func(max_stamp) - except ValueError: + if i8[-1] != iNaT: + return self._data._box_func(i8[-1]) + + if self.hasnans: + if not skipna: + return self._na_value + i8 = i8[~self._isnan] + + if not len(i8): return self._na_value + max_stamp = i8.max() + return self._data._box_func(max_stamp) + def argmax(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the maximum values along an axis. @@ -338,14 +348,39 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods - def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]: + def format( + self, + name: bool = False, + formatter: Optional[Callable] = None, + na_rep: str = "NaT", + date_format: Optional[str] = None, + ) -> List[str]: + """ + Render a string representation of the Index. + """ + header = [] + if name: + header.append( + ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + + def _format_with_header( + self, header: List[str], na_rep: str = "NaT", date_format: Optional[str] = None + ) -> List[str]: return header + list( self._format_native_types(na_rep=na_rep, date_format=date_format) ) @property def _formatter_func(self): - raise AbstractMethodError(self) + return self._data._formatter() def _format_attrs(self): """ @@ -360,6 +395,36 @@ def _format_attrs(self): attrs.append(("freq", freq)) return attrs + def _summary(self, name=None) -> str: + """ + Return a summarized representation. + + Parameters + ---------- + name : str + Name to use in the summary representation. + + Returns + ------- + str + Summarized representation of the index. 
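A short usage sketch of the `format` method being added above (index values hypothetical): `name=True` prepends a header entry for the index name, and `date_format` is forwarded to the native-type formatter.

```python
import pandas as pd

dti = pd.date_range("2021-01-01", periods=2, name="when")

# Header first, then one formatted string per element.
print(dti.format(name=True, date_format="%Y-%m-%d"))
# ['when', '2021-01-01', '2021-01-02']

# A custom formatter bypasses the date handling entirely.
print(dti.format(formatter=lambda ts: ts.strftime("%d %b %Y")))
# ['01 Jan 2021', '02 Jan 2021']
```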
+ """ + formatter = self._formatter_func + if len(self) > 0: + index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" + else: + index_summary = "" + + if name is None: + name = type(self).__name__ + result = f"{name}: {len(self)} entries{index_summary}" + if self.freq: + result += f"\nFreq: {self.freqstr}" + + # display as values, not quoted + result = result.replace("'", "") + return result + # -------------------------------------------------------------------- # Indexing Methods @@ -373,16 +438,12 @@ def _partial_date_slice( self, reso: Resolution, parsed: datetime, - use_lhs: bool = True, - use_rhs: bool = True, ): """ Parameters ---------- reso : Resolution parsed : datetime - use_lhs : bool, default True - use_rhs : bool, default True Returns ------- @@ -391,14 +452,13 @@ def _partial_date_slice( self._validate_partial_date_slice(reso) t1, t2 = self._parsed_string_to_bounds(reso, parsed) - i8vals = self.asi8 - unbox = self._data._unbox_scalar + vals = self._data._ndarray + unbox = self._data._unbox - if self.is_monotonic: + if self.is_monotonic_increasing: if len(self) and ( - (use_lhs and t1 < self[0] and t2 < self[0]) - or ((use_rhs and t1 > self[-1] and t2 > self[-1])) + (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1]) ): # we are out of range raise KeyError @@ -406,14 +466,13 @@ def _partial_date_slice( # TODO: does this depend on being monotonic _increasing_? # a monotonic (sorted) series can be sliced - # Use asi8.searchsorted to avoid re-validating Periods/Timestamps - left = i8vals.searchsorted(unbox(t1), side="left") if use_lhs else None - right = i8vals.searchsorted(unbox(t2), side="right") if use_rhs else None + left = vals.searchsorted(unbox(t1), side="left") + right = vals.searchsorted(unbox(t2), side="right") return slice(left, right) else: - lhs_mask = (i8vals >= unbox(t1)) if use_lhs else True - rhs_mask = (i8vals <= unbox(t2)) if use_rhs else True + lhs_mask = vals >= unbox(t1) + rhs_mask = vals <= unbox(t2) # try to find the dates return (lhs_mask & rhs_mask).nonzero()[0] @@ -438,75 +497,6 @@ def _partial_date_slice( __truediv__ = make_wrapped_arith_op("__truediv__") __rtruediv__ = make_wrapped_arith_op("__rtruediv__") - def isin(self, values, level=None): - """ - Compute boolean array of whether each index value is found in the - passed set of values. - - Parameters - ---------- - values : set or sequence of values - - Returns - ------- - is_contained : ndarray (boolean dtype) - """ - if level is not None: - self._validate_index_level(level) - - if not isinstance(values, type(self)): - try: - values = type(self)(values) - except ValueError: - return self.astype(object).isin(values) - - return algorithms.isin(self.asi8, values.asi8) - - @Appender(Index.where.__doc__) - def where(self, cond, other=None): - values = self.view("i8") - - try: - other = self._data._validate_where_value(other) - except (TypeError, ValueError) as err: - # Includes tzawareness mismatch and IncompatibleFrequencyError - oth = getattr(other, "dtype", other) - raise TypeError(f"Where requires matching dtype, not {oth}") from err - - result = np.where(cond, values, other).astype("i8") - arr = type(self._data)._simple_new(result, dtype=self.dtype) - return type(self)._simple_new(arr, name=self.name) - - def _summary(self, name=None) -> str: - """ - Return a summarized representation. - - Parameters - ---------- - name : str - Name to use in the summary representation. - - Returns - ------- - str - Summarized representation of the index. 
- """ - formatter = self._formatter_func - if len(self) > 0: - index_summary = f", {formatter(self[0])} to {formatter(self[-1])}" - else: - index_summary = "" - - if name is None: - name = type(self).__name__ - result = f"{name}: {len(self)} entries{index_summary}" - if self.freq: - result += f"\nFreq: {self.freqstr}" - - # display as values, not quoted - result = result.replace("'", "") - return result - def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -545,56 +535,95 @@ def shift(self, periods=1, freq=None): # -------------------------------------------------------------------- # List-like Methods - def delete(self, loc): - new_i8s = np.delete(self.asi8, loc) + def _get_delete_freq(self, loc: int): + """ + Find the `freq` for self.delete(loc). + """ + freq = None + if is_period_dtype(self.dtype): + freq = self.freq + elif self.freq is not None: + if is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + loc = lib.maybe_indices_to_slice( + np.asarray(loc, dtype=np.intp), len(self) + ) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq + return freq + + def _get_insert_freq(self, loc, item): + """ + Find the `freq` for self.insert(loc, item). + """ + value = self._data._validate_scalar(item) + item = self._data._box_func(value) freq = None if is_period_dtype(self.dtype): freq = self.freq - elif is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): + elif self.freq is not None: + # freq can be preserved on edge cases + if self.size: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + else: + # Adding a single item to an empty index may preserve freq + if self.freq.is_on_offset(item): freq = self.freq + return freq - arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) - return type(self)._simple_new(arr, name=self.name) + @doc(NDArrayBackedExtensionIndex.delete) + def delete(self, loc): + result = super().delete(loc) + result._data._freq = self._get_delete_freq(loc) + return result + + @doc(NDArrayBackedExtensionIndex.insert) + def insert(self, loc: int, item): + result = super().insert(loc, item) + + result._data._freq = self._get_insert_freq(loc, item) + return result # -------------------------------------------------------------------- # Join/Set Methods - def _wrap_joined_index(self, joined: np.ndarray, other): - assert other.dtype == self.dtype, (other.dtype, self.dtype) - name = get_op_result_name(self, other) + def _can_union_without_object_cast(self, other) -> bool: + return is_dtype_equal(self.dtype, other.dtype) + def _get_join_freq(self, other): + """ + Get the freq to attach to the result of a join operation. 
+ """ if is_period_dtype(self.dtype): freq = self.freq else: self = cast(DatetimeTimedeltaMixin, self) freq = self.freq if self._can_fast_union(other) else None - new_data = type(self._data)._simple_new(joined, dtype=self.dtype, freq=freq) + return freq + + def _wrap_joined_index(self, joined: np.ndarray, other): + assert other.dtype == self.dtype, (other.dtype, self.dtype) - return type(self)._simple_new(new_data, name=name) + result = super()._wrap_joined_index(joined, other) + result._data._freq = self._get_join_freq(other) + return result @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): - if lib.infer_dtype(keyarr) == "string": - # Weak reasoning that indexer is a list of strings - # representing datetime or timedelta or period - try: - extension_arr = pd_array(keyarr, self.dtype) - except (ValueError, DateParseError): - # Fail to infer keyarr from self.dtype - return keyarr - - converted_arr = extract_array(extension_arr, extract_numpy=True) - else: - converted_arr = com.asarray_tuplesafe(keyarr) - return converted_arr + try: + return self._data._validate_listlike(keyarr, allow_object=True) + except (ValueError, TypeError): + return com.asarray_tuplesafe(keyarr) class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): @@ -612,20 +641,13 @@ def _with_freq(self, freq): arr = self._data._with_freq(freq) return type(self)._simple_new(arr, name=self.name) - def _shallow_copy(self, values=None, name: Label = lib.no_default): - name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - - if values is None: - values = self._data - - if isinstance(values, np.ndarray): - # TODO: We would rather not get here - values = type(self._data)(values, dtype=self.dtype) + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return False - result = type(self)._simple_new(values, name=name) - result._cache = cache - return result + def is_type_compatible(self, kind: str) -> bool: + return kind in self._data._infer_matches # -------------------------------------------------------------------- # Set Operation Methods @@ -664,33 +686,35 @@ def intersection(self, other, sort=False): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - res_name = get_op_result_name(self, other) + other, _ = self._convert_can_do_setop(other) if self.equals(other): + if self.has_duplicates: + return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) + return self._intersection(other, sort=sort) + + def _intersection(self, other: Index, sort=False) -> Index: + """ + intersection specialized to the case with matching dtypes. + """ if len(self) == 0: - return self.copy() + return self.copy()._get_reconciled_name_object(other) if len(other) == 0: - return other.copy() + return other.copy()._get_reconciled_name_object(self) if not isinstance(other, type(self)): result = Index.intersection(self, other, sort=sort) - if isinstance(result, type(self)): - if result.freq is None: - # TODO: no tests rely on this; needed? 
- result = result._with_freq("infer") - assert result.name == res_name return result elif not self._can_fast_intersect(other): - result = Index.intersection(self, other, sort=sort) - assert result.name == res_name - # We need to invalidate the freq because Index.intersection + result = Index._intersection(self, other, sort=sort) + # We need to invalidate the freq because Index._intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. - result = result._with_freq(None)._with_freq("infer") - return result + result = self._wrap_setop_result(other, result) + return result._with_freq(None)._with_freq("infer") # to make our life easier, "sort" the two ranges if self[0] <= other[0]: @@ -704,11 +728,16 @@ def intersection(self, other, sort=False): start = right[0] if end < start: - return type(self)(data=[], dtype=self.dtype, freq=self.freq, name=res_name) + result = self[:0] else: lslice = slice(*left.slice_locs(start, end)) left_chunk = left._values[lslice] - return type(self)._simple_new(left_chunk, name=res_name) + # error: Argument 1 to "_simple_new" of "DatetimeIndexOpsMixin" has + # incompatible type "Union[ExtensionArray, Any]"; expected + # "Union[DatetimeArray, TimedeltaArray, PeriodArray]" [arg-type] + result = type(self)._simple_new(left_chunk) # type: ignore[arg-type] + + return self._wrap_setop_result(other, result) def _can_fast_intersect(self: _T, other: _T) -> bool: if self.freq is None: @@ -803,7 +832,7 @@ def _fast_union(self, other, sort=None): # The can_fast_union check ensures that the result.freq # should match self.freq dates = type(self._data)(dates, freq=self.freq) - result = type(self)._simple_new(dates, name=self.name) + result = type(self)._simple_new(dates) return result else: return left @@ -828,10 +857,14 @@ def _union(self, other, sort): result = result._with_freq("infer") return result else: - i8self = Int64Index._simple_new(self.asi8, name=self.name) - i8other = Int64Index._simple_new(other.asi8, name=other.name) + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) i8result = i8self._union(i8other, sort=sort) - result = type(self)(i8result, dtype=self.dtype, freq="infer") + # pandas\core\indexes\datetimelike.py:887: error: Unexpected + # keyword argument "freq" for "DatetimeTimedeltaMixin" [call-arg] + result = type(self)( + i8result, dtype=self.dtype, freq="infer" # type: ignore[call-arg] + ) return result # -------------------------------------------------------------------- @@ -851,11 +884,11 @@ def join( """ See Index.join """ - if self._is_convertible_to_index_for_join(other): - try: - other = type(self)(other) - except (TypeError, ValueError): - pass + pself, pother = self._maybe_promote(other) + if pself is not self or pother is not other: + return pself.join( + pother, how=how, level=level, return_indexers=return_indexers, sort=sort + ) this, other = self._maybe_utc_convert(other) return Index.join( @@ -867,84 +900,18 @@ def join( sort=sort, ) - def _maybe_utc_convert(self, other): - this = self - if not hasattr(self, "tz"): - return this, other - - if isinstance(other, type(self)): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other - - @classmethod - def 
_is_convertible_to_index_for_join(cls, other: Index) -> bool: - """ - return a boolean whether I can attempt conversion to a - DatetimeIndex/TimedeltaIndex - """ - if isinstance(other, cls): - return False - elif len(other) > 0 and other.inferred_type not in ( - "floating", - "mixed-integer", - "integer", - "integer-na", - "mixed-integer-float", - "mixed", - ): - return True - return False + def _maybe_utc_convert(self: _T, other: Index) -> Tuple[_T, Index]: + # Overridden by DatetimeIndex + return self, other # -------------------------------------------------------------------- # List-Like Methods + @Appender(DatetimeIndexOpsMixin.insert.__doc__) def insert(self, loc, item): - """ - Make new Index inserting new item at location - - Parameters - ---------- - loc : int - item : object - if not either a Python datetime or a numpy integer-like, returned - Index dtype will be object rather than datetime. - - Returns - ------- - new_index : Index - """ if isinstance(item, str): # TODO: Why are strings special? # TODO: Should we attempt _scalar_from_string? return self.astype(object).insert(loc, item) - item = self._data._validate_insert_value(item) - - freq = None - # check freq can be preserved on edge cases - if self.freq is not None: - if self.size: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - else: - # Adding a single item to an empty index may preserve freq - if self.freq.is_on_offset(item): - freq = self.freq - - item = self._data._unbox_scalar(item) - - new_i8s = np.concatenate([self[:loc].asi8, [item], self[loc:].asi8]) - arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) - return type(self)._simple_new(arr, name=self.name) + return DatetimeIndexOpsMixin.insert(self, loc, item) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6d2e592f024ed..8329c41a74596 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,6 +1,6 @@ from datetime import date, datetime, time, timedelta, tzinfo import operator -from typing import Optional +from typing import TYPE_CHECKING, Optional, Tuple import warnings import numpy as np @@ -14,28 +14,28 @@ to_offset, ) from pandas._libs.tslibs.offsets import prefix_mapping -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_float, - is_integer, is_scalar, ) from pandas.core.dtypes.missing import is_valid_nat_for_dtype from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype import pandas.core.common as com -from pandas.core.indexes.base import Index, maybe_extract_name +from pandas.core.indexes.base import Index, get_unanimous_names, maybe_extract_name from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin from pandas.core.indexes.extension import inherit_names from pandas.core.tools.times import to_time +if TYPE_CHECKING: + from pandas import DataFrame, Float64Index, PeriodIndex, TimedeltaIndex + def _new_DatetimeIndex(cls, d): """ @@ -70,12 +70,11 @@ def _new_DatetimeIndex(cls, d): @inherit_names( - ["to_perioddelta", "to_julian_date", "strftime", "isocalendar"] - + DatetimeArray._field_ops + DatetimeArray._field_ops + [ method for method in 
DatetimeArray._datetimelike_methods - if method not in ("tz_localize",) + if method not in ("tz_localize", "tz_convert") ], DatetimeArray, wrap=True, @@ -97,6 +96,7 @@ def _new_DatetimeIndex(cls, d): "date", "time", "timetz", + "std", ] + DatetimeArray._bool_ops, DatetimeArray, @@ -162,9 +162,11 @@ class DatetimeIndex(DatetimeTimedeltaMixin): time timetz dayofyear + day_of_year weekofyear week dayofweek + day_of_week weekday quarter tz @@ -197,6 +199,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): month_name day_name mean + std See Also -------- @@ -214,6 +217,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _typ = "datetimeindex" + _data_cls = DatetimeArray _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True @@ -223,10 +227,21 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _is_numeric_dtype = False _data: DatetimeArray + inferred_freq: Optional[str] tz: Optional[tzinfo] # -------------------------------------------------------------------- - # methods that dispatch to array and wrap result in DatetimeIndex + # methods that dispatch to DatetimeArray and wrap result + + @doc(DatetimeArray.strftime) + def strftime(self, date_format) -> Index: + arr = self._data.strftime(date_format) + return Index(arr, name=self.name) + + @doc(DatetimeArray.tz_convert) + def tz_convert(self, tz) -> "DatetimeIndex": + arr = self._data.tz_convert(tz) + return type(self)._simple_new(arr, name=self.name) @doc(DatetimeArray.tz_localize) def tz_localize( @@ -236,9 +251,30 @@ def tz_localize( return type(self)._simple_new(arr, name=self.name) @doc(DatetimeArray.to_period) - def to_period(self, freq=None) -> "DatetimeIndex": + def to_period(self, freq=None) -> "PeriodIndex": + from pandas.core.indexes.api import PeriodIndex + arr = self._data.to_period(freq) - return type(self)._simple_new(arr, name=self.name) + return PeriodIndex._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_perioddelta) + def to_perioddelta(self, freq) -> "TimedeltaIndex": + from pandas.core.indexes.api import TimedeltaIndex + + arr = self._data.to_perioddelta(freq) + return TimedeltaIndex._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_julian_date) + def to_julian_date(self) -> "Float64Index": + from pandas.core.indexes.api import Float64Index + + arr = self._data.to_julian_date() + return Float64Index._simple_new(arr, name=self.name) + + @doc(DatetimeArray.isocalendar) + def isocalendar(self) -> "DataFrame": + df = self._data.isocalendar() + return df.set_index(self) # -------------------------------------------------------------------- # Constructors @@ -268,7 +304,7 @@ def __new__( name = maybe_extract_name(name, data, cls) - dtarr = DatetimeArray._from_sequence( + dtarr = DatetimeArray._from_sequence_not_strict( data, dtype=dtype, copy=copy, @@ -282,20 +318,6 @@ def __new__( subarr = cls._simple_new(dtarr, name=name) return subarr - @classmethod - def _simple_new(cls, values: DatetimeArray, name: Label = None): - assert isinstance(values, DatetimeArray), type(values) - - result = object.__new__(cls) - result._data = values - result.name = name - result._cache = {} - result._no_setting_name = False - # For groupby perf. 
See note in indexes/base about _index_data - result._index_data = values._data - result._reset_identity() - return result - # -------------------------------------------------------------------- @cache_readonly @@ -307,33 +329,29 @@ def _is_dates_only(self) -> bool: ------- bool """ - from pandas.io.formats.format import _is_dates_only + from pandas.io.formats.format import is_dates_only - return self.tz is None and _is_dates_only(self._values) + return self.tz is None and is_dates_only(self._values) def __reduce__(self): # we use a special reduce here because we need # to simply set the .tz (and not reinterpret it) - d = dict(data=self._data) + d = {"data": self._data} d.update(self._get_attributes_dict()) return _new_DatetimeIndex, (type(self), d), None - def _convert_for_op(self, value): + def _validate_fill_value(self, value): """ Convert value to be insertable to ndarray. """ - if self._has_same_tz(value): - return Timestamp(value).asm8 - raise ValueError("Passed item and index have different timezone") + return self._data._validate_setitem_value(value) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ - if not is_datetime64_any_dtype(dtype): - return False if self.tz is not None: # If we have tz, we can compare to tzaware return is_datetime64tz_dtype(dtype) @@ -349,10 +367,10 @@ def _mpl_repr(self): @property def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 + from pandas.io.formats.format import get_format_datetime64 - formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: f"'{formatter(x, tz=self.tz)}'" + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: f"'{formatter(x)}'" # -------------------------------------------------------------------- # Set Operation Methods @@ -380,8 +398,27 @@ def union_many(self, others): this = this._fast_union(other) else: this = Index.union(this, other) + + res_name = get_unanimous_names(self, *others)[0] + if this.name != res_name: + return this.rename(res_name) return this + def _maybe_utc_convert(self, other: Index) -> Tuple["DatetimeIndex", Index]: + this = self + + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + # -------------------------------------------------------------------- def _get_time_micros(self): @@ -392,9 +429,7 @@ def _get_time_micros(self): ------- ndarray[int64_t] """ - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._data._local_timestamps() + values = self._data._local_timestamps() nanos = values % (24 * 3600 * 1_000_000_000) micros = nanos // 1000 @@ -504,6 +539,9 @@ def snap(self, freq="S"): dta = DatetimeArray(snapped, dtype=self.dtype) return DatetimeIndex._simple_new(dta, name=self.name) + # -------------------------------------------------------------------- + # Indexing Methods + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): """ Calculate datetime bounds for parsed time string and its resolution. @@ -574,6 +612,28 @@ def _validate_partial_date_slice(self, reso: Resolution): # _parsed_string_to_bounds allows it. 
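The `_maybe_utc_convert` override above centralizes the timezone rules for joins. A brief sketch of the two cases (timezones hypothetical):

```python
import pandas as pd

utc = pd.date_range("2021-01-01", periods=3, tz="UTC")
ny = utc.tz_convert("America/New_York")

# Two tz-aware indexes with different zones are both converted to UTC
# before joining, so the result is tz-aware in UTC.
print(utc.join(ny).tz)  # UTC

# Mixing tz-aware and tz-naive raises, matching the guard above:
# utc.join(utc.tz_localize(None))
# TypeError: Cannot join tz-naive with tz-aware DatetimeIndex
```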
raise KeyError + def _deprecate_mismatched_indexing(self, key): + # GH#36148 + # we get here with isinstance(key, self._data._recognized_scalars) + try: + self._data._assert_tzawareness_compat(key) + except TypeError: + if self.tz is None: + msg = ( + "Indexing a timezone-naive DatetimeIndex with a " + "timezone-aware datetime is deprecated and will " + "raise KeyError in a future version. " + "Use a timezone-naive object instead." + ) + else: + msg = ( + "Indexing a timezone-aware DatetimeIndex with a " + "timezone-naive datetime is deprecated and will " + "raise KeyError in a future version. " + "Use a timezone-aware object instead." + ) + warnings.warn(msg, FutureWarning, stacklevel=5) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -591,6 +651,7 @@ def get_loc(self, key, method=None, tolerance=None): if isinstance(key, self._data._recognized_scalars): # needed to localize naive datetimes + self._deprecate_mismatched_indexing(key) key = self._maybe_cast_for_get_loc(key) elif isinstance(key, str): @@ -627,7 +688,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(orig_key) from err def _maybe_cast_for_get_loc(self, key) -> Timestamp: - # needed to localize naive datetimes + # needed to localize naive datetimes or dates (GH 35690) key = Timestamp(key) if key.tzinfo is None: key = key.tz_localize(self.tz) @@ -655,12 +716,13 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): """ assert kind in ["loc", "getitem", None] - if is_float(label) or isinstance(label, time) or is_integer(label): - self._invalid_indexer("slice", label) - if isinstance(label, str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) - parsed, reso = parsing.parse_time_string(label, freq) + try: + parsed, reso = parsing.parse_time_string(label, freq) + except parsing.DateParseError as err: + raise self._invalid_indexer("slice", label) from err + reso = Resolution.from_attrname(reso) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: @@ -672,14 +734,18 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): if self._is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == "left" else lower return lower if side == "left" else upper + elif isinstance(label, (self._data._recognized_scalars, date)): + self._deprecate_mismatched_indexing(label) else: - return label + raise self._invalid_indexer("slice", label) + + return self._maybe_cast_for_get_loc(label) - def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + def _get_string_slice(self, key: str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) parsed, reso = parsing.parse_time_string(key, freq) reso = Resolution.from_attrname(reso) - loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) + loc = self._partial_date_slice(reso, parsed) return loc def slice_indexer(self, start=None, end=None, step=None, kind=None): @@ -723,15 +789,26 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): if (start is None or isinstance(start, str)) and ( end is None or isinstance(end, str) ): - mask = True + mask = np.array(True) + deprecation_mask = np.array(True) if start is not None: start_casted = self._maybe_cast_slice_bound(start, "left", kind) mask = start_casted <= self + deprecation_mask = start_casted == self if end is not None: end_casted = self._maybe_cast_slice_bound(end, "right", kind) mask = (self <= 
end_casted) & mask - + deprecation_mask = (end_casted == self) | deprecation_mask + + if not deprecation_mask.any(): + warnings.warn( + "Value based partial slicing on non-monotonic DatetimeIndexes " + "with non-existing keys is deprecated and will raise a " + "KeyError in a future Version.", + FutureWarning, + stacklevel=5, + ) indexer = mask.nonzero()[0][::step] if len(indexer) == len(self): return slice(None) @@ -742,9 +819,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- - def is_type_compatible(self, typ) -> bool: - return typ == self.inferred_type or typ == "datetime" - @property def inferred_type(self) -> str: # b/c datetime is represented as microseconds since the epoch, make @@ -842,10 +916,6 @@ def indexer_between_time( return mask.nonzero()[0] -DatetimeIndex._add_numeric_methods_disabled() -DatetimeIndex._add_logical_methods_disabled() - - def date_range( start=None, end=None, diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index c9367b7e2ee1d..92bd82f8263e9 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -1,10 +1,12 @@ """ Shared methods for Index subclasses backed by ExtensionArray. """ -from typing import List +from typing import List, Optional, TypeVar import numpy as np +from pandas._libs import lib +from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc @@ -13,10 +15,13 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.arrays import ExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.base import Index from pandas.core.ops import get_op_result_name +_T = TypeVar("_T", bound="NDArrayBackedExtensionIndex") + def inherit_from_data(name: str, delegate, cache: bool = False, wrap: bool = False): """ @@ -208,6 +213,24 @@ class ExtensionIndex(Index): __le__ = _make_wrapped_comparison_op("__le__") __ge__ = _make_wrapped_comparison_op("__ge__") + @doc(Index._shallow_copy) + def _shallow_copy( + self, values: Optional[ExtensionArray] = None, name: Label = lib.no_default + ): + name = self.name if name is lib.no_default else name + + if values is not None: + return self._simple_new(values, name=name) + + result = self._simple_new(self._data, name=name) + result._cache = self._cache + return result + + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return True + # --------------------------------------------------------------------- # NDarray-Like Methods @@ -217,24 +240,42 @@ def __getitem__(self, key): if result.ndim == 1: return type(self)(result, name=self.name) # Unpack to ndarray for MPL compat - result = result._data + # pandas\core\indexes\extension.py:220: error: "ExtensionArray" has + # no attribute "_data" [attr-defined] + result = result._data # type: ignore[attr-defined] # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) return result + def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: + # overriding IndexOpsMixin improves performance GH#38083 + return self._data.searchsorted(value, side=side, sorter=sorter) + # --------------------------------------------------------------------- + def 
_check_indexing_method(self, method): + """ + Raise if we have a get_indexer `method` that is not supported or valid. + """ + # GH#37871 for now this is only for IntervalIndex and CategoricalIndex + if method is None: + return + + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + raise NotImplementedError( + f"method {method} not yet implemented for {type(self).__name__}" + ) + + raise ValueError("Invalid fill method") + def _get_engine_target(self) -> np.ndarray: - # NB: _values_for_argsort happens to match the desired engine targets - # for all of our existing EA-backed indexes, but in general - # cannot be relied upon to exist. - return self._data._values_for_argsort() + return np.asarray(self._data) def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), dict(axis=axis)) + nv.validate_repeat((), {"axis": axis}) result = self._data.repeat(repeats, axis=axis) - return self._shallow_copy(result) + return type(self)._simple_new(result, name=self.name) def insert(self, loc: int, item): # ExtensionIndex subclasses must override Index.insert @@ -277,3 +318,85 @@ def astype(self, dtype, copy=True): # pass copy=False because any copying will be done in the # _data.astype call above return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) + + @cache_readonly + def _isnan(self) -> np.ndarray: + return self._data.isna() + + @doc(Index.equals) + def equals(self, other) -> bool: + # Dispatch to the ExtensionArray's .equals method. + if self.is_(other): + return True + + if not isinstance(other, type(self)): + return False + + return self._data.equals(other._data) + + +class NDArrayBackedExtensionIndex(ExtensionIndex): + """ + Index subclass for indexes backed by NDArrayBackedExtensionArray. + """ + + _data: NDArrayBackedExtensionArray + + def _get_engine_target(self) -> np.ndarray: + return self._data._ndarray + + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted + + Returns + ------- + new_index : Index + """ + new_vals = np.delete(self._data._ndarray, loc) + arr = self._data._from_backing_data(new_vals) + return type(self)._simple_new(arr, name=self.name) + + def insert(self, loc: int, item): + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + ValueError if the item is not valid for this dtype. 
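A sketch of the `NDArrayBackedExtensionIndex.insert` contract above, using `CategoricalIndex` as the concrete backing (the exact error message for invalid values may differ by version): the scalar is validated into the backing representation, spliced into the ndarray, and rewrapped.

```python
import pandas as pd

ci = pd.CategoricalIndex(["a", "b"], categories=["a", "b"])

# "a" validates to its integer category code, which is spliced into
# the codes ndarray and rewrapped via _from_backing_data.
print(ci.insert(1, "a"))
# CategoricalIndex(['a', 'a', 'b'], categories=['a', 'b'], ...)

# A value that is not an existing category fails _validate_scalar:
# ci.insert(1, "z")  # raises: "z" is not an existing category
```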
+ """ + arr = self._data + code = arr._validate_scalar(item) + + new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:])) + new_arr = arr._from_backing_data(new_vals) + return type(self)._simple_new(new_arr, name=self.name) + + @doc(Index.where) + def where(self, cond, other=None): + res_values = self._data.where(cond, other) + return type(self)._simple_new(res_values, name=self.name) + + def putmask(self, mask, value): + res_values = self._data.copy() + try: + res_values.putmask(mask, value) + except (TypeError, ValueError): + return self.astype(object).putmask(mask, value) + + return type(self)._simple_new(res_values, name=self.name) + + def _wrap_joined_index(self: _T, joined: np.ndarray, other: _T) -> _T: + name = get_op_result_name(self, other) + arr = self._data._from_backing_data(joined) + return type(self)._simple_new(arr, name=name) diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 909643d50e9d7..8c4437f2cdeb9 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -103,5 +103,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return f"{type(self).__name__}({str(self)})" - __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled - pop = append = extend = remove = sort = insert = _disabled + __setitem__ = __setslice__ = _disabled # type: ignore[assignment] + __delitem__ = __delslice__ = _disabled # type: ignore[assignment] + pop = append = extend = _disabled # type: ignore[assignment] + remove = sort = insert = _disabled # type: ignore[assignment] diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9548ebbd9c3b2..ee25a9d81a60f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,7 +1,8 @@ """ define the IntervalIndex """ +from functools import wraps from operator import le, lt import textwrap -from typing import Any, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, cast import numpy as np @@ -10,7 +11,7 @@ from pandas._libs import lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset -from pandas._typing import AnyArrayLike, Label +from pandas._typing import AnyArrayLike, DtypeObj, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception @@ -18,7 +19,8 @@ from pandas.core.dtypes.cast import ( find_common_type, infer_dtype_from_scalar, - maybe_downcast_to_dtype, + maybe_box_datetimelike, + maybe_downcast_numeric, ) from pandas.core.dtypes.common import ( ensure_platform_int, @@ -36,7 +38,7 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs @@ -49,6 +51,7 @@ default_pprint, ensure_index, maybe_extract_name, + unpack_nested_dtype, ) from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.extension import ExtensionIndex, inherit_names @@ -56,21 +59,23 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name -_VALID_CLOSED = {"left", "right", "both", "neither"} +if TYPE_CHECKING: + from pandas import CategoricalIndex + _index_doc_kwargs = 
dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - dict( - klass="IntervalIndex", - qualname="IntervalIndex", - target_klass="IntervalIndex or list of Intervals", - name=textwrap.dedent( + { + "klass": "IntervalIndex", + "qualname": "IntervalIndex", + "target_klass": "IntervalIndex or list of Intervals", + "name": textwrap.dedent( """\ name : object, optional Name to be stored in the index. """ ), - ) + } ) @@ -110,55 +115,40 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -class SetopCheck: +def setop_check(method): """ This is called to decorate the set operations of IntervalIndex to perform the type check in advance. """ + op_name = method.__name__ - def __init__(self, op_name): - self.op_name = op_name - - def __call__(self, setop): - def func(intvidx_self, other, sort=False): - intvidx_self._assert_can_do_setop(other) - other = ensure_index(other) - - if not isinstance(other, IntervalIndex): - result = getattr(intvidx_self.astype(object), self.op_name)(other) - if self.op_name in ("difference",): - result = result.astype(intvidx_self.dtype) - return result - elif intvidx_self.closed != other.closed: - raise ValueError( - "can only do set operations between two IntervalIndex " - "objects that are closed on the same side" - ) + @wraps(method) + def wrapped(self, other, sort=False): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, _ = self._convert_can_do_setop(other) - # GH 19016: ensure set op will not return a prohibited dtype - subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] - common_subtype = find_common_type(subtypes) - if is_object_dtype(common_subtype): - raise TypeError( - f"can only do {self.op_name} between two IntervalIndex " - "objects that have compatible dtypes" - ) + if not isinstance(other, IntervalIndex): + result = getattr(self.astype(object), op_name)(other) + if op_name in ("difference",): + result = result.astype(self.dtype) + return result - return setop(intvidx_self, other, sort) + return method(self, other, sort) - return func + return wrapped @Appender( _interval_shared_docs["class"] - % dict( - klass="IntervalIndex", - summary="Immutable index of intervals that are closed on the same side.", - name=_index_doc_kwargs["name"], - versionadded="0.20.0", - extra_attributes="is_overlapping\nvalues\n", - extra_methods="", - examples=textwrap.dedent( + % { + "klass": "IntervalIndex", + "summary": "Immutable index of intervals that are closed on the same side.", + "name": _index_doc_kwargs["name"], + "versionadded": "0.20.0", + "extra_attributes": "is_overlapping\nvalues\n", + "extra_methods": "", + "examples": textwrap.dedent( """\ Examples -------- @@ -178,27 +168,23 @@ def func(intvidx_self, other, sort=False): mentioned constructor methods. 
""" ), - ) + } ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) -@inherit_names( - ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray, -) -@inherit_names( - ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True, -) +@inherit_names(["__array__", "overlaps", "contains"], IntervalArray) +@inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" _comparables = ["name"] - _attributes = ["name"] + _attributes = ["name", "closed"] # we would like our indexing holder to defer to us _defer_to_indexing = True - # Immutable, so we are able to cache computations like isna in '_mask' - _mask = None - _data: IntervalArray + _values: IntervalArray + _can_hold_strings = False + # -------------------------------------------------------------------- # Constructors @@ -242,16 +228,15 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): result._data = array result.name = name result._cache = {} - result._no_setting_name = False result._reset_identity() return result @classmethod @Appender( _interval_shared_docs["from_breaks"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( + % { + "klass": "IntervalIndex", + "examples": textwrap.dedent( """\ Examples -------- @@ -261,7 +246,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): dtype='interval[int64]') """ ), - ) + } ) def from_breaks( cls, breaks, closed: str = "right", name=None, copy: bool = False, dtype=None @@ -275,9 +260,9 @@ def from_breaks( @classmethod @Appender( _interval_shared_docs["from_arrays"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( + % { + "klass": "IntervalIndex", + "examples": textwrap.dedent( """\ Examples -------- @@ -287,7 +272,7 @@ def from_breaks( dtype='interval[int64]') """ ), - ) + } ) def from_arrays( cls, @@ -307,9 +292,9 @@ def from_arrays( @classmethod @Appender( _interval_shared_docs["from_tuples"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( + % { + "klass": "IntervalIndex", + "examples": textwrap.dedent( """\ Examples -------- @@ -319,7 +304,7 @@ def from_arrays( dtype='interval[int64]') """ ), - ) + } ) def from_tuples( cls, data, closed: str = "right", name=None, copy: bool = False, dtype=None @@ -330,26 +315,6 @@ def from_tuples( # -------------------------------------------------------------------- - @Appender(Index._shallow_copy.__doc__) - def _shallow_copy(self, values=None, name: Label = lib.no_default): - name = self.name if name is lib.no_default else name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - - result = self._simple_new(values, name=name) - result._cache = cache - return result - - @cache_readonly - def _isnan(self): - """ - Return a mask indicating if each value is NA. 
- """ - if self._mask is None: - self._mask = isna(self.left) - return self._mask - @cache_readonly def _engine(self): left = self._maybe_convert_i8(self.left) @@ -390,27 +355,20 @@ def values(self) -> IntervalArray: """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result def __reduce__(self): - d = dict(left=self.left, right=self.right) + d = {"left": self.left, "right": self.right} d.update(self._get_attributes_dict()) return _new_IntervalIndex, (type(self), d), None @Appender(Index.astype.__doc__) - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) - if is_interval_dtype(new_values.dtype): - return self._shallow_copy(new_values) - return Index.astype(self, dtype, copy=copy) + return Index(new_values, dtype=new_values.dtype, name=self.name) @property def inferred_type(self) -> str: @@ -434,7 +392,7 @@ def is_monotonic_decreasing(self) -> bool: return self[::-1].is_monotonic_increasing @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: """ Return True if the IntervalIndex contains unique elements, else False. """ @@ -512,49 +470,11 @@ def is_overlapping(self) -> bool: # GH 23309 return self._engine.is_overlapping - def _should_fallback_to_positional(self) -> bool: - # integer lookups in Series.__getitem__ are unambiguously - # positional in this case - return self.dtype.subtype.kind in ["m", "M"] - - def _maybe_cast_slice_bound(self, label, side, kind): - return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - - @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr): - """ - we are passed a list-like indexer. Return the - indexer for matching intervals. - """ - locs = self.get_indexer_for(keyarr) - - # we have missing values - if (locs == -1).any(): - raise KeyError - - return locs - - def _can_reindex(self, indexer: np.ndarray) -> None: - """ - Check if we are allowing reindexing with this particular indexer. - - Parameters - ---------- - indexer : an integer indexer - - Raises - ------ - ValueError if its a duplicate axis - """ - # trying to reindex on an axis with duplicates - if self.is_overlapping and len(indexer): - raise ValueError("cannot reindex from an overlapping axis") - def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An - Interval-like requires conversion if it's endpoints are one of the + Interval-like requires conversion if its endpoints are one of the aforementioned types. Assumes that any list-like data has already been cast to an Index. @@ -576,7 +496,7 @@ def _needs_i8_conversion(self, key) -> bool: def _maybe_convert_i8(self, key): """ - Maybe convert a given key to it's equivalent i8 value(s). Used as a + Maybe convert a given key to its equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. 
@@ -609,11 +529,13 @@ def _maybe_convert_i8(self, key): if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) + if lib.is_period(key): + key_i8 = key.ordinal else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: - # convert NaT from it's i8 value to np.nan so it's not viewed + # convert NaT from its i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. is_overlapping) key_i8 = key_i8.where(~key._isnan) @@ -628,17 +550,6 @@ def _maybe_convert_i8(self, key): return key_i8 - def _check_method(self, method): - if method is None: - return - - if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - raise NotImplementedError( - f"method {method} not yet implemented for IntervalIndex" - ) - - raise ValueError("Invalid fill method") - def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: raise KeyError( @@ -665,6 +576,9 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) + # -------------------------------------------------------------------- + # Indexing Methods + def get_loc( self, key, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: @@ -706,7 +620,7 @@ def get_loc( >>> index.get_loc(pd.Interval(0, 1)) 0 """ - self._check_method(method) + self._check_indexing_method(method) if not is_scalar(key): raise InvalidIndexError(key) @@ -749,15 +663,15 @@ def get_loc( ) ) @Appender(_index_shared_docs["get_indexer"]) - def get_indexer( + def _get_indexer( self, - target: AnyArrayLike, + target: Index, method: Optional[str] = None, limit: Optional[int] = None, tolerance: Optional[Any] = None, ) -> np.ndarray: - self._check_method(method) + self._check_indexing_method(method) if self.is_overlapping: raise InvalidIndexError( @@ -765,47 +679,33 @@ def get_indexer( "use IntervalIndex.get_indexer_non_unique" ) - target_as_index = ensure_index(target) - - if isinstance(target_as_index, IntervalIndex): + if isinstance(target, IntervalIndex): # equal indexes -> 1:1 positional match - if self.equals(target_as_index): + if self.equals(target): return np.arange(len(self), dtype="intp") - # different closed or incompatible subtype -> no matches - common_subtype = find_common_type( - [self.dtype.subtype, target_as_index.dtype.subtype] - ) - if self.closed != target_as_index.closed or is_object_dtype(common_subtype): - return np.repeat(np.intp(-1), len(target_as_index)) + if self._is_non_comparable_own_type(target): + # different closed or incompatible subtype -> no matches + return np.repeat(np.intp(-1), len(target)) - # non-overlapping -> at most one match per interval in target_as_index + # non-overlapping -> at most one match per interval in target # want exact matches -> need both left/right to match, so defer to # left/right get_indexer, compare elementwise, equality -> match - left_indexer = self.left.get_indexer(target_as_index.left) - right_indexer = self.right.get_indexer(target_as_index.right) + left_indexer = self.left.get_indexer(target.left) + right_indexer = self.right.get_indexer(target.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) - elif is_categorical_dtype(target_as_index.dtype): + elif is_categorical_dtype(target.dtype): + target = cast("CategoricalIndex", target) # get an indexer for unique categories then propagate to codes via take_1d - categories_indexer = 
self.get_indexer(target_as_index.categories) - indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) - elif not is_object_dtype(target_as_index): + categories_indexer = self.get_indexer(target.categories) + indexer = take_1d(categories_indexer, target.codes, fill_value=-1) + elif not is_object_dtype(target): # homogeneous scalar index: use IntervalTree - target_as_index = self._maybe_convert_i8(target_as_index) - indexer = self._engine.get_indexer(target_as_index.values) + target = self._maybe_convert_i8(target) + indexer = self._engine.get_indexer(target.values) else: # heterogeneous scalar index: defer elementwise to get_loc - # (non-overlapping so get_loc guarantees scalar of KeyError) - indexer = [] - for key in target_as_index: - try: - loc = self.get_loc(key) - except KeyError: - loc = -1 - except InvalidIndexError as err: - # i.e. non-scalar key - raise TypeError(key) from err - indexer.append(loc) + return self._get_indexer_pointwise(target)[0] return ensure_platform_int(indexer) @@ -817,10 +717,8 @@ def get_indexer_non_unique( # check that target_as_index IntervalIndex is compatible if isinstance(target_as_index, IntervalIndex): - common_subtype = find_common_type( - [self.dtype.subtype, target_as_index.dtype.subtype] - ) - if self.closed != target_as_index.closed or is_object_dtype(common_subtype): + + if self._is_non_comparable_own_type(target_as_index): # different closed or incompatible subtype -> no matches return ( np.repeat(-1, len(target_as_index)), @@ -831,18 +729,8 @@ def get_indexer_non_unique( target_as_index, IntervalIndex ): # target_as_index might contain intervals: defer elementwise to get_loc - indexer, missing = [], [] - for i, key in enumerate(target_as_index): - try: - locs = self.get_loc(key) - if isinstance(locs, slice): - locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) - except KeyError: - missing.append(i) - locs = np.array([-1]) - indexer.append(locs) - indexer = np.concatenate(indexer) + return self._get_indexer_pointwise(target_as_index) + else: target_as_index = self._maybe_convert_i8(target_as_index) indexer, missing = self._engine.get_indexer_non_unique( @@ -851,21 +739,33 @@ def get_indexer_non_unique( return ensure_platform_int(indexer), ensure_platform_int(missing) - def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: + def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]: """ - Guaranteed return of an indexer even when overlapping. - - This dispatches to get_indexer or get_indexer_non_unique - as appropriate. - - Returns - ------- - numpy.ndarray - List of indices. + pointwise implementation for get_indexer and get_indexer_non_unique. """ - if self.is_overlapping: - return self.get_indexer_non_unique(target)[0] - return self.get_indexer(target, **kwargs) + indexer, missing = [], [] + for i, key in enumerate(target): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + # Only needed for get_indexer_non_unique + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + except InvalidIndexError as err: + # i.e. 
non-scalar key + raise TypeError(key) from err + + indexer.append(locs) + + indexer = np.concatenate(indexer) + return ensure_platform_int(indexer), ensure_platform_int(missing) + + @property + def _index_as_unique(self): + return not self.is_overlapping def _convert_slice_indexer(self, key: slice, kind: str): if not (key.step is None or key.step == 1): @@ -881,13 +781,91 @@ def _convert_slice_indexer(self, key: slice, kind: str): return super()._convert_slice_indexer(key, kind) + def _should_fallback_to_positional(self) -> bool: + # integer lookups in Series.__getitem__ are unambiguously + # positional in this case + return self.dtype.subtype.kind in ["m", "M"] + + def _maybe_cast_slice_bound(self, label, side: str, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(Index._convert_list_indexer.__doc__) + def _convert_list_indexer(self, keyarr): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. + """ + locs = self.get_indexer_for(keyarr) + + # we have missing values + if (locs == -1).any(): + raise KeyError(keyarr[locs == -1].tolist()) + + return locs + + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + if not isinstance(dtype, IntervalDtype): + return False + common_subtype = find_common_type([self.dtype.subtype, dtype.subtype]) + return not is_object_dtype(common_subtype) + + def _should_compare(self, other) -> bool: + if not super()._should_compare(other): + return False + other = unpack_nested_dtype(other) + return other.closed == self.closed + + # TODO: use should_compare and get rid of _is_non_comparable_own_type + def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool: + # different closed or incompatible subtype -> no matches + + # TODO: once closed is part of IntervalDtype, we can just define + # is_comparable_dtype GH#19371 + if self.closed != other.closed: + return True + return not self._is_comparable_dtype(other.dtype) + + # -------------------------------------------------------------------- + + @cache_readonly + def left(self) -> Index: + return Index(self._data.left, copy=False) + + @cache_readonly + def right(self) -> Index: + return Index(self._data.right, copy=False) + + @cache_readonly + def mid(self): + return Index(self._data.mid, copy=False) + + @property + def length(self): + return Index(self._data.length, copy=False) + + def putmask(self, mask, value): + arr = self._data.copy() + try: + value_left, value_right = arr._validate_setitem_value(value) + except (ValueError, TypeError): + return self.astype(object).putmask(mask, value) + + if isinstance(self._data._left, np.ndarray): + np.putmask(arr._left, mask, value_left) + np.putmask(arr._right, mask, value_right) + else: + # TODO: special case not needed with __array_function__ + arr._left.putmask(mask, value_left) + arr._right.putmask(mask, value_right) + return type(self)._simple_new(arr, name=self.name) + @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: other = self._na_value values = np.where(cond, self._values, other) result = IntervalArray(values) - return self._shallow_copy(result) + return type(self)._simple_new(result, name=self.name) def delete(self, loc): """ @@ -900,7 +878,7 @@ def delete(self, loc): new_left = self.left.delete(loc) new_right = self.right.delete(loc) result = self._data._shallow_copy(new_left, new_right) - return self._shallow_copy(result) + return type(self)._simple_new(result, name=self.name) def insert(self, loc, item): """ @@ -917,38 +895,18 @@ 
def insert(self, loc, item): ------- IntervalIndex """ - if isinstance(item, Interval): - if item.closed != self.closed: - raise ValueError( - "inserted item must be closed on the same side as the index" - ) - left_insert = item.left - right_insert = item.right - elif is_scalar(item) and isna(item): - # GH 18295 - left_insert = right_insert = item - else: - raise ValueError( - "can only insert Interval objects and NA into an IntervalIndex" - ) + left_insert, right_insert = self._data._validate_scalar(item) new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) result = self._data._shallow_copy(new_left, new_right) - return self._shallow_copy(result) - - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - result = self._data.take( - indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs - ) - return self._shallow_copy(result) + return type(self)._simple_new(result, name=self.name) # -------------------------------------------------------------------- # Rendering Methods # __repr__ associated methods are based on MultiIndex - def _format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: return header + list(self._format_native_types(na_rep=na_rep)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): @@ -1001,35 +959,41 @@ def _format_space(self) -> str: return f"\n{space}" # -------------------------------------------------------------------- + # Set Operations - def argsort(self, *args, **kwargs) -> np.ndarray: - return np.lexsort((self.right, self.left)) + def _assert_can_do_setop(self, other): + super()._assert_can_do_setop(other) - def equals(self, other) -> bool: - """ - Determines if two IntervalIndex objects contain the same elements. - """ - if self.is_(other): - return True + if isinstance(other, IntervalIndex) and self._is_non_comparable_own_type(other): + # GH#19016: ensure set op will not return a prohibited dtype + raise TypeError( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side " + "and have compatible dtypes" + ) + + @Appender(Index.intersection.__doc__) + def intersection(self, other, sort=False) -> Index: + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + other, _ = self._convert_can_do_setop(other) + + if self.equals(other): + if self.has_duplicates: + return self.unique()._get_reconciled_name_object(other) + return self._get_reconciled_name_object(other) - # if we can coerce to an II - # then we can compare if not isinstance(other, IntervalIndex): - if not is_interval_dtype(other): - return False - other = Index(other) + return self.astype(object).intersection(other) - return ( - self.left.equals(other.left) - and self.right.equals(other.right) - and self.closed == other.closed - ) + result = self._intersection(other, sort=sort) + return self._wrap_setop_result(other, result) - @Appender(Index.intersection.__doc__) - @SetopCheck(op_name="intersection") - def intersection( - self, other: "IntervalIndex", sort: bool = False - ) -> "IntervalIndex": + def _intersection(self, other, sort): + """ + intersection specialized to the case with matching dtypes. 
+ """ + # For IntervalIndex we also know other.closed == self.closed if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: @@ -1048,7 +1012,7 @@ def intersection( def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does not have any common endpoint, - no mater left or right. + no matter left or right. Return the intersection with another IntervalIndex. Parameters @@ -1095,8 +1059,11 @@ def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": return self[mask] def _setop(op_name: str, sort=None): - @SetopCheck(op_name=op_name) def func(self, other, sort=sort): + # At this point we are assured + # isinstance(other, IntervalIndex) + # other.closed == self.closed + result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) @@ -1108,38 +1075,25 @@ def func(self, other, sort=sort): return type(self).from_tuples(result, closed=self.closed, name=result_name) - return func + func.__name__ = op_name + return setop_check(func) + + _union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") + + # -------------------------------------------------------------------- @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: """ This is False even when left/right contain datetime-like objects, as the check is done on the Interval itself """ return False - union = _setop("union") - difference = _setop("difference") - symmetric_difference = _setop("symmetric_difference") - # TODO: arithmetic operations - # GH#30817 until IntervalArray implements inequalities, get them from Index - def __lt__(self, other): - return Index.__lt__(self, other) - - def __le__(self, other): - return Index.__le__(self, other) - - def __gt__(self, other): - return Index.__gt__(self, other) - - def __ge__(self, other): - return Index.__ge__(self, other) - - -IntervalIndex._add_logical_methods_disabled() - def _is_valid_endpoint(endpoint) -> bool: """ @@ -1259,8 +1213,8 @@ def interval_range( IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], closed='both', dtype='interval[int64]') """ - start = com.maybe_box_datetimelike(start) - end = com.maybe_box_datetimelike(end) + start = maybe_box_datetimelike(start) + end = maybe_box_datetimelike(end) endpoint = start if start is not None else end if freq is None and com.any_none(periods, start, end): @@ -1320,14 +1274,12 @@ def interval_range( breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com.not_none(start, end, freq)): # np.linspace always produces float output - breaks = maybe_downcast_to_dtype(breaks, "int64") + breaks = maybe_downcast_numeric(breaks, np.dtype("int64")) else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): - range_func = date_range + breaks = date_range(start=start, end=end, periods=periods, freq=freq) else: - range_func = timedelta_range - - breaks = range_func(start=start, end=end, periods=periods, freq=freq) + breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 15db6c51a1f2f..e4e29f32e62e6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,7 +1,9 @@ +from functools import wraps from sys 
import getsizeof from typing import ( TYPE_CHECKING, Any, + Callable, Hashable, Iterable, List, @@ -18,7 +20,7 @@ from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike, Scalar +from pandas._typing import AnyArrayLike, DtypeObj, Label, Scalar, Shape from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -45,10 +47,15 @@ from pandas.core.arrays.categorical import factorize_from_iterables import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index +from pandas.core.indexes.base import ( + Index, + _index_shared_docs, + ensure_index, + get_unanimous_names, +) from pandas.core.indexes.frozen import FrozenList from pandas.core.indexes.numeric import Int64Index -import pandas.core.missing as missing +from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( get_group_index, indexer_from_factorized, @@ -62,11 +69,11 @@ ) if TYPE_CHECKING: - from pandas import Series # noqa:F401 + from pandas import Series _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") + {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"} ) @@ -150,6 +157,25 @@ def _codes_to_ints(self, codes): return np.bitwise_or.reduce(codes, axis=1) +def names_compat(meth): + """ + A decorator to allow either `name` or `names` keyword but not both. + + This makes it easier to share code with base class. + """ + + @wraps(meth) + def new_meth(self_or_cls, *args, **kwargs): + if "name" in kwargs and "names" in kwargs: + raise TypeError("Can only provide one of `names` and `name`") + elif "name" in kwargs: + kwargs["names"] = kwargs.pop("name") + + return meth(self_or_cls, *args, **kwargs) + + return new_meth + + class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects. @@ -231,7 +257,7 @@ class MultiIndex(Index): of the mentioned helper methods. 
""" - _deprecations = Index._deprecations | frozenset() + _hidden_attrs = Index._hidden_attrs | frozenset() # initialize to zero-length tuples to make everything work _typ = "multiindex" @@ -241,7 +267,6 @@ class MultiIndex(Index): _comparables = ["names"] rename = Index.set_names - _tuples = None sortorder: Optional[int] # -------------------------------------------------------------------- @@ -257,7 +282,6 @@ def __new__( copy=False, name=None, verify_integrity: bool = True, - _set_identity: bool = True, ): # compat with Index @@ -291,8 +315,7 @@ def __new__( new_codes = result._verify_integrity() result._codes = new_codes - if _set_identity: - result._reset_identity() + result._reset_identity() return result @@ -439,7 +462,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> "MultiInde if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] - return MultiIndex( + return cls( levels=levels, codes=codes, sortorder=sortorder, @@ -448,7 +471,13 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> "MultiInde ) @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): + @names_compat + def from_tuples( + cls, + tuples, + sortorder: Optional[int] = None, + names: Optional[Sequence[Label]] = None, + ): """ Convert list of tuples to MultiIndex. @@ -489,6 +518,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): elif is_iterator(tuples): tuples = list(tuples) + arrays: List[Sequence[Label]] if len(tuples) == 0: if names is None: raise TypeError("Cannot infer number of levels from empty list") @@ -503,7 +533,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): else: arrays = zip(*tuples) - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + return cls.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod def from_product(cls, iterables, sortorder=None, names=lib.no_default): @@ -562,7 +592,7 @@ def from_product(cls, iterables, sortorder=None, names=lib.no_default): # codes are all ndarrays, so cartesian_product is lossless codes = cartesian_product(codes) - return MultiIndex(levels, codes, sortorder=sortorder, names=names) + return cls(levels, codes, sortorder=sortorder, names=names) @classmethod def from_frame(cls, df, sortorder=None, names=None): @@ -632,16 +662,9 @@ def from_frame(cls, df, sortorder=None, names=None): # -------------------------------------------------------------------- - @property + @cache_readonly def _values(self): # We override here, since our parent uses _data, which we don't use. - return self.values - - @property - def values(self): - if self._tuples is not None: - return self._tuples - values = [] for i in range(self.nlevels): @@ -655,8 +678,12 @@ def values(self): vals = np.array(vals, copy=False) values.append(vals) - self._tuples = lib.fast_zip(values) - return self._tuples + arr = lib.fast_zip(values) + return arr + + @property + def values(self): + return self._values @property def array(self): @@ -674,7 +701,7 @@ def array(self): ) @property - def shape(self): + def shape(self) -> Shape: """ Return a tuple of the shape of the underlying data. 
""" @@ -702,8 +729,13 @@ def levels(self): return FrozenList(result) def _set_levels( - self, levels, level=None, copy=False, validate=True, verify_integrity=False - ): + self, + levels, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: # This is NOT part of the levels property because it should be # externally not allowed to set levels. User beware if you change # _levels directly @@ -721,10 +753,10 @@ def _set_levels( ) else: level_numbers = [self._get_level_number(lev) for lev in level] - new_levels = list(self._levels) + new_levels_list = list(self._levels) for lev_num, lev in zip(level_numbers, levels): - new_levels[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() - new_levels = FrozenList(new_levels) + new_levels_list[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() + new_levels = FrozenList(new_levels_list) if verify_integrity: new_codes = self._verify_integrity(levels=new_levels) @@ -735,10 +767,9 @@ def _set_levels( if any(names): self._set_names(names) - self._tuples = None self._reset_cache() - def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): + def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): """ Set new levels on MultiIndex. Defaults to returning new index. @@ -750,12 +781,15 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Level(s) to set (None for all levels). inplace : bool If True, mutates in place. + + .. deprecated:: 1.2.0 verify_integrity : bool, default True If True, checks that levels and codes are compatible. Returns ------- - new index (of same type and class...etc) + new index (of same type and class...etc) or None + The same type as the caller or None if ``inplace=True``. Examples -------- @@ -820,6 +854,15 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) @@ -849,6 +892,15 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.nlevels + 3 """ return len(self._levels) @@ -856,6 +908,15 @@ def nlevels(self) -> int: def levshape(self): """ A tuple with the length of each level. 
+ + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) + >>> mi + MultiIndex([('a', 'b', 'c')], + ) + >>> mi.levshape + (1, 1, 1) """ return tuple(len(x) for x in self.levels) @@ -867,8 +928,13 @@ def codes(self): return self._codes def _set_codes( - self, codes, level=None, copy=False, validate=True, verify_integrity=False - ): + self, + codes, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: if validate: if level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") @@ -882,21 +948,22 @@ def _set_codes( ) else: level_numbers = [self._get_level_number(lev) for lev in level] - new_codes = list(self._codes) + new_codes_list = list(self._codes) for lev_num, level_codes in zip(level_numbers, codes): lev = self.levels[lev_num] - new_codes[lev_num] = _coerce_indexer_frozen(level_codes, lev, copy=copy) - new_codes = FrozenList(new_codes) + new_codes_list[lev_num] = _coerce_indexer_frozen( + level_codes, lev, copy=copy + ) + new_codes = FrozenList(new_codes_list) if verify_integrity: new_codes = self._verify_integrity(codes=new_codes) self._codes = new_codes - self._tuples = None self._reset_cache() - def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): + def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): """ Set new codes on MultiIndex. Defaults to returning new index. @@ -912,12 +979,15 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): Level(s) to set (None for all levels). inplace : bool If True, mutates in place. + + .. deprecated:: 1.2.0 verify_integrity : bool (default True) If True, checks that levels and codes are compatible. Returns ------- - new index (of same type and class...etc) + new index (of same type and class...etc) or None + The same type as the caller or None if ``inplace=True``. Examples -------- @@ -956,6 +1026,15 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): (1, 'two')], names=['foo', 'bar']) """ + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + if level is not None and not is_list_like(level): if not is_list_like(codes): raise TypeError("Codes must be list-like") @@ -983,7 +1062,7 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): def _engine(self): # Calculate the number of bits needed to represent labels in each # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) + sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) # Sum bit counts, starting from the _right_.... 
lev_bits = np.cumsum(sizes[::-1])[::-1] @@ -1003,57 +1082,26 @@ def _engine(self): @property def _constructor(self): - return MultiIndex.from_tuples + return type(self).from_tuples @doc(Index._shallow_copy) - def _shallow_copy( - self, - values=None, - name=lib.no_default, - levels=None, - codes=None, - dtype=None, - sortorder=None, - names=lib.no_default, - _set_identity: bool = True, - ): - if names is not lib.no_default and name is not lib.no_default: - raise TypeError("Can only provide one of `names` and `name`") - elif names is lib.no_default: - names = name if name is not lib.no_default else self.names + def _shallow_copy(self, values=None, name=lib.no_default): + names = name if name is not lib.no_default else self.names if values is not None: - assert levels is None and codes is None and dtype is None - return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) + return type(self).from_tuples(values, sortorder=None, names=names) - levels = levels if levels is not None else self.levels - codes = codes if codes is not None else self.codes - - result = MultiIndex( - levels=levels, - codes=codes, - dtype=dtype, - sortorder=sortorder, + result = type(self)( + levels=self.levels, + codes=self.codes, + sortorder=None, names=names, verify_integrity=False, - _set_identity=_set_identity, ) result._cache = self._cache.copy() result._cache.pop("levels", None) # GH32669 return result - def symmetric_difference(self, other, result_name=None, sort=None): - # On equal symmetric_difference MultiIndexes the difference is empty. - # Therefore, an empty MultiIndex is returned GH13490 - tups = Index.symmetric_difference(self, other, result_name, sort) - if len(tups) == 0: - return MultiIndex( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - names=tups.name, - ) - return type(self).from_tuples(tups, names=tups.name) - # -------------------------------------------------------------------- def copy( @@ -1064,7 +1112,6 @@ def copy( codes=None, deep=False, name=None, - _set_identity=False, ): """ Make a copy of this object. Names, dtype, levels and codes can be @@ -1074,8 +1121,14 @@ def copy( ---------- names : sequence, optional dtype : numpy dtype or pandas type, optional + + .. deprecated:: 1.2.0 levels : sequence, optional + + .. deprecated:: 1.2.0 codes : sequence, optional + + .. deprecated:: 1.2.0 deep : bool, default False name : Label Kept for compatibility with 1-dimensional Index. Should not be used. @@ -1091,6 +1144,21 @@ def copy( This could be potentially expensive on large MultiIndex objects. """ names = self._validate_names(name=name, names=names, deep=deep) + if levels is not None: + warnings.warn( + "parameter levels is deprecated and will be removed in a future " + "version. Use the set_levels method instead.", + FutureWarning, + stacklevel=2, + ) + if codes is not None: + warnings.warn( + "parameter codes is deprecated and will be removed in a future " + "version. 
Use the set_codes method instead.", + FutureWarning, + stacklevel=2, + ) + if deep: from copy import deepcopy @@ -1099,14 +1167,28 @@ def copy( if codes is None: codes = deepcopy(self.codes) - return self._shallow_copy( + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes + + new_index = type(self)( levels=levels, codes=codes, - names=names, - dtype=dtype, sortorder=self.sortorder, - _set_identity=_set_identity, + names=names, + verify_integrity=False, ) + new_index._cache = self._cache.copy() + new_index._cache.pop("levels", None) # GH32669 + + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) + new_index = new_index.astype(dtype) + return new_index def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ @@ -1134,10 +1216,10 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ - def f(l): - return "mixed" in l or "string" in l or "unicode" in l + def f(level): + return "mixed" in level or "string" in level or "unicode" in level - return any(f(l) for l in self._inferred_type_levels) + return any(f(level) for level in self._inferred_type_levels) @doc(Index.memory_usage) def memory_usage(self, deep: bool = False) -> int: @@ -1231,13 +1313,17 @@ def _format_native_types(self, na_rep="nan", **kwargs): def format( self, - space=2, + name: Optional[bool] = None, + formatter: Optional[Callable] = None, + na_rep: Optional[str] = None, + names: bool = False, + space: int = 2, sparsify=None, - adjoin=True, - names=False, - na_rep=None, - formatter=None, - ): + adjoin: bool = True, + ) -> List: + if name is not None: + names = name + if len(self) == 0: return [] @@ -1265,13 +1351,13 @@ def format( stringified_levels.append(formatted) result_levels = [] - for lev, name in zip(stringified_levels, self.names): + for lev, lev_name in zip(stringified_levels, self.names): level = [] if names: level.append( - pprint_thing(name, escape_chars=("\t", "\r", "\n")) - if name is not None + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None else "" ) @@ -1283,20 +1369,19 @@ def format( if sparsify: sentinel = "" - # GH3547 - # use value of sparsify as sentinel, unless it's an obvious - # "Truthy" value - if sparsify not in [True, 1]: + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify in [False, lib.no_default]: sentinel = sparsify # little bit of a kludge job for #1217 - result_levels = _sparsify( + result_levels = sparsify_labels( result_levels, start=int(names), sentinel=sentinel ) if adjoin: - from pandas.io.formats.format import _get_adjustment + from pandas.io.formats.format import get_adjustment - adj = _get_adjustment() + adj = get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: return result_levels @@ -1360,13 +1445,30 @@ def _set_names(self, names, level=None, validate=True): raise TypeError( f"{type(self).__name__}.name must be a hashable type" ) - self._names[lev] = name + # pandas\core\indexes\multi.py:1448: error: Cannot determine type + # of '__setitem__' [has-type] + self._names[lev] = name # type: ignore[has-type] # If .levels has been accessed, the names in our cache will be stale. 
self._reset_cache() names = property( - fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" + fset=_set_names, + fget=_get_names, + doc=""" + Names of levels in MultiIndex. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays( + ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) + >>> mi + MultiIndex([(1, 3, 5), + (2, 4, 6)], + names=['x', 'y', 'z']) + >>> mi.names + FrozenList(['x', 'y', 'z']) + """, ) # -------------------------------------------------------------------- @@ -1450,7 +1552,10 @@ def is_monotonic_increasing(self) -> bool: return if the index is monotonic increasing (only equal or increasing) values. """ - if all(x.is_monotonic for x in self.levels): + if any(-1 in code for code in self.codes): + return False + + if all(level.is_monotonic for level in self.levels): # If each level is sorted, we can operate on the codes directly. GH27495 return libalgos.is_lexsorted( [x.astype("int64", copy=False) for x in self.codes] @@ -1506,7 +1611,7 @@ def dropna(self, how="any"): raise ValueError(f"invalid how option: {how}") new_codes = [level_codes[~indexer] for level_codes in self.codes] - return self.copy(codes=new_codes, deep=True) + return self.set_codes(codes=new_codes) def _get_level_values(self, level, unique=False): """ @@ -1578,10 +1683,6 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def _to_safe_for_reshape(self): - """ convert to object if we are a categorical """ - return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) - def to_frame(self, index=True, name=None): """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -1607,6 +1708,32 @@ def to_frame(self, index=True, name=None): -------- DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. 
+ + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) + >>> mi + MultiIndex([('a', 'c'), + ('b', 'd')], + ) + + >>> df = mi.to_frame() + >>> df + 0 1 + a c a c + b d b d + + >>> df = mi.to_frame(index=False) + >>> df + 0 1 + 0 a c + 1 b d + + >>> df = mi.to_frame(name=['x', 'y']) + >>> df + x y + a c a c + b d b d """ from pandas import DataFrame @@ -1664,7 +1791,7 @@ def to_flat_index(self): return Index(self._values, tupleize_cols=False) @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: return False def is_lexsorted(self) -> bool: @@ -1879,12 +2006,12 @@ def remove_unused_levels(self): def __reduce__(self): """Necessary for making this object picklable""" - d = dict( - levels=list(self.levels), - codes=list(self.codes), - sortorder=self.sortorder, - names=list(self.names), - ) + d = { + "levels": list(self.levels), + "codes": list(self.codes), + "sortorder": self.sortorder, + "names": list(self.names), + } return ibase._new_Index, (type(self), d), None # -------------------------------------------------------------------- @@ -1924,31 +2051,15 @@ def __getitem__(self, key): @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) + nv.validate_take((), kwargs) indices = ensure_platform_int(indices) - taken = self._assert_take_fillable( - self.codes, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1, - ) - return MultiIndex( - levels=self.levels, codes=taken, names=self.names, verify_integrity=False - ) - def _assert_take_fillable( - self, values, indices, allow_fill=True, fill_value=None, na_value=None - ): - """ Internal method to handle NA filling of take """ # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" - ) - raise ValueError(msg) + allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) + + na_value = -1 + + if allow_fill: taken = [lab.take(indices) for lab in self.codes] mask = indices == -1 if mask.any(): @@ -1960,7 +2071,10 @@ def _assert_take_fillable( taken = masked else: taken = [lab.take(indices) for lab in self.codes] - return taken + + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) def append(self, other): """ @@ -2001,7 +2115,7 @@ def argsort(self, *args, **kwargs) -> np.ndarray: @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), dict(axis=axis)) + nv.validate_repeat((), {"axis": axis}) repeats = ensure_platform_int(repeats) return MultiIndex( levels=self.levels, @@ -2024,7 +2138,7 @@ def drop(self, codes, level=None, errors="raise"): Parameters ---------- codes : array-like - Must be a list of tuples + Must be a list of tuples when level is not specified level : int or level name, default None errors : str, default 'raise' @@ -2050,7 +2164,8 @@ def drop(self, codes, level=None, errors="raise"): if isinstance(loc, int): inds.append(loc) elif isinstance(loc, slice): - inds.extend(range(loc.start, loc.stop)) + step = loc.step if loc.step is not None else 1 + inds.extend(range(loc.start, loc.stop, step)) elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( @@ -2075,10 +2190,17 @@ def _drop_from_level(self, codes, level, errors="raise"): i = 
self._get_level_number(level) index = self.levels[i] values = index.get_indexer(codes) - + # If nan should be dropped it will equal -1 here. We have to check which values + # are not nan and equal -1, this means they are missing in the index + nan_codes = isna(codes) + values[(np.equal(nan_codes, False)) & (values == -1)] = -2 + if index.shape[0] == self.shape[0]: + values[np.equal(nan_codes, True)] = -2 + + not_found = codes[values == -2] + if len(not_found) != 0 and errors != "ignore": + raise KeyError(f"labels {not_found} not found in level") mask = ~algos.isin(self.codes[i], values) - if mask.all() and errors != "ignore": - raise KeyError(f"labels {codes} not found in level") return self[mask] @@ -2153,6 +2275,24 @@ def reorder_levels(self, order): Returns ------- MultiIndex + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) + >>> mi + MultiIndex([(1, 3), + (2, 4)], + names=['x', 'y']) + + >>> mi.reorder_levels(order=[1, 0]) + MultiIndex([(3, 1), + (4, 2)], + names=['y', 'x']) + + >>> mi.reorder_levels(order=['y', 'x']) + MultiIndex([(3, 1), + (4, 2)], + names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: @@ -2170,7 +2310,7 @@ def reorder_levels(self, order): def _get_codes_for_sorting(self): """ - we categorizing our codes by using the + we are categorizing our codes by using the available categories (all, not just observed) excluding any missing ones (-1); this is in preparation for sorting, where we need to disambiguate that -1 is not @@ -2211,6 +2351,34 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): Resulting index. indexer : np.ndarray Indices of output values in original index. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) + >>> mi + MultiIndex([(0, 2), + (0, 1)], + ) + + >>> mi.sortlevel() + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(sort_remaining=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) + + >>> mi.sortlevel(1) + (MultiIndex([(0, 1), + (0, 2)], + ), array([1, 0])) + + >>> mi.sortlevel(1, ascending=False) + (MultiIndex([(0, 2), + (0, 1)], + ), array([0, 1])) """ if isinstance(level, (str, int)): level = [level] @@ -2358,6 +2526,10 @@ def _get_values_for_loc(self, series: "Series", loc, key): if is_scalar(loc): return new_values + if len(new_values) == 1 and not self.nlevels > 1: + # If more than one level left, we can not return a scalar + return new_values[0] + new_index = self[loc] new_index = maybe_droplevels(new_index, key) new_ser = series._constructor(new_values, index=new_index, name=series.name) @@ -2405,7 +2577,7 @@ def _get_partial_string_timestamp_match_key(self, key): if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing: # Convert key '2016-01-01' to # ('2016-01-01'[, slice(None, None, None)]+) - key = tuple([key] + [slice(None)] * (len(self.levels) - 1)) + key = (key,) + (slice(None),) * (len(self.levels) - 1) if isinstance(key, tuple): # Convert (..., '2016-01-01', ...) 
in tuple to @@ -2424,9 +2596,7 @@ def _get_partial_string_timestamp_match_key(self, key): return key @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - method = missing.clean_reindex_fill_method(method) - target = ensure_index(target) + def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None): # empty indexer if is_list_like(target) and not len(target): @@ -2464,10 +2634,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return ensure_platform_int(indexer) - @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - return super().get_indexer_non_unique(target) - def get_slice_bound( self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str ) -> int: @@ -2599,9 +2765,17 @@ def _partial_tup_index(self, tup, side="left"): return start + section.searchsorted(loc, side=side) idx = self._get_loc_single_level_index(lev, lab) - if k < n - 1: + if isinstance(idx, slice) and k < n - 1: + # Get start and end value from slice, necessary when a non-integer + # interval is given as input GH#37707 + start = idx.start + end = idx.stop + elif k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") + elif isinstance(idx, slice): + idx = idx.start + return start + section.searchsorted(idx, side=side) else: return start + section.searchsorted(idx, side=side) @@ -2677,9 +2851,11 @@ def get_loc(self, key, method=None): "currently supported for MultiIndex" ) + hash(key) + def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" - if not isinstance(loc, np.ndarray) or loc.dtype != "int64": + if not isinstance(loc, np.ndarray) or loc.dtype != np.intp: return loc loc = lib.maybe_indices_to_slice(loc, len(self)) @@ -2691,8 +2867,7 @@ def _maybe_to_slice(loc): mask[loc] = True return mask - if not isinstance(key, (tuple, list)): - # not including list here breaks some indexing, xref #30892 + if not isinstance(key, tuple): loc = self._get_level_indexer(key, level=0) return _maybe_to_slice(loc) @@ -2727,7 +2902,7 @@ def _maybe_to_slice(loc): stacklevel=10, ) - loc = np.arange(start, stop, dtype="int64") + loc = np.arange(start, stop, dtype=np.intp) for i, k in enumerate(follow_key, len(lead_key)): mask = self.codes[i][loc] == self._get_loc_single_level_index( @@ -2778,16 +2953,29 @@ def get_loc_level(self, key, level=0, drop_level: bool = True): >>> mi.get_loc_level(['b', 'e']) (1, None) """ + if not isinstance(level, (list, tuple)): + level = self._get_level_number(level) + else: + level = [self._get_level_number(lev) for lev in level] + return self._get_loc_level(key, level=level, drop_level=drop_level) + + def _get_loc_level( + self, key, level: Union[int, List[int]] = 0, drop_level: bool = True + ): + """ + get_loc_level but with `level` known to be positional, not name-based. 
+ """ + # different name to distinguish from maybe_droplevels def maybe_mi_droplevels(indexer, levels, drop_level: bool): if not drop_level: return self[indexer] # kludge around orig_index = new_index = self[indexer] - levels = [self._get_level_number(i) for i in levels] + for i in sorted(levels, reverse=True): try: - new_index = new_index.droplevel(i) + new_index = new_index._drop_level_numbers([i]) except ValueError: # no dropping here @@ -2801,7 +2989,7 @@ def maybe_mi_droplevels(indexer, levels, drop_level: bool): ) result = None for lev, k in zip(level, key): - loc, new_index = self.get_loc_level(k, level=lev) + loc, new_index = self._get_loc_level(k, level=lev) if isinstance(loc, slice): mask = np.zeros(len(self), dtype=bool) mask[loc] = True @@ -2811,8 +2999,6 @@ def maybe_mi_droplevels(indexer, levels, drop_level: bool): return result, maybe_mi_droplevels(result, level, drop_level) - level = self._get_level_number(level) - # kludge for #1796 if isinstance(key, list): key = tuple(key) @@ -2877,7 +3063,8 @@ def partial_selection(key, indexer=None): indexer = self._get_level_indexer(key, level=level) return indexer, maybe_mi_droplevels(indexer, [level], drop_level) - def _get_level_indexer(self, key, level=0, indexer=None): + def _get_level_indexer(self, key, level: int = 0, indexer=None): + # `level` kwarg is _always_ positional, never name # return an indexer, boolean array or a slice showing where the key is # in the totality of values # if the indexer is provided, then use this @@ -2889,8 +3076,11 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set - + if step is not None and step < 0: + # Switch elements for negative step size + start, stop = stop - 1, start - 1 r = np.arange(start, stop, step) + if indexer is not None and len(indexer) != len(codes): # we have an indexer which maps the locations in the labels @@ -2924,6 +3114,8 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): start = 0 if key.stop is not None: stop = level_index.get_loc(key.stop) + elif isinstance(start, slice): + stop = len(level_index) else: stop = len(level_index) - 1 step = key.step @@ -2958,22 +3150,27 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - code = self._get_loc_single_level_index(level_index, key) + idx = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted - locs = np.array(level_codes == code, dtype=bool, copy=False) + locs = np.array(level_codes == idx, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] but unused: raise KeyError(key) return locs - i = level_codes.searchsorted(code, side="left") - j = level_codes.searchsorted(code, side="right") - if i == j: + if isinstance(idx, slice): + start = idx.start + end = idx.stop + else: + start = level_codes.searchsorted(idx, side="left") + end = level_codes.searchsorted(idx, side="right") + + if start == end: # The label is present in self.levels[level] but unused: raise KeyError(key) - return slice(i, j) + return slice(start, end) def get_locs(self, seq): """ @@ -3009,7 +3206,6 @@ def get_locs(self, seq): >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP array([2], dtype=int64) """ - from pandas.core.indexes.numeric import Int64Index # must be lexsorted to at least as many levels 
true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -3039,44 +3235,53 @@ def _convert_to_indexer(r) -> Int64Index: r = r.nonzero()[0] return Int64Index(r) - def _update_indexer(idxr, indexer=indexer): + def _update_indexer( + idxr: Optional[Index], indexer: Optional[Index], key + ) -> Index: if indexer is None: indexer = Index(np.arange(n)) if idxr is None: return indexer - return indexer & idxr + indexer_intersection = indexer.intersection(idxr) + if indexer_intersection.empty and not idxr.empty and not indexer.empty: + raise KeyError(key) + return indexer_intersection for i, k in enumerate(seq): if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) + indexer = _update_indexer( + _convert_to_indexer(k), indexer=indexer, key=seq + ) elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) - indexers = None + indexers: Optional[Int64Index] = None for x in k: try: idxrs = _convert_to_indexer( self._get_level_indexer(x, level=i, indexer=indexer) ) - indexers = idxrs if indexers is None else indexers | idxrs + indexers = (idxrs if indexers is None else indexers).union( + idxrs, sort=False + ) except KeyError: # ignore not founds continue if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer) + indexer = _update_indexer(indexers, indexer=indexer, key=seq) else: # no matches we are done return np.array([], dtype=np.int64) elif com.is_null_slice(k): # empty slice - indexer = _update_indexer(None, indexer=indexer) + indexer = _update_indexer(None, indexer=indexer, key=seq) elif isinstance(k, slice): @@ -3086,6 +3291,7 @@ def _update_indexer(idxr, indexer=indexer): self._get_level_indexer(k, level=i, indexer=indexer) ), indexer=indexer, + key=seq, ) else: # a single label @@ -3094,6 +3300,7 @@ def _update_indexer(idxr, indexer=indexer): self.get_loc_level(k, level=i, drop_level=False)[0] ), indexer=indexer, + key=seq, ) # empty indexer @@ -3105,6 +3312,8 @@ def _update_indexer(idxr, indexer=indexer): return indexer._values + # -------------------------------------------------------------------- + def _reorder_indexer( self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], @@ -3134,16 +3343,21 @@ def _reorder_indexer( k_codes = k_codes[k_codes >= 0] # Filter absent keys # True if the given codes are not ordered need_sort = (k_codes[:-1] > k_codes[1:]).any() + elif isinstance(k, slice) and k.step is not None and k.step < 0: + need_sort = True # Bail out if both index and seq are sorted if not need_sort: return indexer n = len(self) - keys: Tuple[np.ndarray, ...] = tuple() + keys: Tuple[np.ndarray, ...] 
= () # For each level of the sequence in seq, map the level codes with the # order they appears in a list-like sequence # This mapping is then use to reorder the indexer for i, k in enumerate(seq): + if is_scalar(k): + # GH#34603 we want to treat a scalar the same as an all equal list + k = [k] if com.is_bool_indexer(k): new_order = np.arange(n)[indexer] elif is_list_like(k): @@ -3157,6 +3371,11 @@ def _reorder_indexer( key_order_map[level_indexer] = np.arange(len(level_indexer)) new_order = key_order_map[self.codes[i][indexer]] + elif isinstance(k, slice) and k.step is not None and k.step < 0: + new_order = np.arange(n)[k][indexer] + elif isinstance(k, slice) and k.start is None and k.stop is None: + # slice(None) should not determine order GH#31330 + new_order = np.ones((n,))[indexer] else: # For all other case, use the same order as the level new_order = np.arange(n)[indexer] @@ -3200,7 +3419,7 @@ def truncate(self, before=None, after=None): verify_integrity=False, ) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3215,21 +3434,19 @@ def equals(self, other) -> bool: if not isinstance(other, Index): return False + if len(self) != len(other): + return False + if not isinstance(other, MultiIndex): # d-level MultiIndex can equal d-tuple Index if not is_object_dtype(other.dtype): # other cannot contain tuples, so cannot match self return False - elif len(self) != len(other): - return False return array_equivalent(self._values, other._values) if self.nlevels != other.nlevels: return False - if len(self) != len(other): - return False - for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] @@ -3243,11 +3460,10 @@ def equals(self, other) -> bool: np.asarray(other.levels[i]._values), other_codes, allow_fill=False ) - # since we use NaT both datetime64 and timedelta64 - # we can have a situation where a level is typed say - # timedelta64 in self (IOW it has other values than NaT) - # but types datetime64 in other (where its all NaT) - # but these are equivalent + # since we use NaT both datetime64 and timedelta64 we can have a + # situation where a level is typed say timedelta64 in self (IOW it + # has other values than NaT) but types datetime64 in other (where + # its all NaT) but these are equivalent if len(self_values) == 0 and len(other_values) == 0: continue @@ -3348,7 +3564,12 @@ def union(self, other, sort=None): other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): - return self + return self.rename(result_names) + + return self._union(other, sort=sort) + + def _union(self, other, sort): + other, result_names = self._convert_can_do_setop(other) # TODO: Index.union returns other when `len(self)` is 0. @@ -3364,6 +3585,9 @@ def union(self, other, sort=None): zip(*uniq_tuples), sortorder=0, names=result_names ) + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + return is_object_dtype(dtype) + def intersection(self, other, sort=False): """ Form the intersection of two MultiIndex objects. 
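[Editor's note, not part of the patch: a sketch of the name-propagation rule the set-op changes above rely on. With `_convert_can_do_setop` now returning `get_unanimous_names(self, other)`, a level name survives a `union`/`intersection` only when both inputs agree on it; assuming that helper behaves as its name suggests, disagreeing levels come back as `None`.]

```python
# Minimal sketch (not part of the patch): unanimous-names behavior of
# MultiIndex set operations after this change. The two indexes hold the
# same tuples but disagree on the second level's name.
import pandas as pd

left = pd.MultiIndex.from_tuples([(1, "a"), (2, "b")], names=["x", "y"])
right = pd.MultiIndex.from_tuples([(1, "a"), (2, "b")], names=["x", "z"])

# Equal contents -> union short-circuits to self.rename(result_names),
# where result_names keeps only the names both inputs share.
print(left.union(right).names)  # FrozenList(['x', None])
```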
@@ -3390,17 +3614,18 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): - return self + if self.has_duplicates: + return self.unique().rename(result_names) + return self.rename(result_names) - if not is_object_dtype(other.dtype): + return self._intersection(other, sort=sort) + + def _intersection(self, other, sort=False): + other, result_names = self._convert_can_do_setop(other) + + if not self._is_comparable_dtype(other.dtype): # The intersection is empty - # TODO: we have no tests that get here - return MultiIndex( - levels=self.levels, - codes=[[]] * self.nlevels, - names=result_names, - verify_integrity=False, - ) + return self[:0].rename(result_names) lvals = self._values rvals = other._values @@ -3408,16 +3633,23 @@ def intersection(self, other, sort=False): uniq_tuples = None # flag whether _inner_indexer was successful if self.is_monotonic and other.is_monotonic: try: - uniq_tuples = self._inner_indexer(lvals, rvals)[0] - sort = False # uniq_tuples is already sorted + inner_tuples = self._inner_indexer(lvals, rvals)[0] + sort = False # inner_tuples is already sorted except TypeError: pass + else: + uniq_tuples = algos.unique(inner_tuples) if uniq_tuples is None: other_uniq = set(rvals) seen = set() + # pandas\core\indexes\multi.py:3503: error: "add" of "set" does not + # return a value [func-returns-value] uniq_tuples = [ - x for x in lvals if x in other_uniq and not (x in seen or seen.add(x)) + x + for x in lvals + if x in other_uniq + and not (x in seen or seen.add(x)) # type: ignore[func-returns-value] ] if sort is None: @@ -3461,7 +3693,7 @@ def difference(self, other, sort=None): other, result_names = self._convert_can_do_setop(other) if len(other) == 0: - return self + return self.rename(result_names) if self.equals(other): return MultiIndex( @@ -3497,21 +3729,32 @@ def _convert_can_do_setop(self, other): if not isinstance(other, Index): if len(other) == 0: - other = MultiIndex( - levels=[[]] * self.nlevels, - codes=[[]] * self.nlevels, - verify_integrity=False, - ) + return self[:0], self.names else: msg = "other must be a MultiIndex or a list of tuples" try: - other = MultiIndex.from_tuples(other) - except TypeError as err: + other = MultiIndex.from_tuples(other, names=self.names) + except (ValueError, TypeError) as err: + # ValueError raised by tuples_to_object_array if we + # have non-object dtype raise TypeError(msg) from err else: - result_names = self.names if self.names == other.names else None + result_names = get_unanimous_names(self, other) + return other, result_names + def symmetric_difference(self, other, result_name=None, sort=None): + # On equal symmetric_difference MultiIndexes the difference is empty. 
+ # Therefore, an empty MultiIndex is returned GH13490 + tups = Index.symmetric_difference(self, other, result_name, sort) + if len(tups) == 0: + return type(self)( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + names=tups.name, + ) + return type(self).from_tuples(tups, names=tups.name) + # -------------------------------------------------------------------- @doc(Index.astype) @@ -3522,13 +3765,22 @@ def astype(self, dtype, copy=True): raise NotImplementedError(msg) elif not is_object_dtype(dtype): raise TypeError( - f"Setting {type(self)} dtype to anything other " - "than object is not supported" + "Setting a MultiIndex dtype to anything other than object " + "is not supported" ) elif copy is True: return self._shallow_copy() return self + def _validate_fill_value(self, item): + if not isinstance(item, tuple): + # Pad the key with empty strings if lower levels of the key + # aren't specified: + item = (item,) + ("",) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item + def insert(self, loc: int, item): """ Make new MultiIndex inserting new item at location @@ -3543,12 +3795,7 @@ def insert(self, loc: int, item): ------- new_index : Index """ - # Pad the key with empty strings if lower levels of the key - # aren't specified: - if not isinstance(item, tuple): - item = (item,) + ("",) * (self.nlevels - 1) - elif len(item) != self.nlevels: - raise ValueError("Item must have length equal to number of levels.") + item = self._validate_fill_value(item) new_levels = [] new_codes = [] @@ -3558,7 +3805,12 @@ def insert(self, loc: int, item): # must insert at end otherwise you have to recompute all the # other codes lev_loc = len(level) - level = level.insert(lev_loc, k) + try: + level = level.insert(lev_loc, k) + except TypeError: + # TODO: Should this be done inside insert? + # TODO: smarter casting rules? 
+ level = level.astype(object).insert(lev_loc, k) else: lev_loc = level.get_loc(k) @@ -3585,10 +3837,6 @@ def delete(self, loc): verify_integrity=False, ) - def _wrap_joined_index(self, joined, other): - names = self.names if self.names == other.names else None - return MultiIndex.from_tuples(joined, names=names) - @doc(Index.isin) def isin(self, values, level=None): if level is None: @@ -3602,13 +3850,35 @@ def isin(self, values, level=None): return np.zeros(len(levs), dtype=np.bool_) return levs.isin(values) - -MultiIndex._add_numeric_methods_disabled() -MultiIndex._add_numeric_methods_add_sub_disabled() -MultiIndex._add_logical_methods_disabled() - - -def _sparsify(label_list, start: int = 0, sentinel=""): + # --------------------------------------------------------------- + # Arithmetic/Numeric Methods - Disabled + + __add__ = make_invalid_op("__add__") + __radd__ = make_invalid_op("__radd__") + __iadd__ = make_invalid_op("__iadd__") + __sub__ = make_invalid_op("__sub__") + __rsub__ = make_invalid_op("__rsub__") + __isub__ = make_invalid_op("__isub__") + __pow__ = make_invalid_op("__pow__") + __rpow__ = make_invalid_op("__rpow__") + __mul__ = make_invalid_op("__mul__") + __rmul__ = make_invalid_op("__rmul__") + __floordiv__ = make_invalid_op("__floordiv__") + __rfloordiv__ = make_invalid_op("__rfloordiv__") + __truediv__ = make_invalid_op("__truediv__") + __rtruediv__ = make_invalid_op("__rtruediv__") + __mod__ = make_invalid_op("__mod__") + __rmod__ = make_invalid_op("__rmod__") + __divmod__ = make_invalid_op("__divmod__") + __rdivmod__ = make_invalid_op("__rdivmod__") + # Unary methods disabled + __neg__ = make_invalid_op("__neg__") + __pos__ = make_invalid_op("__pos__") + __abs__ = make_invalid_op("__abs__") + __inv__ = make_invalid_op("__inv__") + + +def sparsify_labels(label_list, start: int = 0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) @@ -3658,13 +3928,13 @@ def maybe_droplevels(index, key): if isinstance(key, tuple): for _ in key: try: - index = index.droplevel(0) + index = index._drop_level_numbers([0]) except ValueError: # we have dropped too much, so back out return original_index else: try: - index = index.droplevel(0) + index = index._drop_level_numbers([0]) except ValueError: pass diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 5020a25c88ff4..ed76e26a57634 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,10 +1,11 @@ from typing import Any +import warnings import numpy as np from pandas._libs import index as libindex, lib -from pandas._typing import Dtype, Label -from pandas.util._decorators import cache_readonly, doc +from pandas._typing import Dtype, DtypeObj, Label +from pandas.util._decorators import doc from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -15,27 +16,20 @@ is_float, is_float_dtype, is_integer_dtype, + is_numeric_dtype, is_scalar, is_signed_integer_dtype, is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCFloat64Index, - ABCInt64Index, - ABCRangeIndex, - ABCSeries, - ABCUInt64Index, -) -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import algorithms import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name -from pandas.core.ops import get_op_result_name -_num_index_shared_docs = dict() 
+_num_index_shared_docs = {} class NumericIndex(Index): @@ -45,7 +39,10 @@ class NumericIndex(Index): This is an abstract class. """ + _default_dtype: np.dtype + _is_numeric_dtype = True + _can_hold_strings = False def __new__(cls, data=None, dtype=None, copy=False, name=None): cls._validate_dtype(dtype) @@ -95,13 +92,18 @@ def _validate_dtype(cls, dtype: Dtype) -> None: f"Incorrect `dtype` passed: expected {expected}, received {dtype}" ) + # ---------------------------------------------------------------- + # Indexing Methods + @doc(Index._maybe_cast_slice_bound) - def _maybe_cast_slice_bound(self, label, side, kind): + def _maybe_cast_slice_bound(self, label, side: str, kind): assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) + # ---------------------------------------------------------------- + @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = lib.no_default): if values is not None and not self._can_hold_na and values.dtype.kind == "f": @@ -110,7 +112,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): return Float64Index._simple_new(values, name=name) return super()._shallow_copy(values=values, name=name) - def _convert_for_op(self, value): + def _validate_fill_value(self, value): """ Convert value to be insertable to ndarray. """ @@ -118,6 +120,14 @@ def _convert_for_op(self, value): # force conversion to object # so we don't lose the bools raise TypeError + elif isinstance(value, str) or lib.is_complex(value): + raise TypeError + elif is_scalar(value) and isna(value): + if is_valid_nat_for_dtype(value, self.dtype): + value = self._na_value + else: + # NaT, np.datetime64("NaT"), np.timedelta64("NaT") + raise TypeError return value @@ -138,6 +148,10 @@ def _convert_tolerance(self, tolerance, target): ) return tolerance + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + # If we ever have BoolIndex or ComplexIndex, this may need to be tightened + return is_numeric_dtype(dtype) + @classmethod def _assert_safe_casting(cls, data, subarr): """ @@ -148,7 +162,7 @@ def _assert_safe_casting(cls, data, subarr): pass @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: """ Checks that all the labels are datetime objects. """ @@ -156,9 +170,11 @@ def is_all_dates(self) -> bool: @doc(Index.insert) def insert(self, loc: int, item): - # treat NA values as nans: - if is_scalar(item) and isna(item): - item = self._na_value + try: + item = self._validate_fill_value(item) + except TypeError: + return self.astype(object).insert(loc, item) + return super().insert(loc, item) def _union(self, other, sort): @@ -182,7 +198,7 @@ def _union(self, other, sort): _num_index_shared_docs[ "class_descr" ] = """ - Immutable ndarray implementing an ordered, sliceable set. The basic object + Immutable sequence used for indexing and alignment. The basic object storing axis labels for all pandas objects. %(klass)s is a special case of `Index` with purely %(ltype)s labels. %(extra)s. @@ -212,7 +228,12 @@ def _union(self, other, sort): An Index instance can **only** contain hashable objects. 
""" -_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="") +_int64_descr_args = { + "klass": "Int64Index", + "ltype": "integer", + "dtype": "int64", + "extra": "", +} class IntegerIndex(NumericIndex): @@ -221,6 +242,20 @@ class IntegerIndex(NumericIndex): """ _default_dtype: np.dtype + _can_hold_na = False + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented with matching signed-ness. + """ + if data.dtype.kind != cls._default_dtype.kind: + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return other.dtype == "f8" or other.dtype == self.dtype def __contains__(self, key) -> bool: """ @@ -244,6 +279,11 @@ def inferred_type(self) -> str: @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak + warnings.warn( + "Index.asi8 is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=2, + ) return self._values.view(self._default_dtype) @@ -251,46 +291,28 @@ class Int64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args _typ = "int64index" - _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.dtype(np.int64) - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return Int64Index(joined, name=name) - - @classmethod - def _assert_safe_casting(cls, data, subarr): - """ - Ensure incoming data can be represented as ints. - """ - if not issubclass(data.dtype.type, np.signedinteger): - if not np.array_equal(data, subarr): - raise TypeError("Unsafe NumPy casting, you must explicitly cast") - - def _is_compatible_with_other(self, other) -> bool: - return super()._is_compatible_with_other(other) or all( - isinstance(obj, (ABCInt64Index, ABCFloat64Index, ABCRangeIndex)) - for obj in [self, other] - ) - -Int64Index._add_numeric_methods() -Int64Index._add_logical_methods() - -_uint64_descr_args = dict( - klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" -) +_uint64_descr_args = { + "klass": "UInt64Index", + "ltype": "unsigned integer", + "dtype": "uint64", + "extra": "", +} class UInt64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args _typ = "uint64index" - _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) + # ---------------------------------------------------------------- + # Indexing Methods + @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so that the values returned @@ -303,40 +325,13 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) - @doc(Index._convert_index_indexer) - def _convert_index_indexer(self, keyarr): - # Cast the indexer to uint64 if possible so - # that the values returned from indexing are - # also uint64. - if keyarr.is_integer(): - return keyarr.astype(np.uint64) - return keyarr - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return UInt64Index(joined, name=name) - - @classmethod - def _assert_safe_casting(cls, data, subarr): - """ - Ensure incoming data can be represented as uints. 
- """ - if not issubclass(data.dtype.type, np.unsignedinteger): - if not np.array_equal(data, subarr): - raise TypeError("Unsafe NumPy casting, you must explicitly cast") - - def _is_compatible_with_other(self, other) -> bool: - return super()._is_compatible_with_other(other) or all( - isinstance(obj, (ABCUInt64Index, ABCFloat64Index)) for obj in [self, other] - ) - -UInt64Index._add_numeric_methods() -UInt64Index._add_logical_methods() - -_float64_descr_args = dict( - klass="Float64Index", dtype="float64", ltype="float", extra="" -) +_float64_descr_args = { + "klass": "Float64Index", + "dtype": "float64", + "ltype": "float", + "extra": "", +} class Float64Index(NumericIndex): @@ -344,7 +339,7 @@ class Float64Index(NumericIndex): _typ = "float64index" _engine_type = libindex.Float64Engine - _default_dtype = np.float64 + _default_dtype = np.dtype(np.float64) @property def inferred_type(self) -> str: @@ -383,6 +378,22 @@ def _convert_slice_indexer(self, key: slice, kind: str): # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) + @doc(Index.get_loc) + def get_loc(self, key, method=None, tolerance=None): + if is_bool(key): + # Catch this to avoid accidentally casting to 1.0 + raise KeyError(key) + + if is_float(key) and np.isnan(key): + nan_idxs = self._nan_idxs + if not len(nan_idxs): + raise KeyError(key) + elif len(nan_idxs) == 1: + return nan_idxs[0] + return nan_idxs + + return super().get_loc(key, method=method, tolerance=tolerance) + # ---------------------------------------------------------------- def _format_native_types( @@ -400,28 +411,6 @@ def _format_native_types( ) return formatter.get_result_as_array() - def equals(self, other) -> bool: - """ - Determines if two Index objects contain the same elements. 
- """ - if self is other: - return True - - if not isinstance(other, Index): - return False - - # need to compare nans locations and make sure that they are the same - # since nans don't compare equal this is a bit tricky - try: - if not isinstance(other, Float64Index): - other = self._constructor(other) - if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape: - return False - left, right = self._values, other._values - return ((left == right) | (self._isnan & other._isnan)).all() - except (TypeError, ValueError): - return False - def __contains__(self, other: Any) -> bool: hash(other) if super().__contains__(other): @@ -429,40 +418,6 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans - @doc(Index.get_loc) - def get_loc(self, key, method=None, tolerance=None): - if is_bool(key): - # Catch this to avoid accidentally casting to 1.0 - raise KeyError(key) - - if is_float(key) and np.isnan(key): - nan_idxs = self._nan_idxs - if not len(nan_idxs): - raise KeyError(key) - elif len(nan_idxs) == 1: - return nan_idxs[0] - return nan_idxs - - return super().get_loc(key, method=method, tolerance=tolerance) - - @cache_readonly - def is_unique(self) -> bool: - return super().is_unique and self._nan_idxs.size < 2 - - @doc(Index.isin) - def isin(self, values, level=None): - if level is not None: - self._validate_index_level(level) - return algorithms.isin(np.array(self), values) - - def _is_compatible_with_other(self, other) -> bool: - return super()._is_compatible_with_other(other) or all( - isinstance( - obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), - ) - for obj in [self, other] - ) - - -Float64Index._add_numeric_methods() -Float64Index._add_logical_methods_disabled() + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return is_numeric_dtype(other.dtype) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 03e11b652477f..ac9fb31a4c35b 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,18 +1,17 @@ from datetime import datetime, timedelta from typing import Any +import warnings import numpy as np -from pandas._libs import index as libindex -from pandas._libs.lib import no_default +from pandas._libs import index as libindex, lib from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -44,7 +43,7 @@ from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) +_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"}) # --- Period index sketch @@ -61,12 +60,12 @@ def _new_PeriodIndex(cls, **d): @inherit_names( - ["strftime", "to_timestamp", "start_time", "end_time"] + PeriodArray._field_ops, + ["strftime", "start_time", "end_time"] + PeriodArray._field_ops, PeriodArray, wrap=True, ) @inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) -class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): +class PeriodIndex(DatetimeIndexOpsMixin): 
""" Immutable ndarray holding ordinal values indicating regular periods in time. @@ -96,7 +95,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): ---------- day dayofweek + day_of_week dayofyear + day_of_year days_in_month daysinmonth end_time @@ -145,17 +146,42 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): _data: PeriodArray freq: BaseOffset + _data_cls = PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True # -------------------------------------------------------------------- - # methods that dispatch to array and wrap result in PeriodIndex + # methods that dispatch to array and wrap result in Index + # These are defined here instead of via inherit_names for mypy @doc(PeriodArray.asfreq) def asfreq(self, freq=None, how: str = "E") -> "PeriodIndex": arr = self._data.asfreq(freq, how) return type(self)._simple_new(arr, name=self.name) + @doc(PeriodArray.to_timestamp) + def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: + arr = self._data.to_timestamp(freq, how) + return DatetimeIndex._simple_new(arr, name=self.name) + + # error: Decorated property not supported [misc] + @property # type:ignore[misc] + @doc(PeriodArray.hour.fget) + def hour(self) -> Int64Index: + return Int64Index(self._data.hour, name=self.name) + + # error: Decorated property not supported [misc] + @property # type:ignore[misc] + @doc(PeriodArray.minute.fget) + def minute(self) -> Int64Index: + return Int64Index(self._data.minute, name=self.name) + + # error: Decorated property not supported [misc] + @property # type:ignore[misc] + @doc(PeriodArray.second.fget) + def second(self) -> Int64Index: + return Int64Index(self._data.second, name=self.name) + # ------------------------------------------------------------------------ # Index Constructors @@ -209,7 +235,7 @@ def __new__( if data is None and ordinal is not None: # we strangely ignore `ordinal` if data is passed. ordinal = np.asarray(ordinal, dtype=np.int64) - data = PeriodArray(ordinal, freq) + data = PeriodArray(ordinal, freq=freq) else: # don't pass copy here, since we copy later. data = period_array(data=data, freq=freq) @@ -219,49 +245,12 @@ def __new__( return cls._simple_new(data, name=name) - @classmethod - def _simple_new(cls, values: PeriodArray, name: Label = None): - """ - Create a new PeriodIndex. - - Parameters - ---------- - values : PeriodArray - Values that can be converted to a PeriodArray without inference - or coercion. - """ - assert isinstance(values, PeriodArray), type(values) - - result = object.__new__(cls) - result._data = values - # For groupby perf. 
See note in indexes/base about _index_data - result._index_data = values._data - result.name = name - result._cache = {} - result._reset_identity() - return result - # ------------------------------------------------------------------------ # Data @property - def values(self): - return np.asarray(self) - - @property - def _has_complex_internals(self): - # used to avoid libreduction code paths, which raise or require conversion - return True - - def _shallow_copy(self, values=None, name: Label = no_default): - name = name if name is not no_default else self.name - cache = self._cache.copy() if values is None else {} - if values is None: - values = self._data - - result = self._simple_new(values, name=name) - result._cache = cache - return result + def values(self) -> np.ndarray: + return np.asarray(self, dtype=object) def _maybe_convert_timedelta(self, other): """ @@ -314,10 +303,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.astype(object)._values - @property - def _formatter_func(self): - return self.array._formatter(boxed=False) - # ------------------------------------------------------------------------ # Indexing @@ -345,10 +330,13 @@ def _int64index(self) -> Int64Index: def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc. Needs additional handling as - PeriodIndex stores internal data as int dtype + Gets called after a ufunc and other functions. + + Needs additional handling as PeriodIndex stores internal data as int + dtype - Replace this to __numpy_ufunc__ in future version + Replace this to __numpy_ufunc__ in future version and implement + __array_function__ for Indexes """ if isinstance(context, tuple) and len(context) > 0: func = context[0] @@ -376,39 +364,39 @@ def __array_wrap__(self, result, context=None): # cannot pass _simple_new as it is return type(self)(result, freq=self.freq, name=self.name) - def asof_locs(self, where, mask: np.ndarray) -> np.ndarray: + def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: """ where : array of timestamps mask : array of booleans where data is not NA """ - where_idx = where - if isinstance(where_idx, DatetimeIndex): - where_idx = PeriodIndex(where_idx._values, freq=self.freq) - elif not isinstance(where_idx, PeriodIndex): + if isinstance(where, DatetimeIndex): + where = PeriodIndex(where._values, freq=self.freq) + elif not isinstance(where, PeriodIndex): raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex") - elif where_idx.freq != self.freq: - raise raise_on_incompatible(self, where_idx) - locs = self.asi8[mask].searchsorted(where_idx.asi8, side="right") - - locs = np.where(locs > 0, locs - 1, 0) - result = np.arange(len(self))[mask].take(locs) - - first = mask.argmax() - result[(locs == 0) & (where_idx.asi8 < self.asi8[first])] = -1 - - return result + return super().asof_locs(where, mask) @doc(Index.astype) - def astype(self, dtype, copy=True, how="start"): + def astype(self, dtype, copy: bool = True, how=lib.no_default): dtype = pandas_dtype(dtype) + if how is not lib.no_default: + # GH#37982 + warnings.warn( + "The 'how' keyword in PeriodIndex.astype is deprecated and " + "will be removed in a future version. " + "Use index.to_timestamp(how=how) instead", + FutureWarning, + stacklevel=2, + ) + else: + how = "start" + if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. 
tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) - # TODO: should probably raise on `how` here, so we don't ignore it. return super().astype(dtype, copy=copy) @property @@ -419,7 +407,7 @@ def is_full(self) -> bool: """ if len(self) == 0: return True - if not self.is_monotonic: + if not self.is_monotonic_increasing: raise ValueError("Index is not monotonic") values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() @@ -430,17 +418,44 @@ def inferred_type(self) -> str: # indexing return "period" + def insert(self, loc: int, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.astype(object).insert(loc, item) + + return DatetimeIndexOpsMixin.insert(self, loc, item) + + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + if not isinstance(other, PeriodIndex): + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + # _assert_can_do_setop ensures we have matching dtype + result = super().join( + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + return result + + # ------------------------------------------------------------------------ + # Indexing Methods + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - target = ensure_index(target) + def _get_indexer(self, target: Index, method=None, limit=None, tolerance=None): - if isinstance(target, PeriodIndex): - if target.freq != self.freq: - # No matches - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches + if not self._should_compare(target): + return self._get_indexer_non_comparable(target, method, unique=True) - target = target.asi8 + if isinstance(target, PeriodIndex): + target = target._int64index # i.e. 
target.asi8 self_index = self._int64index else: self_index = self @@ -451,20 +466,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): # convert tolerance to i8 tolerance = self._maybe_convert_timedelta(tolerance) - return Index.get_indexer(self_index, target, method, limit, tolerance) - - @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - - if not self._is_comparable_dtype(target.dtype): - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches - - target = target.asi8 - - indexer, missing = self._int64index.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing + return Index._get_indexer(self_index, target, method, limit, tolerance) def get_loc(self, key, method=None, tolerance=None): """ @@ -501,7 +503,7 @@ def get_loc(self, key, method=None, tolerance=None): try: asdt, reso = parse_time_string(key, self.freq) - except DateParseError as err: + except (ValueError, DateParseError) as err: # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") from err @@ -572,10 +574,9 @@ def _maybe_cast_slice_bound(self, label, side: str, kind: str): return bounds[0 if side == "left" else 1] except ValueError as err: # string cannot be parsed as datetime-like - # TODO: we need tests for this case - raise KeyError(label) from err + raise self._invalid_indexer("slice", label) from err elif is_integer(label) or is_float(label): - self._invalid_indexer("slice", label) + raise self._invalid_indexer("slice", label) return label @@ -595,47 +596,14 @@ def _validate_partial_date_slice(self, reso: Resolution): # why is that check not needed? raise ValueError - def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): - # TODO: Check for non-True use_lhs/use_rhs + def _get_string_slice(self, key: str): parsed, reso = parse_time_string(key, self.freq) reso = Resolution.from_attrname(reso) try: - return self._partial_date_slice(reso, parsed, use_lhs, use_rhs) + return self._partial_date_slice(reso, parsed) except KeyError as err: raise KeyError(key) from err - def insert(self, loc, item): - if not isinstance(item, Period) or self.freq != item.freq: - return self.astype(object).insert(loc, item) - - i8result = np.concatenate( - (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) - ) - arr = type(self._data)._simple_new(i8result, dtype=self.dtype) - return type(self)._simple_new(arr, name=self.name) - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - self._assert_can_do_setop(other) - - if not isinstance(other, PeriodIndex): - return self.astype(object).join( - other, how=how, level=level, return_indexers=return_indexers, sort=sort - ) - - # _assert_can_do_setop ensures we have matching dtype - result = Int64Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - return result - # ------------------------------------------------------------------------ # Set Operation Methods @@ -667,15 +635,21 @@ def _setop(self, other, sort, opname: str): def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other = ensure_index(other) + other, _ = self._convert_can_do_setop(other) if self.equals(other): + if self.has_duplicates: + return self.unique()._get_reconciled_name_object(other) return 
self._get_reconciled_name_object(other) - elif is_object_dtype(other.dtype): + return self._intersection(other, sort=sort) + + def _intersection(self, other, sort=False): + + if is_object_dtype(other.dtype): return self.astype("O").intersection(other, sort=sort) - elif not is_dtype_equal(self.dtype, other.dtype): + elif not self._is_comparable_dtype(other.dtype): # We can infer that the intersection is empty. # assert_can_do_setop ensures that this is not just a mismatched freq this = self[:0].astype("O") @@ -687,11 +661,14 @@ def intersection(self, other, sort=False): def difference(self, other, sort=None): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other = ensure_index(other) + other, result_name = self._convert_can_do_setop(other) if self.equals(other): - # pass an empty PeriodArray with the appropriate dtype - return type(self)._simple_new(self._data[:0], name=self.name) + return self[:0].rename(result_name) + + return self._difference(other, sort=sort) + + def _difference(self, other, sort): if is_object_dtype(other): return self.astype(object).difference(other).astype(self.dtype) @@ -717,17 +694,13 @@ def _union(self, other, sort): # ------------------------------------------------------------------------ - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: result = super().memory_usage(deep=deep) if hasattr(self, "_cache") and "_int64index" in self._cache: result += self._int64index.memory_usage(deep=deep) return result -PeriodIndex._add_numeric_methods_disabled() -PeriodIndex._add_logical_methods_disabled() - - def period_range( start=None, end=None, periods=None, freq=None, name=None ) -> PeriodIndex: diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6d9fd6efe54a3..f14c126180642 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple import warnings import numpy as np @@ -9,7 +9,6 @@ from pandas._libs import index as libindex from pandas._libs.lib import no_default from pandas._typing import Label -import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly, doc @@ -18,9 +17,9 @@ ensure_python_int, is_float, is_integer, - is_integer_dtype, is_list_like, is_scalar, + is_signed_integer_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCTimedeltaIndex @@ -30,11 +29,9 @@ from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name -from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.numeric import Float64Index, Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.io.formats.printing import pprint_thing - _empty_range = range(0) @@ -55,10 +52,12 @@ class RangeIndex(Int64Index): If int and "stop" is not given, interpreted as "stop" instead. stop : int (default: 0) step : int (default: 1) - name : object, optional - Name to be stored in the index. + dtype : np.int64 + Unused, accepted for homogeneity with other index types. copy : bool, default False Unused, accepted for homogeneity with other index types. + name : object, optional + Name to be stored in the index. 
Attributes ---------- @@ -80,13 +79,11 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine _range: range - # check whether self._data has been called - _cached_data: Optional[np.ndarray] = None # -------------------------------------------------------------------- # Constructors def __new__( - cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None ): cls._validate_dtype(dtype) @@ -152,20 +149,14 @@ def _constructor(self): """ return the class to use for construction """ return Int64Index - @property + @cache_readonly def _data(self): """ An int array that for performance reasons is created only when needed. - The constructed array is saved in ``_cached_data``. This allows us to - check if the array has been created without accessing ``_data`` and - triggering the construction. + The constructed array is saved in ``_cache``. """ - if self._cached_data is None: - self._cached_data = np.arange( - self.start, self.stop, self.step, dtype=np.int64 - ) - return self._cached_data + return np.arange(self.start, self.stop, self.step, dtype=np.int64) @cache_readonly def _int64index(self) -> Int64Index: @@ -197,8 +188,14 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep="NaN") -> List[str]: - return header + list(map(pprint_thing, self._range)) + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: + if not len(self._range): + return header + first_val_str = str(self._range[0]) + last_val_str = str(self._range[-1]) + max_length = max(len(first_val_str), len(last_val_str)) + + return header + [f"{x:<{max_length}}" for x in self._range] # -------------------------------------------------------------------- _deprecation_message = ( @@ -342,6 +339,9 @@ def __contains__(self, key: Any) -> bool: return False return key in self._range + # -------------------------------------------------------------------- + # Indexing Methods + @doc(Int64Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if method is None and tolerance is None: @@ -355,9 +355,9 @@ def get_loc(self, key, method=None, tolerance=None): return super().get_loc(key, method=method, tolerance=tolerance) @Appender(_index_shared_docs["get_indexer"]) - def get_indexer(self, target, method=None, limit=None, tolerance=None): + def _get_indexer(self, target, method=None, limit=None, tolerance=None): if com.any_not_none(method, tolerance, limit) or not is_list_like(target): - return super().get_indexer( + return super()._get_indexer( target, method=method, tolerance=tolerance, limit=limit ) @@ -369,9 +369,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): start, stop, step = reverse.start, reverse.stop, reverse.step target_array = np.asarray(target) - if not (is_integer_dtype(target_array) and target_array.ndim == 1): + if not (is_signed_integer_dtype(target_array) and target_array.ndim == 1): # checks/conversions/roundings are delegated to general method - return super().get_indexer(target, method=method, tolerance=tolerance) + return super()._get_indexer(target, method=method, tolerance=tolerance) locs = target_array - start valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) @@ -383,26 +383,42 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): locs[valid] = len(self) - 1 - locs[valid] return ensure_platform_int(locs) + # 
-------------------------------------------------------------------- + def tolist(self): return list(self._range) + @doc(Int64Index.__iter__) + def __iter__(self): + yield from self._range + @doc(Int64Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name - if values is None: - result = self._simple_new(self._range, name=name) - result._cache = self._cache.copy() - return result - else: + if values is not None: + if values.dtype.kind == "f": + return Float64Index(values, name=name) return Int64Index._simple_new(values, name=name) + result = self._simple_new(self._range, name=name) + result._cache = self._cache + return result + @doc(Int64Index.copy) - def copy(self, name=None, deep=False, dtype=None, **kwargs): - self._validate_dtype(dtype) - if name is None: - name = self.name - return self.from_range(self._range, name=name) + def copy(self, name=None, deep=False, dtype=None, names=None): + name = self._validate_names(name=name, names=names, deep=deep)[0] + new_index = self._shallow_copy(name=name) + + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) + new_index = new_index.astype(dtype) + return new_index def _minmax(self, meth: str): no_steps = len(self) - 1 @@ -445,7 +461,17 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) - def equals(self, other) -> bool: + def factorize( + self, sort: bool = False, na_sentinel: Optional[int] = -1 + ) -> Tuple[np.ndarray, "RangeIndex"]: + codes = np.arange(len(self), dtype=np.intp) + uniques = self + if sort and self.step < 0: + codes = codes[::-1] + uniques = uniques[::-1] + return codes, uniques + + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -453,34 +479,14 @@ def equals(self, other) -> bool: return self._range == other._range return super().equals(other) - def intersection(self, other, sort=False): - """ - Form the intersection of two Index objects. - - Parameters - ---------- - other : Index or array-like - sort : False or None, default False - Sort the resulting index if possible - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. 
-
-        Returns
-        -------
-        intersection : Index
-        """
-        self._validate_sort_keyword(sort)
+    # --------------------------------------------------------------------
+    # Set Operations

-        if self.equals(other):
-            return self._get_reconciled_name_object(other)
+    def _intersection(self, other, sort=False):

         if not isinstance(other, RangeIndex):
-            return super().intersection(other, sort=sort)
+            # Int64Index
+            return super()._intersection(other, sort=sort)

         if not len(self) or not len(other):
             return self._simple_new(_empty_range)
@@ -521,6 +527,7 @@ def intersection(self, other, sort=False):
             new_index = new_index[::-1]
         if sort is None:
             new_index = new_index.sort_values()
+
         return new_index

     def _min_fitting_element(self, lower_limit: int) -> int:
@@ -619,6 +626,63 @@ def _union(self, other, sort):
             return type(self)(start_r, end_r + step_o, step_o)
         return self._int64index._union(other, sort=sort)

+    def difference(self, other, sort=None):
+        # optimized set operation if we have another RangeIndex
+        self._validate_sort_keyword(sort)
+        self._assert_can_do_setop(other)
+        other, result_name = self._convert_can_do_setop(other)
+
+        if not isinstance(other, RangeIndex):
+            return super().difference(other, sort=sort)
+
+        res_name = ops.get_op_result_name(self, other)
+
+        first = self._range[::-1] if self.step < 0 else self._range
+        overlap = self.intersection(other)
+        if overlap.step < 0:
+            overlap = overlap[::-1]
+
+        if len(overlap) == 0:
+            return self._shallow_copy(name=res_name)
+        if len(overlap) == len(self):
+            return self[:0].rename(res_name)
+        if not isinstance(overlap, RangeIndex):
+            # We won't end up with RangeIndex, so fall back
+            return super().difference(other, sort=sort)
+        if overlap.step != first.step:
+            # In some cases we might be able to get a RangeIndex back,
+            # but not worth the effort.
+ return super().difference(other, sort=sort) + + if overlap[0] == first.start: + # The difference is everything after the intersection + new_rng = range(overlap[-1] + first.step, first.stop, first.step) + elif overlap[-1] == first[-1]: + # The difference is everything before the intersection + new_rng = range(first.start, overlap[0], first.step) + else: + # The difference is not range-like + return super().difference(other, sort=sort) + + new_index = type(self)._simple_new(new_rng, name=res_name) + if first is not self._range: + new_index = new_index[::-1] + return new_index + + def symmetric_difference(self, other, result_name=None, sort=None): + if not isinstance(other, RangeIndex) or sort is not None: + return super().symmetric_difference(other, result_name, sort) + + left = self.difference(other) + right = other.difference(self) + result = left.union(right) + + if result_name is not None: + result = result.rename(result_name) + return result + + # -------------------------------------------------------------------- + @doc(Int64Index.join) def join(self, other, how="left", level=None, return_indexers=False, sort=False): if how == "outer" and self is not other: @@ -731,89 +795,92 @@ def __floordiv__(self, other): return self._simple_new(new_range, name=self.name) return self._int64index // other - def all(self) -> bool: + # -------------------------------------------------------------------- + # Reductions + + def all(self, *args, **kwargs) -> bool: return 0 not in self._range - def any(self) -> bool: + def any(self, *args, **kwargs) -> bool: return any(self._range) - @classmethod - def _add_numeric_methods_binary(cls): - """ add in numeric methods, specialized to RangeIndex """ - - def _make_evaluate_binop(op, step=False): - """ - Parameters - ---------- - op : callable that accepts 2 params - perform the binary op - step : callable, optional, default to False - op to apply to the step parm if not None - if False, use the existing step - """ - - @unpack_zerodim_and_defer(op.__name__) - def _evaluate_numeric_binop(self, other): - if isinstance(other, ABCTimedeltaIndex): - # Defer to TimedeltaIndex implementation - return NotImplemented - elif isinstance(other, (timedelta, np.timedelta64)): - # GH#19333 is_integer evaluated True on timedelta64, - # so we need to catch these explicitly - return op(self._int64index, other) - elif is_timedelta64_dtype(other): - # Must be an np.ndarray; GH#22390 - return op(self._int64index, other) - - other = extract_array(other, extract_numpy=True) - attrs = self._get_attributes_dict() - - left, right = self, other - - try: - # apply if we have an override - if step: - with np.errstate(all="ignore"): - rstep = step(left.step, right) - - # we don't have a representable op - # so return a base index - if not is_integer(rstep) or not rstep: - raise ValueError + # -------------------------------------------------------------------- - else: - rstep = left.step + def _cmp_method(self, other, op): + if isinstance(other, RangeIndex) and self._range == other._range: + # Both are immutable so if ._range attr. 
are equal, shortcut is possible + return super()._cmp_method(self, op) + return super()._cmp_method(other, op) - with np.errstate(all="ignore"): - rstart = op(left.start, right) - rstop = op(left.stop, right) + def _arith_method(self, other, op): + """ + Parameters + ---------- + other : Any + op : callable that accepts 2 params + perform the binary op + """ + + if isinstance(other, ABCTimedeltaIndex): + # Defer to TimedeltaIndex implementation + return NotImplemented + elif isinstance(other, (timedelta, np.timedelta64)): + # GH#19333 is_integer evaluated True on timedelta64, + # so we need to catch these explicitly + return op(self._int64index, other) + elif is_timedelta64_dtype(other): + # Must be an np.ndarray; GH#22390 + return op(self._int64index, other) + + if op in [ + operator.pow, + ops.rpow, + operator.mod, + ops.rmod, + ops.rfloordiv, + divmod, + ops.rdivmod, + ]: + return op(self._int64index, other) + + step = False + if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]: + step = op + + other = extract_array(other, extract_numpy=True) + attrs = self._get_attributes_dict() + + left, right = self, other - result = type(self)(rstart, rstop, rstep, **attrs) + try: + # apply if we have an override + if step: + with np.errstate(all="ignore"): + rstep = step(left.step, right) - # for compat with numpy / Int64Index - # even if we can represent as a RangeIndex, return - # as a Float64Index if we have float-like descriptors - if not all(is_integer(x) for x in [rstart, rstop, rstep]): - result = result.astype("float64") + # we don't have a representable op + # so return a base index + if not is_integer(rstep) or not rstep: + raise ValueError - return result + else: + rstep = left.step - except (ValueError, TypeError, ZeroDivisionError): - # Defer to Int64Index implementation - return op(self._int64index, other) - # TODO: Do attrs get handled reliably? + with np.errstate(all="ignore"): + rstart = op(left.start, right) + rstop = op(left.stop, right) - name = f"__{op.__name__}__" - return compat.set_function_name(_evaluate_numeric_binop, name, cls) + result = type(self)(rstart, rstop, rstep, **attrs) - cls.__add__ = _make_evaluate_binop(operator.add) - cls.__radd__ = _make_evaluate_binop(ops.radd) - cls.__sub__ = _make_evaluate_binop(operator.sub) - cls.__rsub__ = _make_evaluate_binop(ops.rsub) - cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul) - cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul) - cls.__truediv__ = _make_evaluate_binop(operator.truediv, step=operator.truediv) - cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, step=ops.rtruediv) + # for compat with numpy / Int64Index + # even if we can represent as a RangeIndex, return + # as a Float64Index if we have float-like descriptors + if not all(is_integer(x) for x in [rstart, rstop, rstep]): + result = result.astype("float64") + return result -RangeIndex._add_numeric_methods() + except (ValueError, TypeError, ZeroDivisionError): + # Defer to Int64Index implementation + return op(self._int64index, other) + # TODO: Do attrs get handled reliably? 
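Note: the RangeIndex.difference fast path added above keeps the result a
RangeIndex only when the overlap trims one end of the range; anything else
falls back to the Int64Index implementation. A minimal sketch of the intended
behavior (values are illustrative and assume a pandas build that includes
this change):

    import pandas as pd

    idx = pd.RangeIndex(10)  # RangeIndex(start=0, stop=10, step=1)

    # The overlap is a prefix of the range, so the result stays a RangeIndex.
    idx.difference(pd.RangeIndex(5))     # RangeIndex(start=5, stop=10, step=1)

    # A hole in the middle is not range-like, so this falls back to
    # super().difference() and materializes an Int64Index.
    idx.difference(pd.RangeIndex(3, 5))  # Int64Index([0, 1, 2, 5, 6, 7, 8, 9])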
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index dccc8369c5366..fcab3e1f6a0a4 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -2,14 +2,12 @@ from pandas._libs import index as libindex, lib from pandas._libs.tslibs import Timedelta, to_offset -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import doc from pandas.core.dtypes.common import ( TD64NS_DTYPE, - is_float, - is_integer, is_scalar, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -105,6 +103,7 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _typ = "timedeltaindex" + _data_cls = TimedeltaArray _engine_type = libindex.TimedeltaEngine _comparables = ["name", "freq"] @@ -153,38 +152,15 @@ def __new__( # - Cases checked above all return/raise before reaching here - # - tdarr = TimedeltaArray._from_sequence( + tdarr = TimedeltaArray._from_sequence_not_strict( data, freq=freq, unit=unit, dtype=dtype, copy=copy ) return cls._simple_new(tdarr, name=name) - @classmethod - def _simple_new(cls, values: TimedeltaArray, name: Label = None): - assert isinstance(values, TimedeltaArray) - - result = object.__new__(cls) - result._data = values - result._name = name - result._cache = {} - # For groupby perf. See note in indexes/base about _index_data - result._index_data = values._data - - result._reset_identity() - return result - - # ------------------------------------------------------------------- - # Rendering Methods - - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_timedelta64 - - return _get_format_timedelta64(self, box=True) - # ------------------------------------------------------------------- @doc(Index.astype) - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): # Have to repeat the check for 'timedelta64' (not ns) dtype @@ -202,6 +178,9 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ return is_timedelta64_dtype(dtype) + # ------------------------------------------------------------------- + # Indexing Methods + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -214,7 +193,7 @@ def get_loc(self, key, method=None, tolerance=None): raise InvalidIndexError(key) try: - key = self._data._validate_scalar(key, cast_str=True) + key = self._data._validate_scalar(key, unbox=False) except TypeError as err: raise KeyError(key) from err @@ -243,22 +222,18 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): return lbound else: return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") - elif is_integer(label) or is_float(label): - self._invalid_indexer("slice", label) + elif not isinstance(label, self._data._recognized_scalars): + raise self._invalid_indexer("slice", label) return label - def is_type_compatible(self, typ) -> bool: - return typ == self.inferred_type or typ == "timedelta" + # ------------------------------------------------------------------- @property def inferred_type(self) -> str: return "timedelta64" -TimedeltaIndex._add_logical_methods_disabled() - - def timedelta_range( start=None, end=None, periods=None, freq=None, name=None, closed=None ) -> TimedeltaIndex: @@ -323,8 +298,8 @@ def timedelta_range( >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) TimedeltaIndex(['1 days 
00:00:00', '2 days 08:00:00', '3 days 16:00:00', - '5 days 00:00:00'], - dtype='timedelta64[ns]', freq='32H') + '5 days 00:00:00'], + dtype='timedelta64[ns]', freq=None) """ if freq is None and com.any_none(periods, start, end): freq = "D" diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 04d1dbceb3342..e7cf8cae28b88 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,10 +1,12 @@ -from typing import TYPE_CHECKING, Hashable, List, Tuple, Union +from contextlib import suppress +from typing import TYPE_CHECKING, Any, Hashable, List, Sequence, Tuple, Union +import warnings import numpy as np from pandas._config.config import option_context -from pandas._libs.indexing import _NDFrameIndexerBase +from pandas._libs.indexing import NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc @@ -22,7 +24,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries -from pandas.core.dtypes.missing import _infer_fill_value, isna +from pandas.core.dtypes.missing import infer_fill_value, isna import pandas.core.common as com from pandas.core.construction import array as pd_array @@ -34,7 +36,7 @@ from pandas.core.indexes.api import Index if TYPE_CHECKING: - from pandas import DataFrame # noqa:F401 + from pandas import DataFrame, Series # "null slice" _NS = slice(None, None) @@ -59,7 +61,7 @@ class _IndexSlice: >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) >>> columns = ['foo', 'bar'] >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), - index=midx, columns=columns) + ... index=midx, columns=columns) Using the default slice command: @@ -255,15 +257,20 @@ def loc(self) -> "_LocIndexer": - A boolean array of the same length as the axis being sliced, e.g. ``[True, False, True]``. + - An alignable boolean Series. The index of the key will be aligned before + masking. + - An alignable Index. The Index of the returned selection will be the input. - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) - See more at :ref:`Selection by Label ` + See more at :ref:`Selection by Label `. Raises ------ KeyError If any items are not found. + IndexingError + If an indexed key is passed and its index is unalignable to the frame index. See Also -------- @@ -319,6 +326,21 @@ def loc(self) -> "_LocIndexer": max_speed shield sidewinder 7 8 + Alignable boolean Series: + + >>> df.loc[pd.Series([False, True, False], + ... index=['viper', 'sidewinder', 'cobra'])] + max_speed shield + sidewinder 7 8 + + Index (same behavior as ``df.reindex``) + + >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] + max_speed shield + foo + cobra 1 2 + viper 4 5 + Conditional that returns a boolean Series >>> df.loc[df['shield'] > 6] @@ -572,7 +594,7 @@ def iat(self) -> "_iAtIndexer": return _iAtIndexer("iat", self) -class _LocationIndexer(_NDFrameIndexerBase): +class _LocationIndexer(NDFrameIndexerBase): _valid_types: str axis = None @@ -598,17 +620,13 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) if isinstance(ax, ABCMultiIndex) and self.name != "iloc": - try: - return ax.get_loc(key) - except (TypeError, KeyError, InvalidIndexError): + with suppress(TypeError, KeyError, InvalidIndexError): # TypeError e.g. 
passed a bool - pass + return ax.get_loc(key) if isinstance(key, tuple): - try: + with suppress(IndexingError): return self._convert_tuple(key, is_setter=True) - except IndexingError: - pass if isinstance(key, range): return list(key) @@ -624,7 +642,7 @@ def _get_setitem_indexer(self, key): raise raise IndexingError(key) from e - def _ensure_listlike_indexer(self, key, axis=None): + def _ensure_listlike_indexer(self, key, axis=None, value=None): """ Ensure that a list-like of column labels are all present by adding them if they do not already exist. @@ -641,9 +659,9 @@ def _ensure_listlike_indexer(self, key, axis=None): if self.ndim != 2: return - if isinstance(key, tuple): + if isinstance(key, tuple) and not isinstance(self.obj.index, ABCMultiIndex): # key may be a tuple if we are .loc - # in that case, set key to the column part of key + # if index is not a MultiIndex, set key to column part key = key[column_axis] axis = column_axis @@ -654,9 +672,12 @@ def _ensure_listlike_indexer(self, key, axis=None): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - for k in key: - if k not in self.obj: - self.obj[k] = np.nan + # GH#38148 + keys = self.obj.columns.union(key, sort=False) + + self.obj._mgr = self.obj._mgr.reindex_axis( + keys, axis=0, copy=False, consolidate=False, only_slice=True + ) def __setitem__(self, key, value): if isinstance(key, tuple): @@ -667,7 +688,7 @@ def __setitem__(self, key, value): self._has_valid_setitem_indexer(key) iloc = self if self.name == "iloc" else self.obj.iloc - iloc._setitem_with_indexer(indexer, value) + iloc._setitem_with_indexer(indexer, value, self.name) def _validate_key(self, key, axis: int): """ @@ -695,9 +716,8 @@ def _has_valid_tuple(self, key: Tuple): """ Check the key for valid keys across my indexer. """ + self._validate_key_length(key) for i, k in enumerate(key): - if i >= self.ndim: - raise IndexingError("Too many indexers") try: self._validate_key(k, i) except ValueError as err: @@ -728,13 +748,17 @@ def _convert_tuple(self, key, is_setter: bool = False): else: keyidx.append(slice(None)) else: + self._validate_key_length(key) for i, k in enumerate(key): - if i >= self.ndim: - raise IndexingError("Too many indexers") idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) keyidx.append(idx) + return tuple(keyidx) + def _validate_key_length(self, key: Sequence[Any]) -> None: + if len(key) > self.ndim: + raise IndexingError("Too many indexers") + def _getitem_tuple_same_dim(self, tup: Tuple): """ Index with indexers that should return an object of the same dimension @@ -770,14 +794,10 @@ def _getitem_lowerdim(self, tup: Tuple): # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": - try: - result = self._handle_lowerdim_multi_index_axis0(tup) - return result - except IndexingError: - pass + with suppress(IndexingError): + return self._handle_lowerdim_multi_index_axis0(tup) - if len(tup) > self.ndim: - raise IndexingError("Too many indexers. 
handle elsewhere") + self._validate_key_length(tup) for i, key in enumerate(tup): if is_label_like(key): @@ -822,11 +842,8 @@ def _getitem_nested_tuple(self, tup: Tuple): if self.name != "loc": # This should never be reached, but lets be explicit about it raise ValueError("Too many indices") - try: - result = self._handle_lowerdim_multi_index_axis0(tup) - return result - except IndexingError: - pass + with suppress(IndexingError): + return self._handle_lowerdim_multi_index_axis0(tup) # this is a series with a multi-index specified a tuple of # selectors @@ -865,11 +882,9 @@ def __getitem__(self, key): if type(key) is tuple: key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - try: - return self.obj._get_value(*key, takeable=self._takeable) - except (KeyError, IndexError, AttributeError): + with suppress(KeyError, IndexError, AttributeError): # AttributeError for IntervalTree get_value - pass + return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: # we by definition only have the 0th axis @@ -1010,7 +1025,7 @@ def _multi_take(self, tup: Tuple): def _getitem_iterable(self, key, axis: int): """ - Index current object with an an iterable collection of keys. + Index current object with an iterable collection of keys. Parameters ---------- @@ -1040,10 +1055,8 @@ def _getitem_iterable(self, key, axis: int): ) def _getitem_tuple(self, tup: Tuple): - try: + with suppress(IndexingError): return self._getitem_lowerdim(tup) - except IndexingError: - pass # no multi-index, so validate all of the indexers self._has_valid_tuple(tup) @@ -1064,13 +1077,13 @@ def _handle_lowerdim_multi_index_axis0(self, tup: Tuple): try: # fast path for series or for tup devoid of slices return self._get_label(tup, axis=axis) - except TypeError: + except (TypeError, InvalidIndexError): # slices are unhashable pass except KeyError as ek: # raise KeyError if number of indexers match # else IndexingError will be raised - if len(tup) <= self.obj.index.nlevels and len(tup) > self.ndim: + if self.ndim < len(tup) <= self.obj.index.nlevels: raise ek raise IndexingError("No label returned") @@ -1240,12 +1253,10 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): indexer, keyarr = ax._convert_listlike_indexer(key) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - self._validate_read_indexer( - keyarr, indexer, axis, raise_missing=raise_missing - ) + # _validate_read_indexer is a no-op if no -1s, so skip return ax[indexer], indexer - if ax.is_unique and not getattr(ax, "is_overlapping", False): + if ax._index_as_unique: indexer = ax.get_indexer_for(keyarr) keyarr = ax.reindex(keyarr)[0] else: @@ -1283,8 +1294,6 @@ def _validate_read_indexer( If at least one key was requested but none was found, and raise_missing=True. """ - ax = self.obj._get_axis(axis) - if len(key) == 0: return @@ -1297,27 +1306,23 @@ def _validate_read_indexer( axis_name = self.obj._get_axis_name(axis) raise KeyError(f"None of [{key}] are in the [{axis_name}]") + ax = self.obj._get_axis(axis) + # We (temporarily) allow for some missing keys with .loc, except in # some cases (e.g. 
setting) in which "raise_missing" will be False if raise_missing: not_found = list(set(key) - set(ax)) raise KeyError(f"{not_found} not in index") - # we skip the warning on Categorical - # as this check is actually done (check for - # non-missing values), but a bit later in the - # code, so we want to avoid warning & then - # just raising - if not ax.is_categorical(): - not_found = key[missing_mask] - - with option_context("display.max_seq_items", 10, "display.width", 80): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported. " - f"The following labels were missing: {not_found}. " - "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + not_found = key[missing_mask] + + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. " + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) @@ -1379,21 +1384,22 @@ def _has_valid_setitem_indexer(self, indexer) -> bool: """ if isinstance(indexer, dict): raise IndexError("iloc cannot enlarge its target object") - else: - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) - for ax, i in zip(self.obj.axes, indexer): - if isinstance(i, slice): - # should check the stop slice? - pass - elif is_list_like_indexer(i): - # should check the elements? - pass - elif is_integer(i): - if i >= len(ax): - raise IndexError("iloc cannot enlarge its target object") - elif isinstance(i, dict): + + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? + pass + elif is_list_like_indexer(i): + # should check the elements? + pass + elif is_integer(i): + if i >= len(ax): raise IndexError("iloc cannot enlarge its target object") + elif isinstance(i, dict): + raise IndexError("iloc cannot enlarge its target object") return True @@ -1441,10 +1447,8 @@ def _validate_integer(self, key: int, axis: int) -> None: def _getitem_tuple(self, tup: Tuple): self._has_valid_tuple(tup) - try: + with suppress(IndexingError): return self._getitem_lowerdim(tup) - except IndexingError: - pass return self._getitem_tuple_same_dim(tup) @@ -1520,7 +1524,7 @@ def _get_setitem_indexer(self, key): # ------------------------------------------------------------------- - def _setitem_with_indexer(self, indexer, value): + def _setitem_with_indexer(self, indexer, value, name="iloc"): """ _setitem_with_indexer is for setting values on a Series/DataFrame using positional indexers. @@ -1532,21 +1536,18 @@ def _setitem_with_indexer(self, indexer, value): since it goes from positional indexers back to labels when calling BlockManager methods, see GH#12991, GH#22046, GH#15686. 
""" - - # also has the side effect of consolidating in-place - from pandas import Series - info_axis = self.obj._info_axis_number # maybe partial set - take_split_path = self.obj._is_mixed_type + take_split_path = not self.obj._mgr.is_single_block # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._mgr.blocks: - (blk,) = self.obj._mgr.blocks - if 1 < blk.ndim: # in case of dict, keys are indices + if self.ndim > 1: + # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value + blk = self.obj._mgr.blocks[0] take_split_path = not blk._can_hold_element(val) # if we have any multi-indexes that have non-trivial slices @@ -1580,10 +1581,7 @@ def _setitem_with_indexer(self, indexer, value): # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a # list-like - len_non_info_axes = ( - len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i - ) - if any(not l for l in len_non_info_axes): + if not len(self.obj): if not is_list_like_indexer(value): raise ValueError( "cannot set a frame with no " @@ -1593,12 +1591,16 @@ def _setitem_with_indexer(self, indexer, value): return # add a new item with the dtype setup - self.obj[key] = _infer_fill_value(value) + if com.is_null_slice(indexer[0]): + # We are setting an entire column + self.obj[key] = value + else: + self.obj[key] = infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes ) - self._setitem_with_indexer(new_indexer, value) + self._setitem_with_indexer(new_indexer, value, name) return @@ -1626,181 +1628,237 @@ def _setitem_with_indexer(self, indexer, value): self._setitem_with_indexer_missing(indexer, value) return - # set - item_labels = self.obj._get_axis(info_axis) - # align and set the values if take_split_path: - # Above we only set take_split_path to True for 2D cases - assert self.ndim == 2 - assert info_axis == 1 + # We have to operate column-wise + self._setitem_with_indexer_split_path(indexer, value, name) + else: + self._setitem_single_block(indexer, value, name) - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) + def _setitem_with_indexer_split_path(self, indexer, value, name: str): + """ + Setitem column-wise. + """ + # Above we only set take_split_path to True for 2D cases + assert self.ndim == 2 - if isinstance(value, ABCSeries): - value = self._align_series(indexer, value) - - info_idx = indexer[info_axis] - if is_integer(info_idx): - info_idx = [info_idx] - labels = item_labels[info_idx] - - # Ensure we have something we can iterate over - ilocs = info_idx - if isinstance(info_idx, slice): - ri = Index(range(len(self.obj.columns))) - ilocs = ri[info_idx] - - plane_indexer = indexer[:1] - lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) - # lplane_indexer gives the expected length of obj[indexer[0]] - - if len(labels) == 1: - # We can operate on a single column - - # require that we are setting the right number of values that - # we are indexing - if is_list_like_indexer(value) and 0 != lplane_indexer != len(value): - # Exclude zero-len for e.g. 
boolean masking that is all-false - raise ValueError( - "cannot set using a multi-index " - "selection indexer with a different " - "length than the value" - ) + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + if len(indexer) > self.ndim: + raise IndexError("too many indices for array") + if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: + raise ValueError(r"Cannot set values with ndim > 2") - pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer + if isinstance(value, ABCSeries) and name != "iloc": + value = self._align_series(indexer, value) - def isetter(loc, v): - # positional setting on column loc - ser = self.obj._ixs(loc, axis=1) + # Ensure we have something we can iterate over + info_axis = indexer[1] + ilocs = self._ensure_iterable_column_indexer(info_axis) - # perform the equivalent of a setitem on the info axis - # as we have a null slice or a slice with full bounds - # which means essentially reassign to the columns of a - # multi-dim object - # GH6149 (null slice), GH10408 (full bounds) - if isinstance(pi, tuple) and all( - com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) - for idx in pi - ): - ser = v - else: - # set the item, possibly having a dtype change - ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=pi, value=v) - ser._maybe_update_cacher(clear=True) - - # reset the sliced object if unique - self.obj._iset_item(loc, ser) - - # we need an iterable, with a ndim of at least 1 - # eg. don't pass through np.array(0) - if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: - - # we have an equal len Frame - if isinstance(value, ABCDataFrame): - sub_indexer = list(indexer) - multiindex_indexer = isinstance(labels, ABCMultiIndex) - # TODO: we are implicitly assuming value.columns is unique - - for loc in ilocs: - item = item_labels[loc] - if item in value: - sub_indexer[info_axis] = item - v = self._align_series( - tuple(sub_indexer), value[item], multiindex_indexer - ) - else: - v = np.nan - - isetter(loc, v) - - # we have an equal len ndarray/convertible to our labels - # hasattr first, to avoid coercing to ndarray without reason. - # But we may be relying on the ndarray coercion to check ndim. - # Why not just convert to an ndarray earlier on if needed? - elif np.ndim(value) == 2: - - # note that this coerces the dtype if we are mixed - # GH 7551 - value = np.array(value, dtype=object) - if len(ilocs) != value.shape[1]: - raise ValueError( - "Must have equal len keys and value " - "when setting with an ndarray" - ) + pi = indexer[0] + lplane_indexer = length_of_indexer(pi, self.obj.index) + # lplane_indexer gives the expected length of obj[indexer[0]] - for i, loc in enumerate(ilocs): - # setting with a list, re-coerces - isetter(loc, value[:, i].tolist()) + # we need an iterable, with a ndim of at least 1 + # eg. 
don't pass through np.array(0) + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: - elif ( - len(labels) == 1 - and lplane_indexer == len(value) - and not is_scalar(plane_indexer[0]) - ): - # we have an equal len list/ndarray - # We only get here with len(labels) == len(ilocs) == 1 - isetter(ilocs[0], value) + if isinstance(value, ABCDataFrame): + self._setitem_with_indexer_frame_value(indexer, value, name) - elif lplane_indexer == 0 and len(value) == len(self.obj.index): - # We get here in one case via .loc with a all-False mask - pass + elif np.ndim(value) == 2: + self._setitem_with_indexer_2d_value(indexer, value) - else: - # per-label values - if len(ilocs) != len(value): - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) + elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): + # We are setting multiple rows in a single column. + self._setitem_single_column(ilocs[0], value, pi) - for loc, v in zip(ilocs, value): - isetter(loc, v) - else: + elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): + # We are trying to set N values into M entries of a single + # column, which is invalid for N != M + # Exclude zero-len for e.g. boolean masking that is all-false + + if len(value) == 1 and not is_integer(info_axis): + # This is a case like df.iloc[:3, [1]] = [0] + # where we treat as df.iloc[:3, 1] = 0 + return self._setitem_with_indexer((pi, info_axis[0]), value[0]) + + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) - # scalar value - for loc in ilocs: - isetter(loc, value) + elif lplane_indexer == 0 and len(value) == len(self.obj.index): + # We get here in one case via .loc with a all-False mask + pass + + elif len(ilocs) == len(value): + # We are setting multiple columns in a single row. + for loc, v in zip(ilocs, value): + self._setitem_single_column(loc, v, pi) + + elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: + # This is a setitem-with-expansion, see + # test_loc_setitem_empty_append_expands_rows_mixed_dtype + # e.g. 
df = DataFrame(columns=["x", "y"]) + # df["x"] = df["x"].astype(np.int64) + # df.loc[:, "x"] = [1, 2, 3] + self._setitem_single_column(ilocs[0], value, pi) + + else: + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) else: - if isinstance(indexer, tuple): - # if we are setting on the info axis ONLY - # set using those methods to avoid block-splitting - # logic here - if ( - len(indexer) > info_axis - and is_integer(indexer[info_axis]) - and all( - com.is_null_slice(idx) - for i, idx in enumerate(indexer) - if i != info_axis + # scalar value + for loc in ilocs: + self._setitem_single_column(loc, value, pi) + + def _setitem_with_indexer_2d_value(self, indexer, value): + # We get here with np.ndim(value) == 2, excluding DataFrame, + # which goes through _setitem_with_indexer_frame_value + pi = indexer[0] + + ilocs = self._ensure_iterable_column_indexer(indexer[1]) + + # GH#7551 Note that this coerces the dtype if we are mixed + value = np.array(value, dtype=object) + if len(ilocs) != value.shape[1]: + raise ValueError( + "Must have equal len keys and value when setting with an ndarray" + ) + + for i, loc in enumerate(ilocs): + # setting with a list, re-coerces + self._setitem_single_column(loc, value[:, i].tolist(), pi) + + def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: str): + ilocs = self._ensure_iterable_column_indexer(indexer[1]) + + sub_indexer = list(indexer) + pi = indexer[0] + + multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) + + unique_cols = value.columns.is_unique + + # We do not want to align the value in case of iloc GH#37728 + if name == "iloc": + for i, loc in enumerate(ilocs): + val = value.iloc[:, i] + self._setitem_single_column(loc, val, pi) + + elif not unique_cols and value.columns.equals(self.obj.columns): + # We assume we are already aligned, see + # test_iloc_setitem_frame_duplicate_columns_multiple_blocks + for loc in ilocs: + item = self.obj.columns[loc] + if item in value: + sub_indexer[1] = item + val = self._align_series( + tuple(sub_indexer), + value.iloc[:, loc], + multiindex_indexer, ) - and item_labels.is_unique - ): - self.obj[item_labels[indexer[info_axis]]] = value - return + else: + val = np.nan - indexer = maybe_convert_ix(*indexer) + self._setitem_single_column(loc, val, pi) - if isinstance(value, (ABCSeries, dict)): - # TODO(EA): ExtensionBlock.setitem this causes issues with - # setting for extensionarrays that store dicts. Need to decide - # if it's worth supporting that. - value = self._align_series(indexer, Series(value)) + elif not unique_cols: + raise ValueError("Setting with non-unique columns is not allowed.") - elif isinstance(value, ABCDataFrame): - value = self._align_frame(indexer, value) + else: + for loc in ilocs: + item = self.obj.columns[loc] + if item in value: + sub_indexer[1] = item + val = self._align_series( + tuple(sub_indexer), value[item], multiindex_indexer + ) + else: + val = np.nan - # check for chained assignment - self.obj._check_is_chained_assignment_possible() + self._setitem_single_column(loc, val, pi) - # actually do the set - self.obj._consolidate_inplace() - self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) - self.obj._maybe_update_cacher(clear=True) + def _setitem_single_column(self, loc: int, value, plane_indexer): + """ + + Parameters + ---------- + loc : int + Indexer for column position + plane_indexer : int, slice, listlike[int] + The indexer we use for setitem along axis=0. 
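The branch structure above keys off how many columns are selected (len(ilocs)) and how long the row selection is (lplane_indexer). At the user level, the three main shapes look like this (plain public API, shown only to illustrate the length rules the split path enforces):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.zeros((3, 2)), columns=["a", "b"])

# single column, multiple rows: len(value) must match the row selection
df.iloc[:3, 0] = [1, 2, 3]

# single row, multiple columns: len(value) must match len(ilocs)
df.iloc[0, [0, 1]] = [10, 20]

# 2D value: value.shape[1] must match the number of selected columns
df.iloc[:, [0, 1]] = np.arange(6).reshape(3, 2)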
+ """ + pi = plane_indexer + + ser = self.obj._ixs(loc, axis=1) + + # perform the equivalent of a setitem on the info axis + # as we have a null slice or a slice with full bounds + # which means essentially reassign to the columns of a + # multi-dim object + # GH#6149 (null slice), GH#10408 (full bounds) + if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): + ser = value + else: + # set the item, possibly having a dtype change + ser = ser.copy() + ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) + ser._maybe_update_cacher(clear=True) + + # reset the sliced object if unique + self.obj._iset_item(loc, ser) + + def _setitem_single_block(self, indexer, value, name: str): + """ + _setitem_with_indexer for the case when we have a single Block. + """ + from pandas import Series + + info_axis = self.obj._info_axis_number + item_labels = self.obj._get_axis(info_axis) + + if isinstance(indexer, tuple): + + # if we are setting on the info axis ONLY + # set using those methods to avoid block-splitting + # logic here + if ( + len(indexer) > info_axis + and is_integer(indexer[info_axis]) + and all( + com.is_null_slice(idx) + for i, idx in enumerate(indexer) + if i != info_axis + ) + and item_labels.is_unique + ): + self.obj[item_labels[indexer[info_axis]]] = value + return + + indexer = maybe_convert_ix(*indexer) + if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. + value = self._align_series(indexer, Series(value)) + + elif isinstance(value, ABCDataFrame) and name != "iloc": + value = self._align_frame(indexer, value) + + # check for chained assignment + self.obj._check_is_chained_assignment_possible() + + # actually do the set + self.obj._consolidate_inplace() + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True) def _setitem_with_indexer_missing(self, indexer, value): """ @@ -1823,7 +1881,8 @@ def _setitem_with_indexer_missing(self, indexer, value): if index.is_unique: new_indexer = index.get_indexer([new_index[-1]]) if (new_indexer != -1).any(): - return self._setitem_with_indexer(new_indexer, value) + # We get only here with loc, so can hard code + return self._setitem_with_indexer(new_indexer, value, "loc") # this preserves dtype of the value new_values = Series([value])._values @@ -1862,7 +1921,21 @@ def _setitem_with_indexer_missing(self, indexer, value): self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) - def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = False): + def _ensure_iterable_column_indexer(self, column_indexer): + """ + Ensure that our column indexer is something that can be iterated over. 
+ """ + # Ensure we have something we can iterate over + if is_integer(column_indexer): + ilocs = [column_indexer] + elif isinstance(column_indexer, slice): + ri = Index(range(len(self.obj.columns))) + ilocs = ri[column_indexer] + else: + ilocs = column_indexer + return ilocs + + def _align_series(self, indexer, ser: "Series", multiindex_indexer: bool = False): """ Parameters ---------- @@ -1880,7 +1953,7 @@ def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = Fals to the locations selected by `indexer` """ if isinstance(indexer, (slice, np.ndarray, list, Index)): - indexer = tuple([indexer]) + indexer = (indexer,) if isinstance(indexer, tuple): @@ -1953,7 +2026,7 @@ def ravel(i): raise ValueError("Incompatible indexer with Series") - def _align_frame(self, indexer, df: ABCDataFrame): + def _align_frame(self, indexer, df: "DataFrame"): is_frame = self.ndim == 2 if isinstance(indexer, tuple): @@ -2006,7 +2079,7 @@ def _align_frame(self, indexer, df: ABCDataFrame): raise ValueError("Incompatible indexer with DataFrame") -class _ScalarAccessIndexer(_NDFrameIndexerBase): +class _ScalarAccessIndexer(NDFrameIndexerBase): """ Access scalars quickly. """ @@ -2019,7 +2092,7 @@ def __getitem__(self, key): # we could have a convertible item here (e.g. Timestamp) if not is_list_like_indexer(key): - key = tuple([key]) + key = (key,) else: raise ValueError("Invalid call for scalar access (getting)!") @@ -2143,7 +2216,16 @@ def convert_to_index_sliceable(obj: "DataFrame", key): # slice here via partial string indexing if idx._supports_partial_string_indexing: try: - return idx._get_string_slice(key) + res = idx._get_string_slice(key) + warnings.warn( + "Indexing a DataFrame with a datetimelike index using a single " + "string to slice the rows, like `frame[string]`, is deprecated " + "and will be removed in a future version. Use `frame.loc[string]` " + "instead.", + FutureWarning, + stacklevel=3, + ) + return res except (KeyError, ValueError, NotImplementedError): return None @@ -2229,15 +2311,10 @@ def maybe_convert_ix(*args): """ We likely want to take the cross-product. """ - ixify = True for arg in args: if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)): - ixify = False - - if ixify: - return np.ix_(*args) - else: - return args + return args + return np.ix_(*args) def is_nested_tuple(tup, labels) -> bool: @@ -2280,7 +2357,7 @@ def need_slice(obj) -> bool: ) -def _non_reducing_slice(slice_): +def non_reducing_slice(slice_): """ Ensure that a slice doesn't reduce to a Series or Scalar. @@ -2319,7 +2396,7 @@ def pred(part) -> bool: return tuple(slice_) -def _maybe_numeric_slice(df, slice_, include_bool=False): +def maybe_numeric_slice(df, slice_, include_bool: bool = False): """ Want nice defaults for background_gradient that don't break with non-numeric data. But if slice_ is passed go with that. 
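maybe_convert_ix, simplified above to an early return, relies on np.ix_ turning a pair of 1-D indexers into a cross product rather than element-wise pairs. A quick standalone demonstration:

import numpy as np

arr = np.arange(16).reshape(4, 4)

# Element-wise fancy indexing: only the points (0, 1) and (2, 3).
print(arr[[0, 2], [1, 3]])          # [ 1 11]

# Cross product via np.ix_: every (row, col) in {0, 2} x {1, 3}.
print(arr[np.ix_([0, 2], [1, 3])])  # [[ 1  3]
                                    #  [ 9 11]]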
diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index e12e0d7760ea7..fbccac1c2af67 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -10,8 +10,8 @@ IntBlock, ObjectBlock, TimeDeltaBlock, - _safe_reshape, make_block, + safe_reshape, ) from pandas.core.internals.concat import concatenate_block_managers from pandas.core.internals.managers import ( @@ -33,7 +33,7 @@ "IntBlock", "ObjectBlock", "TimeDeltaBlock", - "_safe_reshape", + "safe_reshape", "make_block", "BlockManager", "SingleBlockManager", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6a4b3318d3aa7..fe07823a80783 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,17 +1,16 @@ from datetime import datetime, timedelta import inspect import re -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, Any, List, Optional, Type, Union, cast import warnings import numpy as np -from pandas._libs import NaT, algos as libalgos, lib, writers -import pandas._libs.internals as libinternals +from pandas._libs import NaT, algos as libalgos, internals as libinternals, lib, writers from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Scalar, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -20,6 +19,7 @@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, + maybe_box_datetimelike, maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, @@ -32,10 +32,12 @@ TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, + is_float, is_float_dtype, is_integer, is_integer_dtype, @@ -56,9 +58,10 @@ ABCPandasArray, ABCSeries, ) -from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat import pandas.core.algorithms as algos +from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -91,6 +94,8 @@ class Block(PandasObject): Index-ignorant; let the container take care of that """ + values: Union[np.ndarray, ExtensionArray] + __slots__ = ["_mgr_locs", "values", "ndim"] is_numeric = False is_float = False @@ -101,11 +106,9 @@ class Block(PandasObject): is_timedelta = False is_bool = False is_object = False - is_categorical = False is_extension = False _can_hold_na = False _can_consolidate = True - _verify_integrity = True _validate_ndim = True @classmethod @@ -121,10 +124,19 @@ def _simple_new( obj._mgr_locs = placement return obj - def __init__(self, values, placement, ndim=None): + def __init__(self, values, placement, ndim: int): + """ + Parameters + ---------- + values : np.ndarray or ExtensionArray + placement : BlockPlacement (or castable) + ndim : int + 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame + """ + # TODO(EA2D): ndim will be unnecessary with 2D EAs self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement - self.values = values + self.values = self._maybe_coerce_values(values) if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise 
ValueError( @@ -132,6 +144,20 @@ def __init__(self, values, placement, ndim=None): f"placement implies {len(self.mgr_locs)}" ) + def _maybe_coerce_values(self, values): + """ + Ensure we have correctly-typed values. + + Parameters + ---------- + values : np.ndarray, ExtensionArray, Index + + Returns + ------- + np.ndarray or ExtensionArray + """ + return values + def _check_ndim(self, values, ndim): """ ndim inference and validation. @@ -175,12 +201,18 @@ def _holder(self): @property def _consolidate_key(self): - return (self._can_consolidate, self.dtype.name) + return self._can_consolidate, self.dtype.name @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ - return self.values.base is not None + values = self.values + values = cast(np.ndarray, values) + return values.base is not None + + @property + def is_categorical(self) -> bool: + return self._holder is Categorical @property def is_datelike(self) -> bool: @@ -319,7 +351,7 @@ def dtype(self): def iget(self, i): return self.values[i] - def set(self, locs, values): + def set_inplace(self, locs, values): """ Modify block values in-place with new item value. @@ -347,6 +379,27 @@ def apply(self, func, **kwargs) -> List["Block"]: return self._split_op_result(result) + def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: + # We will apply the function and reshape the result into a single-row + # Block with the same mgr_locs; squeezing will be done at a higher level + assert self.ndim == 2 + + try: + result = func(self.values) + except (TypeError, NotImplementedError): + if ignore_failures: + return [] + raise + + if np.ndim(result) == 0: + # TODO(EA2D): special case not needed with 2D EAs + res_values = np.array([[result]]) + else: + res_values = result.reshape(-1, 1) + + nb = self.make_block(res_values) + return [nb] + def _split_op_result(self, result) -> List["Block"]: # See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: @@ -374,8 +427,9 @@ def fillna( inplace = validate_bool_kwarg(inplace, "inplace") mask = isna(self.values) + mask = _extract_bool_array(mask) if limit is not None: - limit = libalgos._validate_limit(None, limit=limit) + limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: @@ -385,9 +439,10 @@ def fillna( return [self.copy()] if self._can_hold_element(value): - # equivalent: _try_coerce_args(value) would not raise - blocks = self.putmask(mask, value, inplace=inplace) - return self._maybe_downcast(blocks, downcast) + nb = self if inplace else self.copy() + nb._putmask_simple(mask, value) + # TODO: should be nb._maybe_downcast? + return self._maybe_downcast([nb], downcast) # we can't process the value, but nothing to do if not mask.any(): @@ -405,7 +460,23 @@ def f(mask, val, idx): return self.split_and_operate(None, f, inplace) - def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: + def _split(self) -> List["Block"]: + """ + Split a block into a list of single-column blocks. + """ + assert self.ndim == 2 + + new_blocks = [] + for i, ref_loc in enumerate(self.mgr_locs): + vals = self.values[slice(i, i + 1)] + + nb = self.make_block(vals, [ref_loc]) + new_blocks.append(nb) + return new_blocks + + def split_and_operate( + self, mask, f, inplace: bool, ignore_failures: bool = False + ) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. 
Handle @@ -415,7 +486,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) - inplace : boolean + inplace : bool + ignore_failures : bool, default False Returns ------- @@ -454,8 +526,16 @@ def make_a_block(nv, ref_loc): v = new_values[i] # need a new block - if m.any(): - nv = f(m, v, i) + if m.any() or m.size == 0: + # Apply our function; we may ignore_failures if this is a + # reduction that is dropping nuisance columns GH#37827 + try: + nv = f(m, v, i) + except TypeError: + if ignore_failures: + continue + else: + raise else: nv = v if inplace else v.copy() @@ -468,18 +548,16 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] # no need to downcast our float # unless indicated - if downcast is None and ( - self.is_float or self.is_timedelta or self.is_datetime - ): + if downcast is None and (self.is_float or self.is_datelike): return blocks - return _extend_blocks([b.downcast(downcast) for b in blocks]) + return extend_blocks([b.downcast(downcast) for b in blocks]) - def downcast(self, dtypes=None): + def downcast(self, dtypes=None) -> List["Block"]: """ try to downcast each item to the dict of dtypes if present """ # turn it off completely if dtypes is False: - return self + return [self] values = self.values @@ -490,11 +568,11 @@ def downcast(self, dtypes=None): dtypes = "infer" nv = maybe_downcast_to_dtype(values, dtypes) - return self.make_block(nv) + return [self.make_block(nv)] # ndim > 1 if dtypes is None: - return self + return [self] if not (dtypes == "infer" or isinstance(dtypes, dict)): raise ValueError( @@ -566,14 +644,19 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # force the copy here if self.is_extension: - # TODO: Should we try/except this astype? - values = self.values.astype(dtype) + try: + values = self.values.astype(dtype) + except (ValueError, TypeError): + if errors == "ignore": + values = self.values + else: + raise else: if issubclass(dtype.type, str): # use native type formatting for datetime/tz/timedelta if self.is_datelike: - values = self.to_native_types() + values = self.to_native_types().values # astype formatting else: @@ -600,7 +683,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): if isinstance(values, np.ndarray): values = values.reshape(self.shape) - newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) + newb = self.make_block(values) if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: @@ -617,14 +700,13 @@ def convert( datetime: bool = True, numeric: bool = True, timedelta: bool = True, - coerce: bool = False, - ): + ) -> List["Block"]: """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! 
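The new Block._split above peels a 2-D block into single-column blocks (in the BlockManager layout, axis 0 of a block's values runs over its items, i.e. the DataFrame columns). A toy numpy-only sketch of that decomposition, leaving out placements:

import numpy as np

def split_rows(values: np.ndarray):
    # Mirrors Block._split: one (1, n) slice per item held by the block.
    assert values.ndim == 2
    return [values[i : i + 1] for i in range(values.shape[0])]

blk_values = np.arange(6).reshape(2, 3)  # a block holding two columns
assert [p.shape for p in split_rows(blk_values)] == [(1, 3), (1, 3)]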
""" - return self.copy() if copy else self + return [self.copy()] if copy else [self] def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ @@ -664,7 +746,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): values = np.array(values, dtype="object") values[mask] = na_rep - return values + return self.make_block(values) # block actions # def copy(self, deep: bool = True): @@ -680,8 +762,7 @@ def replace( value, inplace: bool = False, regex: bool = False, - convert: bool = True, - ): + ) -> List["Block"]: """ replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. @@ -690,43 +771,12 @@ def replace( inplace = validate_bool_kwarg(inplace, "inplace") original_to_replace = to_replace - # If we cannot replace with own dtype, convert to ObjectBlock and - # retry if not self._can_hold_element(to_replace): - if not isinstance(to_replace, list): - if inplace: - return [self] - return [self.copy()] - - to_replace = [x for x in to_replace if self._can_hold_element(x)] - if not len(to_replace): - # GH#28084 avoid costly checks since we can infer - # that there is nothing to replace in this block - if inplace: - return [self] - return [self.copy()] - - if len(to_replace) == 1: - # _can_hold_element checks have reduced this back to the - # scalar case and we can avoid a costly object cast - return self.replace( - to_replace[0], value, inplace=inplace, regex=regex, convert=convert, - ) - - # GH 22083, TypeError or ValueError occurred within error handling - # causes infinite loop. Cast and retry only if not objectblock. - if is_object_dtype(self): - raise AssertionError - - # try again with a compatible block - block = self.astype(object) - return block.replace( - to_replace=to_replace, - value=value, - inplace=inplace, - regex=regex, - convert=convert, - ) + # We cannot hold `to_replace`, so we know immediately that + # replacing it is a no-op. + # Note: If to_replace were a list, NDFrame.replace would call + # replace_list instead of replace. + return [self] if inplace else [self.copy()] values = self.values if lib.is_scalar(to_replace) and isinstance(values, np.ndarray): @@ -736,43 +786,132 @@ def replace( to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype) mask = missing.mask_missing(values, to_replace) + if not mask.any(): + # Note: we get here with test_replace_extension_other incorrectly + # bc _can_hold_element is incorrect. + return [self] if inplace else [self.copy()] - try: - blocks = self.putmask(mask, value, inplace=inplace) - # Note: it is _not_ the case that self._can_hold_element(value) - # is always true at this point. In particular, that can fail - # for: - # "2u" with bool-dtype, float-dtype - # 0.5 with int64-dtype - # np.nan with int64-dtype - except (TypeError, ValueError): - # GH 22083, TypeError or ValueError occurred within error handling - # causes infinite loop. Cast and retry only if not objectblock. - if is_object_dtype(self): - raise - - if not self.is_extension: - # TODO: https://github.com/pandas-dev/pandas/issues/32586 - # Need an ExtensionArray._can_hold_element to indicate whether - # a scalar value can be placed in the array. 
- assert not self._can_hold_element(value), value - - # try again with a compatible block - block = self.astype(object) - return block.replace( + if not self._can_hold_element(value): + blk = self.astype(object) + return blk.replace( to_replace=original_to_replace, value=value, - inplace=inplace, + inplace=True, regex=regex, - convert=convert, ) - if convert: - blocks = [b.convert(numeric=False, copy=not inplace) for b in blocks] + + blk = self if inplace else self.copy() + blk._putmask_simple(mask, value) + blocks = blk.convert(numeric=False, copy=not inplace) return blocks - def _replace_single(self, *args, **kwargs): - """ no-op on a non-ObjectBlock """ - return self if kwargs["inplace"] else self.copy() + def _replace_regex( + self, + to_replace, + value, + inplace: bool = False, + convert: bool = True, + mask=None, + ) -> List["Block"]: + """ + Replace elements by the given value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + List[Block] + """ + if not self._can_hold_element(to_replace): + # i.e. only ObjectBlock, but could in principle include a + # String ExtensionBlock + return [self] if inplace else [self.copy()] + + rx = re.compile(to_replace) + + new_values = self.values if inplace else self.values.copy() + replace_regex(new_values, rx, value, mask) + + block = self.make_block(new_values) + if convert: + nbs = block.convert(numeric=False) + else: + nbs = [block] + return nbs + + def _replace_list( + self, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> List["Block"]: + """ + See BlockManager._replace_list docstring. 
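_replace_regex above compiles the pattern once and hands the element-wise substitution to the replace_regex helper imported from pandas.core.array_algos.replace. A rough sketch of that element-wise pass over an object ndarray (simplified: the real helper also honors the mask argument and has fallbacks for non-string elements):

import re
import numpy as np

def replace_regex_sketch(values: np.ndarray, rx: "re.Pattern", value) -> None:
    # In-place: substitute within every string element that matches rx.
    for idx, v in np.ndenumerate(values):
        if isinstance(v, str) and rx.search(v):
            values[idx] = rx.sub(value, v)

arr = np.array(["foo-1", "bar-2", None], dtype=object)
replace_regex_sketch(arr, re.compile(r"-\d"), "")
assert list(arr) == ["foo", "bar", None]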
+        """
+        # Exclude anything that we know we won't contain
+        pairs = [
+            (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
+        ]
+        if not len(pairs):
+            # shortcut, nothing to replace
+            return [self] if inplace else [self.copy()]
+
+        src_len = len(pairs) - 1
+
+        def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray:
+            """
+            Generate a bool array by performing an equality check, or by
+            performing an element-wise regular expression match.
+            """
+            if isna(s):
+                return ~mask
+
+            s = maybe_box_datetimelike(s)
+            return compare_or_regex_search(self.values, s, regex, mask)
+
+        if self.is_object:
+            # Calculate the mask once, prior to the call of comp
+            # in order to avoid repeating the same computations
+            mask = ~isna(self.values)
+            masks = [comp(s[0], mask, regex) for s in pairs]
+        else:
+            # GH#38086 faster if we know we don't need to check for regex
+            masks = [missing.mask_missing(self.values, s[0]) for s in pairs]
+
+        masks = [_extract_bool_array(x) for x in masks]
+
+        rb = [self if inplace else self.copy()]
+        for i, (src, dest) in enumerate(pairs):
+            new_rb: List["Block"] = []
+            for blk in rb:
+                m = masks[i]
+                convert = i == src_len  # only convert once at the end
+                result = blk._replace_coerce(
+                    to_replace=src,
+                    value=dest,
+                    mask=m,
+                    inplace=inplace,
+                    regex=regex,
+                )
+                if convert and blk.is_object:
+                    result = extend_blocks(
+                        [b.convert(numeric=False, copy=True) for b in result]
+                    )
+                new_rb.extend(result)
+            rb = new_rb
+        return rb
 
     def setitem(self, indexer, value):
         """
@@ -890,8 +1029,37 @@ def setitem(self, indexer, value):
             block = self.make_block(values)
         return block
 
+    def _putmask_simple(self, mask: np.ndarray, value: Any):
+        """
+        Like putmask but
+
+        a) we do not cast on failure
+        b) we do not handle repeating or truncating like numpy.
+
+        Parameters
+        ----------
+        mask : np.ndarray[bool]
+            We assume _extract_bool_array has already been called.
+ value : Any + We assume self._can_hold_element(value) + """ + values = self.values + + if lib.is_scalar(value) and isinstance(values, np.ndarray): + value = convert_scalar_for_putitemlike(value, values.dtype) + + if self.is_extension or (self.is_object and not lib.is_scalar(value)): + # GH#19266 using np.putmask gives unexpected results with listlike value + if is_list_like(value) and len(value) == len(values): + values[mask] = value[mask] + else: + values[mask] = value + else: + # GH#37833 np.putmask is more performant than __setitem__ + np.putmask(values, mask, value) + def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ putmask the data to the block; it is possible that we may create a @@ -1041,39 +1209,15 @@ def coerce_to_target_dtype(self, other): # don't coerce float/complex to int return self - elif ( - self.is_datetime - or is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - ): - - # not a datetime - if not ( - (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) - and self.is_datetime - ): - return self.astype(object) - - # don't upcast timezone with different timezone or no timezone - mytz = getattr(self.dtype, "tz", None) - othertz = getattr(dtype, "tz", None) - - if not tz_compare(mytz, othertz): - return self.astype(object) - - raise AssertionError( - f"possible recursion in coerce_to_target_dtype: {self} {other}" - ) + elif self.is_datetime or is_datetime64_any_dtype(dtype): + # The is_dtype_equal check above ensures that at most one of + # these two conditions hold, so we must cast to object. + return self.astype(object) elif self.is_timedelta or is_timedelta64_dtype(dtype): - - # not a timedelta - if not (is_timedelta64_dtype(dtype) and self.is_timedelta): - return self.astype(object) - - raise AssertionError( - f"possible recursion in coerce_to_target_dtype: {self} {other}" - ) + # The is_dtype_equal check above ensures that at most one of + # these two conditions hold, so we must cast to object. + return self.astype(object) try: return self.astype(dtype) @@ -1097,8 +1241,8 @@ def interpolate( inplace = validate_bool_kwarg(inplace, "inplace") - # Only FloatBlocks will contain NaNs. 
timedelta subclasses IntBlock - if (self.is_bool or self.is_integer) and not self.is_timedelta: + if not self._can_hold_na: + # If there are no NAs, then interpolate is a no-op return self if inplace else self.copy() # a fill na type method @@ -1108,13 +1252,16 @@ def interpolate( m = None if m is not None: + if fill_value is not None: + # similar to validate_fillna_kwargs + raise ValueError("Cannot pass both fill_value and method") + return self._interpolate_with_fill( method=m, axis=axis, inplace=inplace, limit=limit, - fill_value=fill_value, - coerce=coerce, + limit_area=limit_area, downcast=downcast, ) # validate the interp method @@ -1141,34 +1288,22 @@ def _interpolate_with_fill( axis: int = 0, inplace: bool = False, limit: Optional[int] = None, - fill_value: Optional[Any] = None, - coerce: bool = False, + limit_area: Optional[str] = None, downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") - # if we are coercing, then don't force the conversion - # if the block can't hold the type - if coerce: - if not self._can_hold_na: - if inplace: - return [self] - else: - return [self.copy()] + assert self._can_hold_na # checked by caller values = self.values if inplace else self.values.copy() - # We only get here for non-ExtensionBlock - fill_value = convert_scalar_for_putitemlike(fill_value, self.values.dtype) - values = missing.interpolate_2d( values, method=method, axis=axis, limit=limit, - fill_value=fill_value, - dtype=self.dtype, + limit_area=limit_area, ) blocks = [self.make_block_same_class(values, ndim=self.ndim)] @@ -1278,7 +1413,7 @@ def shift(self, periods: int, axis: int = 0, fill_value=None): return [self.make_block(new_values)] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1290,6 +1425,7 @@ def where( errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. 
On error return original object + try_cast: bool, default False axis : int, default 0 Returns @@ -1315,47 +1451,47 @@ def where( if values.ndim - 1 == other.ndim and axis == 1: other = other.reshape(tuple(other.shape + (1,))) elif transpose and values.ndim == self.ndim - 1: + # TODO(EA2D): not neceesssary with 2D EAs cond = cond.T if not hasattr(cond, "shape"): raise ValueError("where must have a condition that is ndarray like") - def where_func(cond, values, other): - - if not ( - (self.is_integer or self.is_bool) - and lib.is_float(other) - and np.isnan(other) - ): - # np.where will cast integer array to floats in this case - if not self._can_hold_element(other): - raise TypeError - if lib.is_scalar(other) and isinstance(values, np.ndarray): - # convert datetime to datetime64, timedelta to timedelta64 - other = convert_scalar_for_putitemlike(other, values.dtype) - - # By the time we get here, we should have all Series/Index - # args extracted to ndarray - fastres = expressions.where(cond, values, other) - return fastres - if cond.ravel("K").all(): result = values else: # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) - try: - result = where_func(cond, values, other) - except TypeError: - + if ( + (self.is_integer or self.is_bool) + and lib.is_float(other) + and np.isnan(other) + ): + # GH#3733 special case to avoid object-dtype casting + # and go through numexpr path instead. + # In integer case, np.where will cast to floats + pass + elif not self._can_hold_element(other): # we cannot coerce, return a compat dtype # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) blocks = block.where( - orig_other, cond, errors=errors, try_cast=try_cast, axis=axis, + orig_other, cond, errors=errors, try_cast=try_cast, axis=axis ) return self._maybe_downcast(blocks, "infer") + if not ( + (self.is_integer or self.is_bool) + and lib.is_float(other) + and np.isnan(other) + ): + # convert datetime to datetime64, timedelta to timedelta64 + other = convert_scalar_for_putitemlike(other, values.dtype) + + # By the time we get here, we should have all Series/Index + # args extracted to ndarray + result = expressions.where(cond, values, other) + if self._can_hold_na or self.ndim == 1: if transpose: @@ -1368,9 +1504,10 @@ def where_func(cond, values, other): cond = cond.swapaxes(axis, 0) mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) - result_blocks = [] + result_blocks: List["Block"] = [] for m in [mask, ~mask]: if m.any(): + result = cast(np.ndarray, result) # EABlock overrides where taken = result.take(m.nonzero()[0], axis=axis) r = maybe_downcast_numeric(taken, self.dtype) nb = self.make_block(r.T, placement=self.mgr_locs[m]) @@ -1405,7 +1542,7 @@ def _unstack(self, unstacker, fill_value, new_placement): new_values = new_values.T[mask] new_placement = new_placement[mask] - blocks = [self.make_block_same_class(new_values, placement=new_placement)] + blocks = [make_block(new_values, placement=new_placement)] return blocks, mask def quantile(self, qs, interpolation="linear", axis: int = 0): @@ -1468,11 +1605,10 @@ def _replace_coerce( self, to_replace, value, + mask: np.ndarray, inplace: bool = True, regex: bool = False, - convert: bool = False, - mask=None, - ): + ) -> List["Block"]: """ Replace value corresponding to the given boolean array with another value. @@ -1483,33 +1619,36 @@ def _replace_coerce( Scalar to replace or regular expression to match. value : object Replacement object. 
+        mask : np.ndarray[bool]
+            True indicates the corresponding element is ignored.
         inplace : bool, default True
             Perform inplace modification.
         regex : bool, default False
             If true, perform regular expression substitution.
-        convert : bool, default True
-            If true, try to coerce any object types to better types.
-        mask : array-like of bool, optional
-            True indicates the corresponding element is ignored.
 
         Returns
         -------
-        A new block if there is anything to replace or the original block.
+        List[Block]
         """
         if mask.any():
             if not regex:
-                self = self.coerce_to_target_dtype(value)
-                return self.putmask(mask, value, inplace=inplace)
+                nb = self.coerce_to_target_dtype(value)
+                if nb is self and not inplace:
+                    nb = nb.copy()
+                nb._putmask_simple(mask, value)
+                return [nb]
             else:
-                return self._replace_single(
-                    to_replace,
-                    value,
-                    inplace=inplace,
-                    regex=regex,
-                    convert=convert,
-                    mask=mask,
-                )
-        return self
+                regex = _should_use_regex(regex, to_replace)
+                if regex:
+                    return self._replace_regex(
+                        to_replace,
+                        value,
+                        inplace=inplace,
+                        convert=False,
+                        mask=mask,
+                    )
+                return self.replace(to_replace, value, inplace=inplace, regex=False)
+        return [self]
 
 
 class ExtensionBlock(Block):
@@ -1525,11 +1664,12 @@ class ExtensionBlock(Block):
     """
 
     _can_consolidate = False
-    _verify_integrity = False
     _validate_ndim = False
     is_extension = True
 
-    def __init__(self, values, placement, ndim=None):
+    values: ExtensionArray
+
+    def __init__(self, values, placement, ndim: int):
         """
         Initialize a non-consolidatable block.
 
@@ -1538,7 +1678,6 @@ def __init__(self, values, placement, ndim=None):
         This will continue to call __init__ for the other base
         classes mixed in with this Mixin.
         """
-        values = self._maybe_coerce_values(values)
 
         # Placement must be converted to BlockPlacement so that we can check
         # its length
@@ -1561,8 +1700,8 @@ def __init__(self, values, placement, ndim=None):
     def shape(self):
         # TODO(EA2D): override unnecessary with 2D EAs
         if self.ndim == 1:
-            return ((len(self.values)),)
-        return (len(self.mgr_locs), len(self.values))
+            return (len(self.values),)
+        return len(self.mgr_locs), len(self.values)
 
     def iget(self, col):
 
@@ -1581,18 +1720,14 @@ def iget(self, col):
             raise IndexError(f"{self} only contains one item")
         return self.values
 
-    def should_store(self, value: ArrayLike) -> bool:
-        """
-        Can we set the given array-like value inplace?
-        """
-        return isinstance(value, self._holder)
-
-    def set(self, locs, values):
+    def set_inplace(self, locs, values):
+        # NB: This is a misnomer; it is supposed to be inplace but is not,
+        # see GH#33457
         assert locs.tolist() == [0]
-        self.values[:] = values
+        self.values = values
 
     def putmask(
-        self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False,
+        self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False
     ) -> List["Block"]:
         """
         See Block.putmask.__doc__
@@ -1606,7 +1741,7 @@ def putmask(
         if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask):
             new = new[mask]
 
-        mask = _safe_reshape(mask, new_values.shape)
+        mask = safe_reshape(mask, new_values.shape)
 
         new_values[mask] = new
         return [self.make_block(values=new_values)]
@@ -1636,10 +1771,7 @@ def _holder(self):
     @property
     def fill_value(self):
         # Used in reindex_indexer
-        if is_sparse(self.values):
-            return self.values.dtype.fill_value
-        else:
-            return self.values.dtype.na_value
+        return self.values.dtype.na_value
 
     @property
     def _can_hold_na(self):
@@ -1678,6 +1810,14 @@ def setitem(self, indexer, value):
         `indexer` is a direct slice/positional indexer.
`value` must be a compatible shape. """ + if not self._can_hold_element(value): + # This is only relevant for DatetimeTZBlock, which has a + # non-trivial `_can_hold_element`. + # https://github.com/pandas-dev/pandas/issues/24020 + # Need a dedicated setitem until GH#24020 (type promotion in setitem + # for extension arrays) is designed and implemented. + return self.astype(object).setitem(indexer, value) + if isinstance(indexer, tuple): # TODO(EA2D): not needed with 2D EAs # we are always 1-D @@ -1705,7 +1845,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): # TODO(EA2D): reshape not needed with 2D EAs # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) + return self.make_block(values) def take_nd( self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default @@ -1806,7 +1946,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: return super().diff(n, axis) def shift( - self, periods: int, axis: int = 0, fill_value: Any = None, + self, periods: int, axis: int = 0, fill_value: Any = None ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. @@ -1823,7 +1963,7 @@ def shift( ] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: cond = _extract_bool_array(cond) @@ -1906,6 +2046,16 @@ class ObjectValuesExtensionBlock(ExtensionBlock): def external_values(self): return self.values.astype(object) + def _can_hold_element(self, element: Any) -> bool: + if is_valid_nat_for_dtype(element, self.dtype): + return True + if isinstance(element, list) and len(element) == 0: + return True + tipo = maybe_infer_dtype_type(element) + if tipo is not None: + return issubclass(tipo.type, self.dtype.type) + return isinstance(element, self.dtype.type) + class NumericBlock(Block): __slots__ = () @@ -1913,11 +2063,7 @@ class NumericBlock(Block): _can_hold_na = True -class FloatOrComplexBlock(NumericBlock): - __slots__ = () - - -class FloatBlock(FloatOrComplexBlock): +class FloatBlock(NumericBlock): __slots__ = () is_float = True @@ -1925,17 +2071,17 @@ def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( - tipo.type, (np.datetime64, np.timedelta64) + tipo.type, np.timedelta64 ) return isinstance( element, (float, int, np.floating, np.int_) ) and not isinstance( element, - (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64), + (bool, np.bool_, np.timedelta64), ) def to_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs, + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs ): """ convert to our native types format """ values = self.values @@ -1952,7 +2098,7 @@ def to_native_types( values = np.array(values, dtype="object") values[mask] = na_rep - return values + return self.make_block(values) from pandas.io.formats.format import FloatArrayFormatter @@ -1964,10 +2110,11 @@ def to_native_types( quoting=quoting, fixed_width=False, ) - return formatter.get_result_as_array() + res = formatter.get_result_as_array() + return self.make_block(res) -class ComplexBlock(FloatOrComplexBlock): +class ComplexBlock(NumericBlock): __slots__ = () is_complex = True @@ -1979,9 +2126,6 @@ def _can_hold_element(self, element: Any) -> bool: element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, 
np.bool_)) - def should_store(self, value: ArrayLike) -> bool: - return issubclass(value.dtype.type, np.complexfloating) - class IntBlock(NumericBlock): __slots__ = () @@ -1993,22 +2137,18 @@ def _can_hold_element(self, element: Any) -> bool: if tipo is not None: return ( issubclass(tipo.type, np.integer) - and not issubclass(tipo.type, (np.datetime64, np.timedelta64)) + and not issubclass(tipo.type, np.timedelta64) and self.dtype.itemsize >= tipo.itemsize ) - return is_integer(element) + # We have not inferred an integer from the dtype + # check if we have a builtin int or a float equal to an int + return is_integer(element) or (is_float(element) and element.is_integer()) -class DatetimeLikeBlockMixin: +class DatetimeLikeBlockMixin(Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" - @property - def _holder(self): - return DatetimeArray - - @property - def fill_value(self): - return np.datetime64("NaT", "ns") + _can_hold_na = True def get_values(self, dtype=None): """ @@ -2031,24 +2171,53 @@ def iget(self, key): # TODO(EA2D): this can be removed if we ever have 2D EA return self.array_values().reshape(self.shape)[key] + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. + + Parameters + ---------- + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. + + Returns + ------- + A list with a new TimeDeltaBlock. + + Notes + ----- + The arguments here are mimicking shift so they are called correctly + by apply. + """ + # TODO(EA2D): reshape not necessary with 2D EAs + values = self.array_values().reshape(self.shape) + + new_values = values - values.shift(n, axis=axis) + return [ + TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer, ndim=self.ndim) + ] + def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs values = self.array_values() new_values = values.shift(periods, fill_value=fill_value, axis=axis) return self.make_block_same_class(new_values) + def to_native_types(self, na_rep="NaT", **kwargs): + """ convert to our native types format """ + arr = self.array_values() -class DatetimeBlock(DatetimeLikeBlockMixin, Block): - __slots__ = () - is_datetime = True + result = arr._format_native_types(na_rep=na_rep, **kwargs) + return self.make_block(result) - def __init__(self, values, placement, ndim=None): - values = self._maybe_coerce_values(values) - super().__init__(values, placement=placement, ndim=ndim) - @property - def _can_hold_na(self): - return True +class DatetimeBlock(DatetimeLikeBlockMixin): + __slots__ = () + is_datetime = True + _holder = DatetimeArray + fill_value = np.datetime64("NaT", "ns") def _maybe_coerce_values(self, values): """ @@ -2088,9 +2257,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): if copy: # this should be the only copy values = values.copy() - if getattr(values, "tz", None) is None: - values = DatetimeArray(values).tz_localize("UTC") - values = values.tz_convert(dtype.tz) + values = DatetimeArray._simple_new(values.view("i8"), dtype=dtype) return self.make_block(values) # delegate @@ -2099,7 +2266,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: - if self.is_datetimetz: + if isinstance(element, list) and len(element) == 0: + # Following DatetimeArray._validate_setitem_value + # convention, we treat this as object-dtype + # (even 
though tipo is float64) + return True + + elif self.is_datetimetz: # require exact match, since non-nano does not exist return is_dtype_equal(tipo, self.dtype) or is_valid_nat_for_dtype( element, self.dtype @@ -2116,16 +2289,7 @@ def _can_hold_element(self, element: Any) -> bool: return is_valid_nat_for_dtype(element, self.dtype) - def to_native_types(self, na_rep="NaT", date_format=None, **kwargs): - """ convert to our native types format """ - dta = self.array_values() - - result = dta._format_native_types( - na_rep=na_rep, date_format=date_format, **kwargs - ) - return np.atleast_2d(result) - - def set(self, locs, values): + def set_inplace(self, locs, values): """ See Block.set.__doc__ """ @@ -2137,20 +2301,23 @@ def set(self, locs, values): class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ + values: DatetimeArray + __slots__ = () is_datetimetz = True is_extension = True internal_values = Block.internal_values + + _holder = DatetimeBlock._holder _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types - fill_value = np.datetime64("NaT", "ns") - should_store = Block.should_store - array_values = ExtensionBlock.array_values + diff = DatetimeBlock.diff + fillna = DatetimeBlock.fillna # i.e. Block.fillna + fill_value = DatetimeBlock.fill_value + _can_hold_na = DatetimeBlock._can_hold_na - @property - def _holder(self): - return DatetimeArray + array_values = ExtensionBlock.array_values def _maybe_coerce_values(self, values): """ @@ -2216,69 +2383,6 @@ def external_values(self): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - def diff(self, n: int, axis: int = 0) -> List["Block"]: - """ - 1st discrete difference. - - Parameters - ---------- - n : int - Number of periods to diff. - axis : int, default 0 - Axis to diff upon. - - Returns - ------- - A list with a new TimeDeltaBlock. - - Notes - ----- - The arguments here are mimicking shift so they are called correctly - by apply. - """ - if axis == 0: - # TODO(EA2D): special case not needed with 2D EAs - # Cannot currently calculate diff across multiple blocks since this - # function is invoked via apply - raise NotImplementedError - - if n == 0: - # Fastpath avoids making a copy in `shift` - new_values = np.zeros(self.values.shape, dtype=np.int64) - else: - new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 - - # Reshape the new_values like how algos.diff does for timedelta data - new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype("timedelta64[ns]") - return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - - def fillna(self, value, limit=None, inplace=False, downcast=None): - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. - if self._can_hold_element(value): - return super().fillna(value, limit, inplace, downcast) - - # different timezones, or a non-tz - return self.astype(object).fillna( - value, limit=limit, inplace=inplace, downcast=downcast - ) - - def setitem(self, indexer, value): - # https://github.com/pandas-dev/pandas/issues/24020 - # Need a dedicated setitem until #24020 (type promotion in setitem - # for extension arrays) is designed and implemented. 
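The dedicated DatetimeTZBlock.setitem deleted here is subsumed by the generic fallback added to ExtensionBlock.setitem above: when _can_hold_element fails, the block upcasts via astype(object) before assigning. A hedged illustration of the behavior at the time of this change:

import pandas as pd

ser = pd.Series(pd.date_range("2020-01-01", periods=3, tz="UTC"))

# A tz-naive timestamp cannot be held by the tz-aware block, so the
# assignment goes through the astype(object) fallback.
ser[0] = pd.Timestamp("2021-01-01")
assert ser.dtype == object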
-        if self._can_hold_element(value) or (
-            isinstance(indexer, np.ndarray) and indexer.size == 0
-        ):
-            return super().setitem(indexer, value)
-
-        obj_vals = self.values.astype(object)
-        newb = make_block(
-            obj_vals, placement=self.mgr_locs, klass=ObjectBlock, ndim=self.ndim
-        )
-        return newb.setitem(indexer, value)
-
     def quantile(self, qs, interpolation="linear", axis=0):
         naive = self.values.view("M8[ns]")
 
@@ -2292,22 +2396,46 @@ def quantile(self, qs, interpolation="linear", axis=0):
         aware = self._holder(res_blk.values.ravel(), dtype=self.dtype)
         return self.make_block_same_class(aware, ndim=res_blk.ndim)
 
+    def _check_ndim(self, values, ndim):
+        """
+        ndim inference and validation.
 
-class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
+        This is overridden by the DatetimeTZBlock to check the case of 2D
+        data (values.ndim == 2), which should only be allowed if ndim is
+        also 2.
+        The case of a 1D array is still allowed with an ndim of either 1 or 2,
+        as is the case for other EAs. Therefore, we are only checking
+        `values.ndim > ndim` instead of `values.ndim != ndim` as for
+        consolidated blocks.
+        """
+        if ndim is None:
+            ndim = values.ndim
+
+        if values.ndim > ndim:
+            raise ValueError(
+                "Wrong number of dimensions. "
+                f"values.ndim != ndim [{values.ndim} != {ndim}]"
+            )
+        return ndim
+
+
+class TimeDeltaBlock(DatetimeLikeBlockMixin):
     __slots__ = ()
     is_timedelta = True
-    _can_hold_na = True
-    is_numeric = False
     fill_value = np.timedelta64("NaT", "ns")
 
-    def __init__(self, values, placement, ndim=None):
+    def _maybe_coerce_values(self, values):
        if values.dtype != TD64NS_DTYPE:
-            # e.g. non-nano or int64
+            # non-nano we will convert to nano
+            if values.dtype.kind != "m":
+                # caller is responsible for ensuring timedelta64 dtype
+                raise TypeError(values.dtype)  # pragma: no cover
+
             values = TimedeltaArray._from_sequence(values)._data
         if isinstance(values, TimedeltaArray):
             values = values._data
         assert isinstance(values, np.ndarray), type(values)
-        super().__init__(values, placement=placement, ndim=ndim)
+        return values
 
     @property
     def _holder(self):
@@ -2324,9 +2452,8 @@ def _can_hold_element(self, element: Any) -> bool:
         return is_valid_nat_for_dtype(element, self.dtype)
 
     def fillna(self, value, **kwargs):
-
-        # allow filling with integers to be
-        # interpreted as nanoseconds
+        # TODO(EA2D): if we operated on array_values, TDA.fillna would handle
+        # raising here.
if is_integer(value): # Deprecation GH#24694, GH#19233 raise TypeError( @@ -2336,11 +2463,6 @@ def fillna(self, value, **kwargs): ) return super().fillna(value, **kwargs) - def to_native_types(self, na_rep="NaT", **kwargs): - """ convert to our native types format """ - tda = self.array_values() - return tda._format_native_types(na_rep, **kwargs) - class BoolBlock(NumericBlock): __slots__ = () @@ -2353,26 +2475,16 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, np.bool_) return isinstance(element, (bool, np.bool_)) - def replace(self, to_replace, value, inplace=False, regex=False, convert=True): - inplace = validate_bool_kwarg(inplace, "inplace") - to_replace_values = np.atleast_1d(to_replace) - if not np.can_cast(to_replace_values, bool): - return self - return super().replace( - to_replace, value, inplace=inplace, regex=regex, convert=convert, - ) - class ObjectBlock(Block): __slots__ = () is_object = True _can_hold_na = True - def __init__(self, values, placement=None, ndim=2): + def _maybe_coerce_values(self, values): if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) - - super().__init__(values, ndim=ndim, placement=placement) + return values @property def is_bool(self): @@ -2382,20 +2494,48 @@ def is_bool(self): """ return lib.is_bool_array(self.values.ravel("K")) + def reduce(self, func, ignore_failures: bool = False) -> List[Block]: + """ + For object-dtype, we operate column-wise. + """ + assert self.ndim == 2 + + values = self.values + if len(values) > 1: + # split_and_operate expects func with signature (mask, values, inplace) + def mask_func(mask, values, inplace): + if values.ndim == 1: + values = values.reshape(1, -1) + return func(values) + + return self.split_and_operate( + None, mask_func, False, ignore_failures=ignore_failures + ) + + try: + res = func(values) + except TypeError: + if not ignore_failures: + raise + return [] + + assert isinstance(res, np.ndarray) + assert res.ndim == 1 + res = res.reshape(1, -1) + return [self.make_block_same_class(res)] + def convert( self, copy: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, - coerce: bool = False, - ): + ) -> List["Block"]: """ - attempt to coerce any object types to better types return a copy of + attempt to cast any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! - - can return multiple blocks! 
""" + # operate column-by-column def f(mask, val, idx): shape = val.shape @@ -2404,7 +2544,6 @@ def f(mask, val, idx): datetime=datetime, numeric=numeric, timedelta=timedelta, - coerce=coerce, copy=copy, ) if isinstance(values, np.ndarray): @@ -2417,7 +2556,7 @@ def f(mask, val, idx): blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) - blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)] + blocks = [self.make_block(values)] return blocks @@ -2427,209 +2566,59 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] return blocks # split and convert the blocks - return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) + return extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) def _can_hold_element(self, element: Any) -> bool: return True - def replace(self, to_replace, value, inplace=False, regex=False, convert=True): - to_rep_is_list = is_list_like(to_replace) - value_is_list = is_list_like(value) - both_lists = to_rep_is_list and value_is_list - either_list = to_rep_is_list or value_is_list - - result_blocks = [] - blocks = [self] - - if not either_list and is_re(to_replace): - return self._replace_single( - to_replace, value, inplace=inplace, regex=True, convert=convert, - ) - elif not (either_list or regex): - return super().replace( - to_replace, value, inplace=inplace, regex=regex, convert=convert, - ) - elif both_lists: - for to_rep, v in zip(to_replace, value): - result_blocks = [] - for b in blocks: - result = b._replace_single( - to_rep, v, inplace=inplace, regex=regex, convert=convert, - ) - result_blocks = _extend_blocks(result, result_blocks) - blocks = result_blocks - return result_blocks - - elif to_rep_is_list and regex: - for to_rep in to_replace: - result_blocks = [] - for b in blocks: - result = b._replace_single( - to_rep, value, inplace=inplace, regex=regex, convert=convert, - ) - result_blocks = _extend_blocks(result, result_blocks) - blocks = result_blocks - return result_blocks - - return self._replace_single( - to_replace, value, inplace=inplace, convert=convert, regex=regex, - ) - - def _replace_single( - self, to_replace, value, inplace=False, regex=False, convert=True, mask=None, - ): - """ - Replace elements by the given value. - - Parameters - ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - inplace : bool, default False - Perform inplace modification. - regex : bool, default False - If true, perform regular expression substitution. - convert : bool, default True - If true, try to coerce any object types to better types. - mask : array-like of bool, optional - True indicate corresponding element is ignored. 
- - Returns - ------- - a new block, the result after replacing - """ - inplace = validate_bool_kwarg(inplace, "inplace") - - # to_replace is regex compilable - to_rep_re = regex and is_re_compilable(to_replace) - - # regex is regex compilable - regex_re = is_re_compilable(regex) - - # only one will survive - if to_rep_re and regex_re: - raise AssertionError( - "only one of to_replace and regex can be regex compilable" - ) - - # if regex was passed as something that can be a regex (rather than a - # boolean) - if regex_re: - to_replace = regex - - regex = regex_re or to_rep_re - - # try to get the pattern attribute (compiled re) or it's a string - if is_re(to_replace): - pattern = to_replace.pattern - else: - pattern = to_replace - - # if the pattern is not empty and to_replace is either a string or a - # regex - if regex and pattern: - rx = re.compile(to_replace) - else: - # if the thing to replace is not a string or compiled regex call - # the superclass method -> to_replace is some kind of object - return super().replace(to_replace, value, inplace=inplace, regex=regex) - - new_values = self.values if inplace else self.values.copy() - - # deal with replacing values with objects (strings) that match but - # whose replacement is not a string (numeric, nan, object) - if isna(value) or not isinstance(value, str): - - def re_replacer(s): - if is_re(rx) and isinstance(s, str): - return value if rx.search(s) is not None else s - else: - return s - - else: - # value is guaranteed to be a string here, s can be either a string - # or null if it's null it gets returned - def re_replacer(s): - if is_re(rx) and isinstance(s, str): - return rx.sub(value, s) - else: - return s + def replace( + self, + to_replace, + value, + inplace: bool = False, + regex: bool = False, + ) -> List["Block"]: + # Note: the checks we do in NDFrame.replace ensure we never get + # here with listlike to_replace or value, as those cases + # go through _replace_list - f = np.vectorize(re_replacer, otypes=[self.dtype]) + regex = _should_use_regex(regex, to_replace) - if mask is None: - new_values[:] = f(new_values) + if regex: + return self._replace_regex(to_replace, value, inplace=inplace) else: - new_values[mask] = f(new_values[mask]) + return super().replace(to_replace, value, inplace=inplace, regex=False) - # convert - block = self.make_block(new_values) - if convert: - block = block.convert(numeric=False) - return block - def _replace_coerce( - self, to_replace, value, inplace=True, regex=False, convert=False, mask=None - ): - """ - Replace value corresponding to the given boolean array with another - value. +def _should_use_regex(regex: bool, to_replace: Any) -> bool: + """ + Decide whether to treat `to_replace` as a regular expression. + """ + if is_re(to_replace): + regex = True - Parameters - ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - inplace : bool, default False - Perform inplace modification. - regex : bool, default False - If true, perform regular expression substitution. - convert : bool, default True - If true, try to coerce any object types to better types. - mask : array-like of bool, optional - True indicate corresponding element is ignored. + regex = regex and is_re_compilable(to_replace) - Returns - ------- - A new block if there is anything to replace or the original block. 
- """ - if mask.any(): - block = super()._replace_coerce( - to_replace=to_replace, - value=value, - inplace=inplace, - regex=regex, - convert=convert, - mask=mask, - ) - if convert: - block = [b.convert(numeric=False, copy=True) for b in block] - return block - if convert: - return [self.convert(numeric=False, copy=True)] - return self + # Don't use regex if the pattern is empty. + regex = regex and re.compile(to_replace).pattern != "" + return regex class CategoricalBlock(ExtensionBlock): __slots__ = () - is_categorical = True - _verify_integrity = True - _can_hold_na = True - should_store = Block.should_store - - def __init__(self, values, placement, ndim=None): - # coerce to categorical if we can - values = extract_array(values) - assert isinstance(values, Categorical), type(values) - super().__init__(values, placement=placement, ndim=ndim) - - @property - def _holder(self): - return Categorical + def _replace_list( + self, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> List["Block"]: + if len(algos.unique(dest_list)) == 1: + # We likely got here by tiling value inside NDFrame.replace, + # so un-tile here + return self.replace(src_list, dest_list[0], inplace, regex) + return super()._replace_list(src_list, dest_list, inplace, regex) def replace( self, @@ -2637,13 +2626,12 @@ def replace( value, inplace: bool = False, regex: bool = False, - convert: bool = True, - ): + ) -> List["Block"]: inplace = validate_bool_kwarg(inplace, "inplace") result = self if inplace else self.copy() result.values.replace(to_replace, value, inplace=True) - return result + return [result] # ----------------------------------------------------------------- @@ -2666,6 +2654,8 @@ def get_block_type(values, dtype=None): dtype = dtype or values.dtype vtype = dtype.type + cls: Type[Block] + if is_sparse(dtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock @@ -2679,6 +2669,7 @@ def get_block_type(values, dtype=None): elif is_interval_dtype(dtype) or is_period_dtype(dtype): cls = ObjectValuesExtensionBlock elif is_extension_array_dtype(values.dtype): + # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock elif issubclass(vtype, np.floating): cls = FloatBlock @@ -2723,7 +2714,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None): # ----------------------------------------------------------------- -def _extend_blocks(result, blocks=None): +def extend_blocks(result, blocks=None): """ return a new extended blocks, given the result """ if blocks is None: blocks = [] @@ -2747,11 +2738,12 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. 
- values = values.reshape(tuple((1,) + shape)) # type: ignore + # error: "ExtensionArray" has no attribute "reshape" + values = values.reshape(tuple((1,) + shape)) # type: ignore[attr-defined] return values -def _safe_reshape(arr, new_shape): +def safe_reshape(arr, new_shape: Shape): """ If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): @@ -2813,7 +2805,7 @@ def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray: else: # make sure that we have a nullable type # if we have nulls - if not _isna_compat(v, nn[0]): + if not isna_compat(v, nn[0]): pass elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): # only compare integers/floats @@ -2861,7 +2853,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... - mask = np.asarray(mask, dtype=np.bool_) + # Except for BooleanArray, this is equivalent to just + # np.asarray(mask, dtype=bool) + mask = mask.to_numpy(dtype=bool, na_value=False) assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2cc7461986c8f..06de1972b4c9a 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,15 +1,16 @@ from collections import defaultdict import copy -from typing import List +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast import numpy as np from pandas._libs import NaT, internals as libinternals +from pandas._typing import DtypeObj, Shape from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - _get_dtype, + get_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, @@ -20,16 +21,19 @@ is_timedelta64_dtype, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna_all import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import DatetimeArray, ExtensionArray from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager +if TYPE_CHECKING: + from pandas.core.arrays.sparse.dtype import SparseDtype + def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool, + mgrs_indexers, axes, concat_axis: int, copy: bool ) -> BlockManager: """ Concatenate block managers into one. @@ -76,8 +80,9 @@ def concatenate_block_managers( b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy,), + _concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement, + ndim=len(axes), ) blocks.append(b) @@ -100,10 +105,10 @@ def _get_mgr_concatenation_plan(mgr, indexers): """ # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. 
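The BooleanArray case motivating the to_numpy call above is easy to reproduce; a quick sketch (entries that are pd.NA are exactly why np.asarray alone does not suffice):

    import pandas as pd

    mask = pd.array([True, pd.NA, False], dtype="boolean")

    # np.asarray(mask, dtype=bool) would have to guess what pd.NA means and
    # errors out; to_numpy lets us state explicitly that a missing entry
    # means "unmasked".
    np_mask = mask.to_numpy(dtype=bool, na_value=False)
    print(np_mask)        # [ True False False]
    print(np_mask.dtype)  # bool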
- mgr_shape = list(mgr.shape) + mgr_shape_list = list(mgr.shape) for ax, indexer in indexers.items(): - mgr_shape[ax] = len(indexer) - mgr_shape = tuple(mgr_shape) + mgr_shape_list[ax] = len(indexer) + mgr_shape = tuple(mgr_shape_list) if 0 in indexers: ax0_indexer = indexers.pop(0) @@ -111,7 +116,7 @@ def _get_mgr_concatenation_plan(mgr, indexers): blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1) else: - if mgr._is_single_block: + if mgr.is_single_block: blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] @@ -126,9 +131,9 @@ def _get_mgr_concatenation_plan(mgr, indexers): join_unit_indexers = indexers.copy() - shape = list(mgr_shape) - shape[0] = len(placements) - shape = tuple(shape) + shape_list = list(mgr_shape) + shape_list[0] = len(placements) + shape = tuple(shape_list) if blkno == -1: unit = JoinUnit(None, shape) @@ -171,7 +176,7 @@ def _get_mgr_concatenation_plan(mgr, indexers): class JoinUnit: - def __init__(self, block, shape, indexers=None): + def __init__(self, block, shape: Shape, indexers=None): # Passing shape explicitly is required for cases when block is None. if indexers is None: indexers = {} @@ -183,7 +188,7 @@ def __repr__(self) -> str: return f"{type(self).__name__}({repr(self.block)}, {self.indexers})" @cache_readonly - def needs_filling(self): + def needs_filling(self) -> bool: for indexer in self.indexers.values(): # FIXME: cache results of indexer == -1 checks. if (indexer == -1).any(): @@ -199,10 +204,10 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) + return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) @cache_readonly - def is_na(self): + def is_na(self) -> bool: if self.block is None: return True @@ -213,24 +218,17 @@ def is_na(self): # a block is NOT null, chunks should help in such cases. 1000 value # was chosen rather arbitrarily. 
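The maybe_promote call above is what users observe whenever a reindex has to invent rows; a small public-API illustration (the promotion table itself lives in pandas.core.dtypes.cast):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})   # a single int64 block
    out = df.reindex([0, 1, 2, 3])        # row 3 must be filled with NaN

    # int64 cannot hold NaN, so the dtype is promoted before filling.
    print(out["a"].dtype)   # float64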
values = self.block.values - if self.block.is_categorical: - values_flat = values.categories - elif is_sparse(self.block.values.dtype): + if is_sparse(self.block.values.dtype): return False elif self.block.is_extension: # TODO(EA2D): no need for special case with 2D EAs values_flat = values else: values_flat = values.ravel(order="K") - total_len = values_flat.shape[0] - chunk_len = max(total_len // 40, 1000) - for i in range(0, total_len, chunk_len): - if not isna(values_flat[i : i + chunk_len]).all(): - return False - return True + return isna_all(values_flat) - def get_reindexed_values(self, empty_dtype, upcasted_na): + def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na): if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value @@ -251,9 +249,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): empty_dtype ): if self.block is None: - array = empty_dtype.construct_array_type() # TODO(EA2D): special case unneeded with 2D EAs - return array( + return DatetimeArray( np.full(self.shape[1], fill_value.value), dtype=empty_dtype ) elif getattr(self.block, "is_categorical", False): @@ -333,18 +330,22 @@ def _concatenate_join_units(join_units, concat_axis, copy): # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] - concat_values = concat_compat(to_concat, axis=concat_axis) - if not isinstance(concat_values, ExtensionArray): + concat_values = concat_compat(to_concat, axis=0) + if not isinstance(concat_values, ExtensionArray) or ( + isinstance(concat_values, DatetimeArray) and concat_values.tz is None + ): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block + # special case DatetimeArray, which *is* an EA, but is put in a + # consolidated 2D block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat(to_concat, axis=concat_axis,) + concat_values = concat_compat(to_concat, axis=concat_axis) return concat_values -def _get_empty_dtype_and_na(join_units): +def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: """ Return dtype and N/A values to use when concatenating specified units. @@ -374,45 +375,8 @@ def _get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - upcast_classes = defaultdict(list) - null_upcast_classes = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - if is_categorical_dtype(dtype): - upcast_cls = "category" - elif is_datetime64tz_dtype(dtype): - upcast_cls = "datetimetz" - - elif is_extension_array_dtype(dtype): - upcast_cls = "extension" - - elif issubclass(dtype.type, np.bool_): - upcast_cls = "bool" - elif issubclass(dtype.type, np.object_): - upcast_cls = "object" - elif is_datetime64_dtype(dtype): - upcast_cls = "datetime" - elif is_timedelta64_dtype(dtype): - upcast_cls = "timedelta" - elif is_sparse(dtype): - upcast_cls = dtype.subtype.name - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - upcast_cls = dtype.name - else: - upcast_cls = "float" + upcast_classes = _get_upcast_classes(join_units, dtypes) - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. 
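The chunked loop removed above now lives in pandas.core.dtypes.missing.isna_all; a self-contained sketch of the same early-exit idea (isna_all_floats is a local name, and np.isnan restricts it to float ndarrays):

    import numpy as np


    def isna_all_floats(arr: np.ndarray, chunk_len: int = 1000) -> bool:
        # Scan in chunks so one early non-null value short-circuits the
        # check instead of computing a mask over the entire array.
        for i in range(0, arr.shape[0], chunk_len):
            if not np.isnan(arr[i : i + chunk_len]).all():
                return False
        return True


    arr = np.full(1_000_000, np.nan)
    print(isna_all_floats(arr))   # True: every chunk is all-NaN

    arr[5] = 1.0
    print(isna_all_floats(arr))   # False: decided within the first chunk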
- if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: @@ -441,23 +405,74 @@ def _get_empty_dtype_and_na(join_units): return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: - g = np.find_common_type(upcast_classes, []) + common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): + if is_float_dtype(common_dtype): + return common_dtype, common_dtype.type(np.nan) + elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64), np.nan else: - return g, None + return common_dtype, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) +def _get_upcast_classes( + join_units: Sequence[JoinUnit], + dtypes: Sequence[DtypeObj], +) -> Dict[str, List[DtypeObj]]: + """Create mapping between upcast class names and lists of dtypes.""" + upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + upcast_cls = _select_upcast_cls_from_dtype(dtype) + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + return upcast_classes + + +def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: + """Select upcast class name based on dtype.""" + if is_categorical_dtype(dtype): + return "category" + elif is_datetime64tz_dtype(dtype): + return "datetimetz" + elif is_extension_array_dtype(dtype): + return "extension" + elif issubclass(dtype.type, np.bool_): + return "bool" + elif issubclass(dtype.type, np.object_): + return "object" + elif is_datetime64_dtype(dtype): + return "datetime" + elif is_timedelta64_dtype(dtype): + return "timedelta" + elif is_sparse(dtype): + dtype = cast("SparseDtype", dtype) + return dtype.subtype.name + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + return dtype.name + else: + return "float" + + def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can @@ -469,8 +484,8 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: # cannot necessarily join return ( # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) - and # noqa + all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa + and # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. 
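The names returned by _select_upcast_cls_from_dtype above ultimately decide what dtype a concat involving mismatched blocks produces; a few public-API spot checks, with results as observed on the pandas line this patch targets:

    import pandas as pd

    # Two numeric classes: find_common_type picks a shared numeric dtype.
    print(pd.concat([pd.Series([1]), pd.Series([1.5])]).dtype)    # float64

    # "object" wins as soon as it is involved.
    print(pd.concat([pd.Series([1]), pd.Series(["x"])]).dtype)    # object

    # bool and int64 fall into different upcast classes, so the result
    # falls back to object rather than silently casting the bools.
    print(pd.concat([pd.Series([True]), pd.Series([1])]).dtype)   # object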
all(not ju.is_na or ju.block.is_extension for ju in join_units) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4b9db810dead0..3c5216b65a70b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,10 +9,12 @@ import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Axis, DtypeObj, Scalar +from pandas._typing import Axis, DtypeObj, Label, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, + dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -51,7 +53,7 @@ ) if TYPE_CHECKING: - from pandas import Series # noqa:F401 + from pandas import Series # --------------------------------------------------------------------- # BlockManager Interface @@ -189,15 +191,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) - if dtype is not None: - if not is_dtype_equal(values.dtype, dtype): - try: - values = values.astype(dtype) - except Exception as orig: - # e.g. ValueError when trying to cast object dtype to float64 - raise ValueError( - f"failed to cast to '{dtype}' (Exception was: {orig})" - ) from orig + if dtype is not None and not is_dtype_equal(values.dtype, dtype): + try: + values = construct_1d_ndarray_preserving_na( + values.ravel(), dtype=dtype, copy=False + ).reshape(values.shape) + except Exception as orig: + # e.g. ValueError when trying to cast object dtype to float64 + raise ValueError( + f"failed to cast to '{dtype}' (Exception was: {orig})" + ) from orig # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes( @@ -222,7 +225,8 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # TODO: What about re-joining object columns? 
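The NA-preserving cast above matters mostly for string targets, where a plain astype would stringify NaN; a short demonstration, assuming the constructor behavior this patch introduces:

    import numpy as np
    import pandas as pd

    arr = np.array([[1.0, np.nan]])
    df = pd.DataFrame(arr, dtype=str)
    val = df.iloc[0, 1]

    # With construct_1d_ndarray_preserving_na in the path, the missing
    # entry stays missing instead of becoming the literal string "nan".
    print(val, pd.isna(val))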
block_values = [ - make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) + make_block(dvals_list[n], placement=[n], ndim=2) + for n in range(len(dvals_list)) ] else: @@ -242,7 +246,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays: Union[Sequence[Any], "Series"] if columns is not None: - from pandas.core.series import Series # noqa:F811 + from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) data_names = arrays.index @@ -344,7 +348,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): oindex = index.astype("O") if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): - val = com.dict_compat(val) + val = dict_compat(val) else: val = dict(val) val = lib.fast_multiget(val, oindex._values, default=np.nan) @@ -366,7 +370,7 @@ def extract_index(data) -> Index: index = Index([]) elif len(data) > 0: raw_lengths = [] - indexes = [] + indexes: List[Union[List[Label], Index]] = [] have_raw_arrays = False have_series = False @@ -394,7 +398,7 @@ def extract_index(data) -> Index: if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: - raise ValueError("arrays must all be same length") + raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( @@ -434,7 +438,7 @@ def get_names_from_index(data): if not has_some_name: return ibase.default_index(len(data)) - index = list(range(len(data))) + index: List[Label] = list(range(len(data))) count = 0 for i, s in enumerate(data): n = getattr(s, "name", None) @@ -607,7 +611,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( - data: List, + data: List[Dict], columns: Union[Index, List], coerce_float: bool = False, dtype: Optional[DtypeObj] = None, @@ -744,7 +748,12 @@ def sanitize_index(data, index: Index): through a non-Index. 
""" if len(data) != len(index): - raise ValueError("Length of values does not match length of index") + raise ValueError( + "Length of values " + f"({len(data)}) " + "does not match length of index " + f"({len(index)})" + ) if isinstance(data, np.ndarray): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c82670106d3b6..0b3f1079cdb16 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,14 +1,23 @@ from collections import defaultdict import itertools -import operator -import re -from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, +) import warnings import numpy as np from pandas._libs import internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label, Scalar +from pandas._typing import ArrayLike, DtypeObj, Label, Shape from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -18,23 +27,18 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_list_like, - is_numeric_v_string_like, - is_scalar, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import array_equivalent, isna +from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries +from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject -import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index @@ -44,12 +48,12 @@ DatetimeTZBlock, ExtensionBlock, ObjectValuesExtensionBlock, - _extend_blocks, - _safe_reshape, + extend_blocks, get_block_type, make_block, + safe_reshape, ) -from pandas.core.internals.ops import operate_blockwise +from pandas.core.internals.ops import blockwise_all, operate_blockwise # TODO: flexible with index=None and/or items=None @@ -200,7 +204,7 @@ def __nonzero__(self) -> bool: __bool__ = __nonzero__ @property - def shape(self) -> Tuple[int, ...]: + def shape(self) -> Shape: return tuple(len(ax) for ax in self.axes) @property @@ -221,16 +225,16 @@ def set_axis(self, axis: int, new_labels: Index) -> None: self.axes[axis] = new_labels @property - def _is_single_block(self) -> bool: - # Assumes we are 2D; overriden by SingleBlockManager + def is_single_block(self) -> bool: + # Assumes we are 2D; overridden by SingleBlockManager return len(self.blocks) == 1 def _rebuild_blknos_and_blklocs(self) -> None: """ Update mgr._blknos / mgr._blklocs. 
""" - new_blknos = np.empty(self.shape[0], dtype=np.int64) - new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos = np.empty(self.shape[0], dtype=np.intp) + new_blklocs = np.empty(self.shape[0], dtype=np.intp) new_blknos.fill(-1) new_blklocs.fill(-1) @@ -263,7 +267,7 @@ def __getstate__(self): "0.14.1": { "axes": axes_array, "blocks": [ - dict(values=b.values, mgr_locs=b.mgr_locs.indexer) + {"values": b.values, "mgr_locs": b.mgr_locs.indexer} for b in self.blocks ], } @@ -274,14 +278,17 @@ def __getstate__(self): return axes_array, block_values, block_items, extra_state def __setstate__(self, state): - def unpickle_block(values, mgr_locs): - return make_block(values, placement=mgr_locs) + def unpickle_block(values, mgr_locs, ndim: int): + # TODO(EA2D): ndim would be unnecessary with 2D EAs + return make_block(values, placement=mgr_locs, ndim=ndim) if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]: state = state[3]["0.14.1"] self.axes = [ensure_index(ax) for ax in state["axes"]] + ndim = len(self.axes) self.blocks = tuple( - unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] + unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) + for b in state["blocks"] ) else: raise NotImplementedError("pre-0.14.1 pickles are no longer supported") @@ -312,7 +319,7 @@ def _verify_integrity(self) -> None: mgr_shape = self.shape tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: - if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + if block.shape[1:] != mgr_shape[1:]: raise construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError( @@ -321,31 +328,44 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func): + def reduce( + self: T, func: Callable, ignore_failures: bool = False + ) -> Tuple[T, np.ndarray]: + """ + Apply reduction function blockwise, returning a single-row BlockManager. + + Parameters + ---------- + func : reduction function + ignore_failures : bool, default False + Whether to drop blocks where func raises TypeError. + + Returns + ------- + BlockManager + np.ndarray + Indexer of mgr_locs that are retained. 
+ """ # If 2D, we assume that we're operating column-wise - if self.ndim == 1: - # we'll be returning a scalar - blk = self.blocks[0] - return func(blk.values) + assert self.ndim == 2 - res = {} + res_blocks: List[Block] = [] for blk in self.blocks: - bres = func(blk.values) - - if np.ndim(bres) == 0: - # EA - assert blk.shape[0] == 1 - new_res = zip(blk.mgr_locs.as_array, [bres]) + nbs = blk.reduce(func, ignore_failures) + res_blocks.extend(nbs) + + index = Index([None]) # placeholder + if ignore_failures: + if res_blocks: + indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks]) + new_mgr = self._combine(res_blocks, copy=False, index=index) else: - assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape) - new_res = zip(blk.mgr_locs.as_array, bres) - - nr = dict(new_res) - assert not any(key in res for key in nr) - res.update(nr) - - return res + indexer = [] + new_mgr = type(self).from_blocks([], [Index([]), index]) + else: + indexer = np.arange(self.shape[0]) + new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) + return new_mgr, indexer def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ @@ -353,7 +373,13 @@ def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ return operate_blockwise(self, other, array_op) - def apply(self: T, f, align_keys=None, **kwargs) -> T: + def apply( + self: T, + f, + align_keys: Optional[List[str]] = None, + ignore_failures: bool = False, + **kwargs, + ) -> T: """ Iterate over the blocks, collect and create a new BlockManager. @@ -361,6 +387,10 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: ---------- f : str or callable Name of the Block method to apply. + align_keys: List[str] or None, default None + ignore_failures: bool, default False + **kwargs + Keywords to pass to `f` Returns ------- @@ -390,11 +420,19 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: # otherwise we have an ndarray kwargs[k] = obj[b.mgr_locs.indexer] - if callable(f): - applied = b.apply(f, **kwargs) - else: - applied = getattr(b, f)(**kwargs) - result_blocks = _extend_blocks(applied, result_blocks) + try: + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_blocks = extend_blocks(applied, result_blocks) + + if ignore_failures: + return self._combine(result_blocks) if len(result_blocks) == 0: return self.make_empty(self.axes) @@ -417,6 +455,8 @@ def quantile( Parameters ---------- axis: reduction axis, default 0 + consolidate: bool, default True. 
Join together blocks having same + dtype transposed: bool, default False we are holding transposed data interpolation : type of interpolation, default 'linear' @@ -495,7 +535,7 @@ def get_axe(block, qs, axes): values = values.take(indexer) return SingleBlockManager( - make_block(values, ndim=1, placement=np.arange(len(values))), axes[0], + make_block(values, ndim=1, placement=np.arange(len(values))), axes[0] ) def isna(self, func) -> "BlockManager": @@ -523,9 +563,7 @@ def where( def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) - def putmask( - self, mask, new, align: bool = True, axis: int = 0, - ): + def putmask(self, mask, new, align: bool = True, axis: int = 0): transpose = self.ndim == 2 if align: @@ -551,6 +589,28 @@ def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": + if fill_value is lib.no_default: + fill_value = None + + if axis == 0 and self.ndim == 2 and self.nblocks > 1: + # GH#35488 we need to watch out for multi-block cases + # We only get here with fill_value not-lib.no_default + ncols = self.shape[0] + if periods > 0: + indexer = [-1] * periods + list(range(ncols - periods)) + else: + nper = abs(periods) + indexer = list(range(nper, ncols)) + [-1] * nper + result = self.reindex_indexer( + self.items, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + consolidate=False, + ) + return result + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager": @@ -572,7 +632,6 @@ def convert( datetime: bool = True, numeric: bool = True, timedelta: bool = True, - coerce: bool = False, ) -> "BlockManager": return self.apply( "convert", @@ -580,66 +639,41 @@ def convert( datetime=datetime, numeric=numeric, timedelta=timedelta, - coerce=coerce, ) - def replace(self, value, **kwargs) -> "BlockManager": + def replace(self, to_replace, value, inplace: bool, regex: bool) -> "BlockManager": assert np.ndim(value) == 0, value - return self.apply("replace", value=value, **kwargs) + return self.apply( + "replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) def replace_list( - self, src_list, dest_list, inplace: bool = False, regex: bool = False - ) -> "BlockManager": + self: T, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> T: """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") - # figure out our mask apriori to avoid repeated replacements - values = self.as_array() - - def comp(s, regex=False): - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return isna(values) - - s = com.maybe_box_datetimelike(s) - return _compare_or_regex_search(values, s, regex) - - masks = [comp(s, regex) for s in src_list] - - result_blocks = [] - src_len = len(src_list) - 1 - for blk in self.blocks: - - # its possible to get multiple result blocks here - # replace ALWAYS will return a list - rb = [blk if inplace else blk.copy()] - for i, (s, d) in enumerate(zip(src_list, dest_list)): - new_rb: List[Block] = [] - for b in rb: - m = masks[i][b.mgr_locs.indexer] - convert = i == src_len # only convert once at the end - result = b._replace_coerce( - mask=m, - to_replace=s, - value=d, - inplace=inplace, - convert=convert, - regex=regex, - ) - if m.any() 
or convert: - new_rb = _extend_blocks(result, new_rb) - else: - new_rb.append(b) - rb = new_rb - result_blocks.extend(rb) - - bm = type(self).from_blocks(result_blocks, self.axes) + bm = self.apply( + "_replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + ) bm._consolidate_inplace() return bm + def to_native_types(self, **kwargs) -> "BlockManager": + """ + Convert values to native types (strings / python objects) that are used + in formatting (repr / csv). + """ + return self.apply("to_native_types", **kwargs) + def is_consolidated(self) -> bool: """ Return True if more than one block with the same dtype @@ -653,12 +687,6 @@ def _consolidate_check(self) -> None: self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True - @property - def is_mixed_type(self) -> bool: - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return len(self.blocks) > 1 - @property def is_numeric_mixed_type(self) -> bool: return all(block.is_numeric for block in self.blocks) @@ -685,13 +713,28 @@ def is_view(self) -> bool: def get_bool_data(self, copy: bool = False) -> "BlockManager": """ + Select blocks that are bool-dtype and columns from object-dtype blocks + that are all-bool. + Parameters ---------- copy : bool, default False Whether to copy the blocks """ - self._consolidate_inplace() - return self._combine([b for b in self.blocks if b.is_bool], copy) + + new_blocks = [] + + for blk in self.blocks: + if blk.dtype == bool: + new_blocks.append(blk) + + elif blk.is_object: + nbs = blk._split() + for nb in nbs: + if nb.is_bool: + new_blocks.append(nb) + + return self._combine(new_blocks, copy) def get_numeric_data(self, copy: bool = False) -> "BlockManager": """ @@ -700,10 +743,11 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager": copy : bool, default False Whether to copy the blocks """ - self._consolidate_inplace() return self._combine([b for b in self.blocks if b.is_numeric], copy) - def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": + def _combine( + self: T, blocks: List[Block], copy: bool = True, index: Optional[Index] = None + ) -> T: """ return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() @@ -712,13 +756,15 @@ def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - new_blocks = [] + new_blocks: List[Block] = [] for b in blocks: b = b.copy(deep=copy) b.mgr_locs = inv_indexer[b.mgr_locs.indexer] new_blocks.append(b) axes = list(self.axes) + if index is not None: + axes[-1] = index axes[0] = self.items.take(indexer) return type(self).from_blocks(new_blocks, axes) @@ -807,7 +853,7 @@ def as_array( # mutating the original object copy = copy or na_value is not lib.no_default - if self._is_single_block: + if self.is_single_block: blk = self.blocks[0] if blk.is_extension: # Avoid implicit conversion of extension blocks to object @@ -845,6 +891,8 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" result = np.empty(self.shape, dtype=dtype) @@ -876,12 +924,7 @@ def to_dict(self, copy: bool = True): Returns ------- values : a dict of dtype -> BlockManager - - Notes - ----- - This consolidates based on str(dtype) """ - 
self._consolidate_inplace() bd: Dict[str, List[Block]] = {} for b in self.blocks: @@ -1012,6 +1055,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items """ + value = extract_array(value, extract_numpy=True) # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical if self._blklocs is None and self.ndim > 1: @@ -1027,7 +1071,7 @@ def value_getitem(placement): else: if value.ndim == self.ndim - 1: - value = _safe_reshape(value, (1,) + value.shape) + value = safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -1059,7 +1103,7 @@ def value_getitem(placement): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): - blk.set(blk_locs, value_getitem(val_locs)) + blk.set_inplace(blk_locs, value_getitem(val_locs)) else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1150,7 +1194,7 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): # TODO(EA2D): special case not needed with 2D EAs - value = _safe_reshape(value, (1,) + value.shape) + value = safe_reshape(value, (1,) + value.shape) block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) @@ -1188,6 +1232,8 @@ def reindex_axis( limit=None, fill_value=None, copy: bool = True, + consolidate: bool = True, + only_slice: bool = False, ): """ Conform block manager to new index. @@ -1198,7 +1244,13 @@ def reindex_axis( ) return self.reindex_indexer( - new_index, indexer, axis=axis, fill_value=fill_value, copy=copy + new_index, + indexer, + axis=axis, + fill_value=fill_value, + copy=copy, + consolidate=consolidate, + only_slice=only_slice, ) def reindex_indexer( @@ -1209,6 +1261,8 @@ def reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, + consolidate: bool = True, + only_slice: bool = False, ) -> T: """ Parameters @@ -1219,7 +1273,10 @@ def reindex_indexer( fill_value : object, default None allow_dups : bool, default False copy : bool, default True - + consolidate: bool, default True + Whether to consolidate inplace before reindexing. + only_slice : bool, default False + Whether to take views, not copies, along columns. pandas-indexer with -1's only. 
""" @@ -1232,7 +1289,8 @@ def reindex_indexer( result.axes[axis] = new_axis return result - self._consolidate_inplace() + if consolidate: + self._consolidate_inplace() # some axes don't allow reindexing with dups if not allow_dups: @@ -1242,7 +1300,9 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value) + new_blocks = self._slice_take_blocks_ax0( + indexer, fill_value=fill_value, only_slice=only_slice + ) else: new_blocks = [ blk.take_nd( @@ -1286,7 +1346,7 @@ def _slice_take_blocks_ax0( slice_or_indexer, self.shape[0], allow_fill=allow_fill ) - if self._is_single_block: + if self.is_single_block: blk = self.blocks[0] if sl_type in ("slice", "mask"): @@ -1385,13 +1445,12 @@ def _make_na_block(self, placement, fill_value=None): dtype, fill_value = infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, placement=placement) + return make_block(block_values, placement=placement, ndim=block_values.ndim) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ Take items along any axis. """ - self._consolidate_inplace() indexer = ( np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64") if isinstance(indexer, slice) @@ -1408,10 +1467,17 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True new_labels = self.axes[axis].take(indexer) return self.reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + new_axis=new_labels, + indexer=indexer, + axis=axis, + allow_dups=True, + consolidate=False, ) - def equals(self, other: "BlockManager") -> bool: + def equals(self, other: object) -> bool: + if not isinstance(other, BlockManager): + return False + self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False @@ -1424,26 +1490,9 @@ def equals(self, other: "BlockManager") -> bool: return False left = self.blocks[0].values right = other.blocks[0].values - if not is_dtype_equal(left.dtype, right.dtype): - return False - elif isinstance(left, ExtensionArray): - return left.equals(right) - else: - return array_equivalent(left, right) + return array_equals(left, right) - for i in range(len(self.items)): - # Check column-wise, return False if any column doesnt match - left = self.iget_values(i) - right = other.iget_values(i) - if not is_dtype_equal(left.dtype, right.dtype): - return False - elif isinstance(left, ExtensionArray): - if not left.equals(right): - return False - else: - if not array_equivalent(left, right): - return False - return True + return blockwise_all(self, other, array_equals) def unstack(self, unstacker, fill_value) -> "BlockManager": """ @@ -1490,7 +1539,7 @@ class SingleBlockManager(BlockManager): _is_consolidated = True _known_consolidated = True __slots__ = () - _is_single_block = True + is_single_block = True def __init__( self, @@ -1511,7 +1560,7 @@ def __init__( ) self.axes = [axis] - self.blocks = tuple([block]) + self.blocks = (block,) @classmethod def from_blocks( @@ -1622,7 +1671,9 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. 
blocks = [ - make_block(values=blocks[0], placement=slice(0, len(axes[0]))) + make_block( + values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 + ) ] mgr = BlockManager(blocks, axes) @@ -1642,8 +1693,11 @@ def create_block_manager_from_arrays( assert isinstance(axes, list) assert all(isinstance(x, Index) for x in axes) + # ensure we dont have any PandasArrays when we call get_block_type + # Note: just calling extract_array breaks tests that patch PandasArray._typ. + arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays] try: - blocks = form_blocks(arrays, names, axes) + blocks = _form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr @@ -1675,7 +1729,7 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def form_blocks(arrays, names: Index, axes) -> List[Block]: +def _form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1722,7 +1776,7 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i) + make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) for i, _, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1737,15 +1791,14 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=i) + make_block(array, klass=CategoricalBlock, placement=i, ndim=2) for i, _, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): - external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=i) + make_block(array, klass=ExtensionBlock, placement=i, ndim=2) for i, _, array in items_dict["ExtensionBlock"] ] @@ -1753,7 +1806,7 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=i) + make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1766,7 +1819,7 @@ def form_blocks(arrays, names: Index, axes) -> List[Block]: block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - na_block = make_block(block_values, placement=extra_locs) + na_block = make_block(block_values, placement=extra_locs, ndim=2) blocks.append(na_block) return blocks @@ -1783,7 +1836,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - block = make_block(values, placement=placement) + block = make_block(values, placement=placement, ndim=2) return [block] @@ -1797,7 +1850,7 @@ def _multi_blockify(tuples, dtype=None): values, placement = _stack_arrays(list(tup_block), dtype) - block = make_block(values, placement=placement) + block = make_block(values, placement=placement, ndim=2) new_blocks.append(block) return new_blocks @@ -1812,7 +1865,7 @@ def _asarray_compat(x): else: return np.asarray(x) - def _shape_compat(x): + def _shape_compat(x) -> Shape: if isinstance(x, ABCSeries): return (len(x),) else: @@ -1857,12 +1910,12 @@ def _consolidate(blocks): gkey = lambda x: x._consolidate_key grouper = 
itertools.groupby(sorted(blocks, key=gkey), gkey) - new_blocks = [] + new_blocks: List[Block] = [] for (_can_consolidate, dtype), group_blocks in grouper: merged_blocks = _merge_blocks( list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) - new_blocks = _extend_blocks(merged_blocks, new_blocks) + new_blocks.extend(merged_blocks) return new_blocks @@ -1888,82 +1941,12 @@ def _merge_blocks( new_values = new_values[argsort] new_mgr_locs = new_mgr_locs[argsort] - return [make_block(new_values, placement=new_mgr_locs)] + return [make_block(new_values, placement=new_mgr_locs, ndim=2)] # can't consolidate --> no merge return blocks -def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False -) -> Union[ArrayLike, bool]: - """ - Compare two array_like inputs of the same shape or two scalar values - - Calls operator.eq or re.search, depending on regex argument. If regex is - True, perform an element-wise regex matching. - - Parameters - ---------- - a : array_like - b : scalar - regex : bool, default False - - Returns - ------- - mask : array_like of bool - """ - - def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Scalar, - ): - """ - Raises an error if the two arrays (a,b) cannot be compared. - Otherwise, returns the comparison result as expected. - """ - if is_scalar(result) and isinstance(a, np.ndarray): - type_names = [type(a).__name__, type(b).__name__] - - if isinstance(a, np.ndarray): - type_names[0] = f"ndarray(dtype={a.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - - if not regex: - op = lambda x: operator.eq(x, b) - else: - op = np.vectorize( - lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, str) - else False - ) - - # GH#32621 use mask to avoid comparing to NAs - if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): - mask = np.reshape(~(isna(a)), a.shape) - if isinstance(a, np.ndarray): - a = a[mask] - - if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): - # GH#29553 avoid deprecation warnings from numpy - _check_comparison_types(False, a, b) - return False - - result = op(a) - - if isinstance(result, np.ndarray): - # The shape of the mask can differ to that of the result - # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool_) - tmp[mask] = result - result = tmp - - _check_comparison_types(result, a, b) - return result - - def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index fd9a9a5ef6c93..d7ea5d613d96a 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -1,21 +1,26 @@ -from typing import TYPE_CHECKING, List, Tuple +from collections import namedtuple +from typing import TYPE_CHECKING, Iterator, List, Tuple import numpy as np from pandas._typing import ArrayLike if TYPE_CHECKING: - from pandas.core.internals.managers import BlockManager # noqa:F401 - from pandas.core.internals.blocks import Block # noqa:F401 + from pandas.core.internals.blocks import Block + from pandas.core.internals.managers import BlockManager -def operate_blockwise( - left: "BlockManager", right: "BlockManager", array_op -) -> "BlockManager": +BlockPairInfo = namedtuple( + "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"] +) + + +def 
_iter_block_pairs( + left: "BlockManager", right: "BlockManager" +) -> Iterator[BlockPairInfo]: # At this point we have already checked the parent DataFrames for # assert rframe._indexed_same(lframe) - res_blks: List["Block"] = [] for n, blk in enumerate(left.blocks): locs = blk.mgr_locs blk_vals = blk.values @@ -34,21 +39,32 @@ def operate_blockwise( right_ea = not isinstance(rblk.values, np.ndarray) lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) + info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk) + yield info - res_values = array_op(lvals, rvals) - if left_ea and not right_ea and hasattr(res_values, "reshape"): - res_values = res_values.reshape(1, -1) - nbs = rblk._split_op_result(res_values) - # Assertions are disabled for performance, but should hold: - # if right_ea or left_ea: - # assert len(nbs) == 1 - # else: - # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) +def operate_blockwise( + left: "BlockManager", right: "BlockManager", array_op +) -> "BlockManager": + # At this point we have already checked the parent DataFrames for + # assert rframe._indexed_same(lframe) + + res_blks: List["Block"] = [] + for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right): + res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) + + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - _reset_block_mgr_locs(nbs, locs) + _reset_block_mgr_locs(nbs, locs) - res_blks.extend(nbs) + res_blks.extend(nbs) # Assertions are disabled for performance, but should hold: # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} @@ -85,7 +101,7 @@ def _get_same_shape_values( # Require that the indexing into lvals be slice-like assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs - # TODO(EA2D): with 2D EAs pnly this first clause would be needed + # TODO(EA2D): with 2D EAs only this first clause would be needed if not (left_ea or right_ea): lvals = lvals[rblk.mgr_locs.indexer, :] assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) @@ -102,3 +118,14 @@ def _get_same_shape_values( rvals = rvals[0, :] return lvals, rvals + + +def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: + """ + Blockwise `all` reduction. + """ + for info in _iter_block_pairs(left, right): + res = op(info.lvals, info.rvals) + if not res: + return False + return True diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 7802c5cbdbfb3..445c1efae22e4 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,78 +1,67 @@ """ Routines for filling missing data. 
""" - -from typing import Any, List, Optional, Set, Union +from functools import partial +from typing import TYPE_CHECKING, Any, List, Optional, Set, Union import numpy as np from pandas._libs import algos, lib +from pandas._typing import ArrayLike, Axis, DtypeObj from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_float64, - is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, is_numeric_v_string_like, - is_scalar, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna +if TYPE_CHECKING: + from pandas import Index + -def mask_missing(arr, values_to_mask): +def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray: """ Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True - """ - dtype, values_to_mask = infer_dtype_from_array(values_to_mask) - try: - values_to_mask = np.array(values_to_mask, dtype=dtype) + Parameters + ---------- + arr : ArrayLike + values_to_mask: list, tuple, or scalar - except Exception: - values_to_mask = np.array(values_to_mask, dtype=object) + Returns + ------- + np.ndarray[bool] + """ + # When called from Block.replace/replace_list, values_to_mask is a scalar + # known to be holdable by arr. + # When called from Series._single_replace, values_to_mask is tuple or list + dtype, values_to_mask = infer_dtype_from_array(values_to_mask) + values_to_mask = np.array(values_to_mask, dtype=dtype) na_mask = isna(values_to_mask) nonna = values_to_mask[~na_mask] - mask = None + # GH 21977 + mask = np.zeros(arr.shape, dtype=bool) for x in nonna: - if mask is None: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - mask = False - else: - mask = arr == x - - # if x is a string and arr is not, then we get False and we must - # expand the mask to size arr.shape - if is_scalar(mask): - mask = np.zeros(arr.shape, dtype=bool) + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass else: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - mask |= False - else: - mask |= arr == x + mask |= arr == x if na_mask.any(): - if mask is None: - mask = isna(arr) - else: - mask |= isna(arr) - - # GH 21977 - if mask is None: - mask = np.zeros(arr.shape, dtype=bool) + mask |= isna(arr) return mask -def clean_fill_method(method, allow_nearest=False): +def clean_fill_method(method, allow_nearest: bool = False): # asfreq is compat for resampling if method in [None, "asfreq"]: return None @@ -169,7 +158,7 @@ def find_valid_index(values, how: str): def interpolate_1d( - xvalues: np.ndarray, + xvalues: "Index", yvalues: np.ndarray, method: Optional[str] = "linear", limit: Optional[int] = None, @@ -191,9 +180,7 @@ def interpolate_1d( valid = ~invalid if not valid.any(): - # have to call np.asarray(xvalues) since xvalues could be an Index - # which can't be mutated - result = np.empty_like(np.asarray(xvalues), dtype=np.float64) + result = np.empty(xvalues.shape, dtype=np.float64) result.fill(np.nan) return result @@ -201,8 +188,7 @@ def interpolate_1d( return yvalues if method == "time": - if not getattr(xvalues, "is_all_dates", None): - # if not issubclass(xvalues.dtype.type, np.datetime64): + if not needs_i8_conversion(xvalues.dtype): raise ValueError( "time-weighted interpolation only works " "on Series or DataFrames with a " @@ -228,7 +214,7 @@ def interpolate_1d( ) # 
default limit is unlimited GH #16282
-    limit = algos._validate_limit(nobs=None, limit=limit)
+    limit = algos.validate_limit(nobs=None, limit=limit)

     # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
     all_nans = set(np.flatnonzero(invalid))
@@ -266,20 +252,18 @@ def interpolate_1d(
     # sort preserve_nans and convert to list
     preserve_nans = sorted(preserve_nans)

-    yvalues = getattr(yvalues, "values", yvalues)
     result = yvalues.copy()

-    # xvalues to pass to NumPy/SciPy
+    # xarr to pass to NumPy/SciPy
+    xarr = xvalues._values
+    if needs_i8_conversion(xarr.dtype):
+        # GH#1646 for dt64tz
+        xarr = xarr.view("i8")

-    xvalues = getattr(xvalues, "values", xvalues)
     if method == "linear":
-        inds = xvalues
+        inds = xarr
     else:
-        inds = np.asarray(xvalues)
-
-        # hack for DatetimeIndex, #1646
-        if needs_i8_conversion(inds.dtype):
-            inds = inds.view(np.int64)
+        inds = np.asarray(xarr)

     if method in ("values", "index"):
         if inds.dtype == np.object_:
@@ -329,7 +313,7 @@ def _interpolate_scipy_wrapper(
         "piecewise_polynomial": _from_derivatives,
     }

-    if getattr(x, "is_all_dates", False):
+    if getattr(x, "_is_all_dates", False):
         # GH 5975, scipy.interp1d can't handle datetime64s
         x, new_x = x._values.astype("i8"), new_x.astype("i8")

@@ -542,13 +526,92 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat
     return P(x)


+def _interpolate_with_limit_area(
+    values: ArrayLike, method: str, limit: Optional[int], limit_area: Optional[str]
+) -> ArrayLike:
+    """
+    Apply interpolation and limit_area logic to values along a to-be-specified axis.
+
+    Parameters
+    ----------
+    values: array-like
+        Input array.
+    method: str
+        Interpolation method. Could be "bfill" or "pad"
+    limit: int, optional
+        Index limit on interpolation.
+    limit_area: str
+        Limit area for interpolation. Can be "inside" or "outside"
+
+    Returns
+    -------
+    values: array-like
+        Interpolated array.
+    """
+
+    invalid = isna(values)
+
+    if not invalid.all():
+        first = find_valid_index(values, "first")
+        last = find_valid_index(values, "last")
+
+        values = interpolate_2d(
+            values,
+            method=method,
+            limit=limit,
+        )
+
+        if limit_area == "inside":
+            invalid[first : last + 1] = False
+        elif limit_area == "outside":
+            invalid[:first] = invalid[last + 1 :] = False
+
+        values[invalid] = np.nan
+
+    return values
+
+
 def interpolate_2d(
-    values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
+    values,
+    method: str = "pad",
+    axis: Axis = 0,
+    limit: Optional[int] = None,
+    limit_area: Optional[str] = None,
 ):
     """
     Perform an actual interpolation of values, values will be made 2-d if
     needed; fills inplace, returns the result.
+
+    Parameters
+    ----------
+    values: array-like
+        Input array.
+    method: str, default "pad"
+        Interpolation method. Could be "bfill" or "pad"
+    axis: 0 or 1
+        Interpolation axis
+    limit: int, optional
+        Index limit on interpolation.
+    limit_area: str, optional
+        Limit area for interpolation. Can be "inside" or "outside"
+
+    Returns
+    -------
+    values: array-like
+        Interpolated array.
""" + if limit_area is not None: + return np.apply_along_axis( + partial( + _interpolate_with_limit_area, + method=method, + limit=limit, + limit_area=limit_area, + ), + axis, + values, + ) + orig_values = values transf = (lambda x: x) if axis == 0 else (lambda x: x.T) @@ -560,80 +623,73 @@ def interpolate_2d( raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") values = values.reshape(tuple((1,) + values.shape)) - if fill_value is None: - mask = None - else: # todo create faster fill func without masking - mask = mask_missing(transf(values), fill_value) - method = clean_fill_method(method) + tvalues = transf(values) if method == "pad": - values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + result = _pad_2d(tvalues, limit=limit) else: - values = transf( - backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) - ) + result = _backfill_2d(tvalues, limit=limit) + result = transf(result) # reshape back if ndim == 1: - values = values[0] + result = result[0] - if orig_values.dtype.kind == "M": - # convert float back to datetime64 - values = values.astype(orig_values.dtype) + if orig_values.dtype.kind in ["m", "M"]: + # convert float back to datetime64/timedelta64 + result = result.view(orig_values.dtype) - return values + return result -def _cast_values_for_fillna(values, dtype): +def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): """ Cast values to a dtype that algos.pad and algos.backfill can handle. """ # TODO: for int-dtypes we make a copy, but for everything else this # alters the values in-place. Is this intentional? - if ( - is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - or is_timedelta64_dtype(dtype) - ): + if needs_i8_conversion(dtype): values = values.view(np.int64) - elif is_integer_dtype(values): + elif is_integer_dtype(values) and not has_mask: # NB: this check needs to come after the datetime64 check above + # has_mask check to avoid casting i8 values that have already + # been cast from PeriodDtype values = ensure_float64(values) return values -def _fillna_prep(values, mask=None, dtype=None): - # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d - if dtype is None: - dtype = values.dtype +def _fillna_prep(values, mask=None): + # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d + dtype = values.dtype - if mask is None: + has_mask = mask is not None + if not has_mask: # This needs to occur before datetime/timedeltas are cast to int64 mask = isna(values) - values = _cast_values_for_fillna(values, dtype) + values = _cast_values_for_fillna(values, dtype, has_mask) mask = mask.view(np.uint8) return values, mask -def pad_1d(values, limit=None, mask=None, dtype=None): - values, mask = _fillna_prep(values, mask, dtype) +def _pad_1d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) algos.pad_inplace(values, mask, limit=limit) return values -def backfill_1d(values, limit=None, mask=None, dtype=None): - values, mask = _fillna_prep(values, mask, dtype) +def _backfill_1d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) algos.backfill_inplace(values, mask, limit=limit) return values -def pad_2d(values, limit=None, mask=None, dtype=None): - values, mask = _fillna_prep(values, mask, dtype) +def _pad_2d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) if np.all(values.shape): algos.pad_2d_inplace(values, mask, limit=limit) @@ -643,8 +699,8 @@ def pad_2d(values, limit=None, mask=None, dtype=None): return values 
-def backfill_2d(values, limit=None, mask=None, dtype=None):
-    values, mask = _fillna_prep(values, mask, dtype)
+def _backfill_2d(values, limit=None, mask=None):
+    values, mask = _fillna_prep(values, mask)

     if np.all(values.shape):
         algos.backfill_2d_inplace(values, mask, limit=limit)
@@ -654,7 +710,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None):
     return values


-_fill_methods = {"pad": pad_1d, "backfill": backfill_1d}
+_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}


 def get_fill_func(method):
@@ -723,15 +779,15 @@ def inner(invalid, limit):
             # just use forwards
             return f_idx
         else:
-            b_idx = list(inner(invalid[::-1], bw_limit))
-            b_idx = set(N - 1 - np.asarray(b_idx))
+            b_idx_inv = list(inner(invalid[::-1], bw_limit))
+            b_idx = set(N - 1 - np.asarray(b_idx_inv))
             if fw_limit == 0:
                 return b_idx

     return f_idx & b_idx


-def _rolling_window(a, window):
+def _rolling_window(a: np.ndarray, window: int):
     """
     [True, True, False, True, False], 2 ->
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index e7e28798d84a2..88662a4fabed8 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -2,18 +2,18 @@
 import itertools
 import operator
 from typing import Any, Optional, Tuple, Union, cast
+import warnings

 import numpy as np

 from pandas._config import get_option

-from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib
+from pandas._libs import NaT, Timedelta, iNaT, lib
 from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar
 from pandas.compat._optional import import_optional_dependency

-from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
 from pandas.core.dtypes.common import (
-    _get_dtype,
+    get_dtype,
     is_any_int_dtype,
     is_bool_dtype,
     is_complex,
@@ -96,7 +96,11 @@ def __call__(self, alt: F) -> F:

         @functools.wraps(alt)
         def f(
-            values: np.ndarray, axis: Optional[int] = None, skipna: bool = True, **kwds
+            values: np.ndarray,
+            *,
+            axis: Optional[int] = None,
+            skipna: bool = True,
+            **kwds,
         ):
             if len(self.kwargs) > 0:
                 for k, v in self.kwargs.items():
@@ -185,7 +189,7 @@ def _get_fill_value(
     else:
         if fill_value_typ == "+inf":
             # need the max int here
-            return _int64_max
+            return np.iinfo(np.int64).max
         else:
             return iNaT

@@ -228,7 +232,7 @@ def _maybe_get_mask(
         # Boolean data cannot contain nulls, so signal via mask being None
         return None

-    if skipna:
+    if skipna or needs_i8_conversion(values.dtype):
         mask = isna(values)

     return mask

@@ -279,7 +283,7 @@ def _get_values(
     """
     # _get_values is only called from within nanops, and in all cases
     #  with scalar fill_value.
This guarantee is important for the - # maybe_upcast_putmask call below + # np.where call below assert is_scalar(fill_value) values = extract_array(values, extract_numpy=True) @@ -287,10 +291,12 @@ def _get_values( dtype = values.dtype + datetimelike = False if needs_i8_conversion(values.dtype): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = np.asarray(values.view("i8")) + datetimelike = True dtype_ok = _na_ok_dtype(dtype) @@ -301,13 +307,13 @@ def _get_values( ) if skipna and (mask is not None) and (fill_value is not None): - values = values.copy() - if dtype_ok and mask.any(): - np.putmask(values, mask, fill_value) - - # promote if needed - else: - values, _ = maybe_upcast_putmask(values, mask, fill_value) + if mask.any(): + if dtype_ok or datetimelike: + values = values.copy() + np.putmask(values, mask, fill_value) + else: + # np.where will promote if needed + values = np.where(~mask, values, fill_value) # return a platform independent precision dtype dtype_max = dtype @@ -325,18 +331,24 @@ def _na_ok_dtype(dtype: DtypeObj) -> bool: return not issubclass(dtype.type, np.integer) -def _wrap_results(result, dtype: DtypeObj, fill_value=None): +def _wrap_results(result, dtype: np.dtype, fill_value=None): """ wrap our results if needed """ - if is_datetime64_any_dtype(dtype): + if result is NaT: + pass + + elif is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): - tz = getattr(dtype, "tz", None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan - result = Timestamp(result, tz=tz) + + if isna(result): + result = np.datetime64("NaT", "ns") + else: + result = np.int64(result).view("datetime64[ns]") else: # If we have float dtype, taking a view will give the wrong result result = result.astype(dtype) @@ -346,7 +358,7 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): result = np.nan # raise if we have a timedelta64[ns] which is too large - if np.fabs(result) > _int64_max: + if np.fabs(result) > np.iinfo(np.int64).max: raise ValueError("overflow in timedelta operation") result = Timedelta(result, unit="ns") @@ -356,6 +368,39 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): return result +def _datetimelike_compat(func: F) -> F: + """ + If we have datetime64 or timedelta64 values, ensure we have a correct + mask before calling the wrapped function, then cast back afterwards. 
+ """ + + @functools.wraps(func) + def new_func( + values: np.ndarray, + *, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, + **kwargs, + ): + orig_values = values + + datetimelike = values.dtype.kind in ["m", "M"] + if datetimelike and mask is None: + mask = isna(values) + + result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs) + + if datetimelike: + result = _wrap_results(result, orig_values.dtype, fill_value=iNaT) + if not skipna: + result = _mask_datetimelike_result(result, axis, mask, orig_values) + + return result + + return cast(F, new_func) + + def _na_for_min_count( values: np.ndarray, axis: Optional[int] ) -> Union[Scalar, np.ndarray]: @@ -378,23 +423,23 @@ def _na_for_min_count( if is_numeric_dtype(values): values = values.astype("float64") fill_value = na_value_for_dtype(values.dtype) + if fill_value is NaT: + fill_value = values.dtype.type("NaT", "ns") if values.ndim == 1: return fill_value + elif axis is None: + return fill_value else: - assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] - # calling np.full with dtype parameter throws an ValueError when called - # with dtype=np.datetime64 and and fill_value=pd.NaT - try: - result = np.full(result_shape, fill_value, dtype=values.dtype) - except ValueError: - result = np.full(result_shape, fill_value) + + result = np.full(result_shape, fill_value, dtype=values.dtype) return result def nanany( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -432,6 +477,7 @@ def nanany( def nanall( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -468,8 +514,10 @@ def nanall( @disallow("M8") +@_datetimelike_compat def nansum( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, min_count: int = 0, @@ -506,16 +554,36 @@ def nansum( dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 + the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) - return _wrap_results(the_sum, dtype) + return the_sum + + +def _mask_datetimelike_result( + result: Union[np.ndarray, np.datetime64, np.timedelta64], + axis: Optional[int], + mask: np.ndarray, + orig_values: np.ndarray, +): + if isinstance(result, np.ndarray): + # we need to apply the mask + result = result.astype("i8").view(orig_values.dtype) + axis_mask = mask.any(axis=axis) + result[axis_mask] = iNaT + else: + if mask.any(): + result = NaT + return result @disallow(PeriodDtype) @bottleneck_switch() +@_datetimelike_compat def nanmean( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -549,16 +617,16 @@ def nanmean( ) dtype_sum = dtype_max dtype_count = np.float64 + # not using needs_i8_conversion because that includes period - if ( - is_integer_dtype(dtype) - or is_datetime64_any_dtype(dtype) - or is_timedelta64_dtype(dtype) - ): + if dtype.kind in ["m", "M"]: + dtype_sum = np.float64 + elif is_integer_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype + count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) @@ -573,11 +641,11 @@ def nanmean( else: the_mean = the_sum / count if count > 0 else np.nan - return _wrap_results(the_mean, dtype) + return the_mean 
@bottleneck_switch() -def nanmedian(values, axis=None, skipna=True, mask=None): +def nanmedian(values, *, axis=None, skipna=True, mask=None): """ Parameters ---------- @@ -605,7 +673,11 @@ def get_median(x): mask = notna(x) if not skipna and not mask.all(): return np.nan - return np.nanmedian(x[mask]) + with warnings.catch_warnings(): + # Suppress RuntimeWarning about All-NaN slice + warnings.filterwarnings("ignore", "All-NaN slice encountered") + res = np.nanmedian(x[mask]) + return res values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask) if not is_float_dtype(values.dtype): @@ -628,25 +700,50 @@ def get_median(x): # there's a non-empty array to apply over otherwise numpy raises if notempty: if not skipna: - return _wrap_results( - np.apply_along_axis(get_median, axis, values), dtype - ) + res = np.apply_along_axis(get_median, axis, values) - # fastpath for the skipna case - return _wrap_results(np.nanmedian(values, axis), dtype) + else: + # fastpath for the skipna case + with warnings.catch_warnings(): + # Suppress RuntimeWarning about All-NaN slice + warnings.filterwarnings("ignore", "All-NaN slice encountered") + res = np.nanmedian(values, axis) - # must return the correct shape, but median is not defined for the - # empty set so return nans of shape "everything but the passed axis" - # since "axis" is where the reduction would occur if we had a nonempty - # array - shp = np.array(values.shape) - dims = np.arange(values.ndim) - ret = np.empty(shp[dims != axis]) - ret.fill(np.nan) - return _wrap_results(ret, dtype) + else: + # must return the correct shape, but median is not defined for the + # empty set so return nans of shape "everything but the passed axis" + # since "axis" is where the reduction would occur if we had a nonempty + # array + res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan) - # otherwise return a scalar value - return _wrap_results(get_median(values) if notempty else np.nan, dtype) + else: + # otherwise return a scalar value + res = get_median(values) if notempty else np.nan + return _wrap_results(res, dtype) + + +def get_empty_reduction_result( + shape: Tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any +) -> np.ndarray: + """ + The result from a reduction on an empty ndarray. 
+ + Parameters + ---------- + shape : Tuple[int] + axis : int + dtype : np.dtype + fill_value : Any + + Returns + ------- + np.ndarray + """ + shp = np.array(shape) + dims = np.arange(len(shape)) + ret = np.empty(shp[dims != axis], dtype=dtype) + ret.fill(fill_value) + return ret def _get_counts_nanvar( @@ -678,7 +775,7 @@ def _get_counts_nanvar( count : scalar or array d : scalar or array """ - dtype = _get_dtype(dtype) + dtype = get_dtype(dtype) count = _get_counts(value_counts, mask, axis, dtype=dtype) d = count - dtype.type(ddof) @@ -695,9 +792,8 @@ def _get_counts_nanvar( return count, d -@disallow("M8") @bottleneck_switch(ddof=1) -def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): +def nanstd(values, *, axis=None, skipna=True, ddof=1, mask=None): """ Compute the standard deviation along given axis while ignoring NaNs @@ -725,6 +821,9 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanstd(s) 1.0 """ + if values.dtype == "M8[ns]": + values = values.view("m8[ns]") + orig_dtype = values.dtype values, mask, _, _, _ = _get_values(values, skipna, mask=mask) @@ -734,7 +833,7 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): @disallow("M8", "m8") @bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): +def nanvar(values, *, axis=None, skipna=True, ddof=1, mask=None): """ Compute the variance along given axis while ignoring NaNs @@ -798,12 +897,13 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): # precision as the original values array. if is_float_dtype(dtype): result = result.astype(dtype) - return _wrap_results(result, values.dtype) + return result @disallow("M8", "m8") def nansem( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, ddof: int = 1, @@ -838,22 +938,24 @@ def nansem( """ # This checks if non-numeric-like data is passed with numeric_only=False # and raises a TypeError otherwise - nanvar(values, axis, skipna, ddof=ddof, mask=mask) + nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) - var = nanvar(values, axis, skipna, ddof=ddof) + var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof) return np.sqrt(var) / np.sqrt(count) def _nanminmax(meth, fill_value_typ): @bottleneck_switch(name="nan" + meth) + @_datetimelike_compat def reduction( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -872,8 +974,8 @@ def reduction( else: result = getattr(values, meth)(axis) - result = _wrap_results(result, dtype, fill_value) - return _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out(result, axis, mask, values.shape) + return result return reduction @@ -885,6 +987,7 @@ def reduction( @disallow("O") def nanargmax( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -929,6 +1032,7 @@ def nanargmax( @disallow("O") def nanargmin( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -973,6 +1077,7 @@ def nanargmin( @disallow("M8", "m8") def nanskew( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -1057,6 +1162,7 @@ def nanskew( @disallow("M8", "m8") def nankurt( values: np.ndarray, + *, axis: Optional[int] = 
None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -1150,6 +1256,7 @@ def nankurt( @disallow("M8", "m8") def nanprod( values: np.ndarray, + *, axis: Optional[int] = None, skipna: bool = True, min_count: int = 0, @@ -1234,7 +1341,7 @@ def _get_counts( ------- count : scalar or array """ - dtype = _get_dtype(dtype) + dtype = get_dtype(dtype) if axis is None: if mask is not None: n = mask.size - mask.sum() @@ -1329,7 +1436,7 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") def nancorr( - a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, + a: np.ndarray, b: np.ndarray, *, method="pearson", min_periods: Optional[int] = None ): """ a, b: ndarrays @@ -1386,6 +1493,7 @@ def func(a, b): def nancov( a: np.ndarray, b: np.ndarray, + *, min_periods: Optional[int] = None, ddof: Optional[int] = 1, ): @@ -1501,6 +1609,7 @@ def _nanpercentile_1d( def nanpercentile( values: np.ndarray, q, + *, axis: int, na_value, mask: np.ndarray, @@ -1529,10 +1638,16 @@ def nanpercentile( if values.dtype.kind in ["m", "M"]: # need to cast to integer to avoid rounding errors in numpy result = nanpercentile( - values.view("i8"), q, axis, na_value.view("i8"), mask, ndim, interpolation + values.view("i8"), + q=q, + axis=axis, + na_value=na_value.view("i8"), + mask=mask, + ndim=ndim, + interpolation=interpolation, ) - # Note: we have to do do `astype` and not view because in general we + # Note: we have to do `astype` and not view because in general we # have float result at this point, not i8 return result.astype(values.dtype) @@ -1558,7 +1673,7 @@ def nanpercentile( return np.percentile(values, q, axis=axis, interpolation=interpolation) -def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: +def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: """ Cumulative function with skipna support. @@ -1616,7 +1731,9 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: result = result.view(orig_dtype) else: # DatetimeArray - result = type(values)._from_sequence(result, dtype=orig_dtype) + result = type(values)._simple_new( # type: ignore[attr-defined] + result, dtype=orig_dtype + ) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 5dd94a8af74ac..d8b5dba424cbf 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -4,40 +4,39 @@ This is not a public API. 
""" import operator -from typing import TYPE_CHECKING, Optional, Set, Type +from typing import TYPE_CHECKING, Optional, Set +import warnings import numpy as np -from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas._typing import Level from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_list_like -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.common import is_array_like, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.dtypes.missing import isna -from pandas.core.construction import extract_array -from pandas.core.ops.array_ops import ( +from pandas.core import algorithms +from pandas.core.ops.array_ops import ( # noqa:F401 arithmetic_op, + comp_method_OBJECT_ARRAY, comparison_op, get_array_op, logical_op, ) -from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 -from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.ops.common import ( # noqa:F401 + get_op_result_name, + unpack_zerodim_and_defer, +) from pandas.core.ops.docstrings import ( - _arith_doc_FRAME, _flex_comp_doc_FRAME, - _make_flex_doc, _op_descriptions, + make_flex_doc, ) from pandas.core.ops.invalid import invalid_comparison # noqa:F401 from pandas.core.ops.mask_ops import kleene_and, kleene_or, kleene_xor # noqa: F401 -from pandas.core.ops.methods import ( # noqa:F401 - add_flex_arithmetic_methods, - add_special_arithmetic_methods, -) +from pandas.core.ops.methods import add_flex_arithmetic_methods # noqa:F401 from pandas.core.ops.roperator import ( # noqa:F401 radd, rand_, @@ -54,7 +53,7 @@ ) if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa:F401 + from pandas import DataFrame, Series # ----------------------------------------------------------------------------- # constants @@ -80,115 +79,6 @@ COMPARISON_BINOPS: Set[str] = {"eq", "ne", "lt", "gt", "le", "ge"} -# ----------------------------------------------------------------------------- -# Ops Wrapping Utilities - - -def get_op_result_name(left, right): - """ - Find the appropriate name to pin to an operation result. This result - should always be either an Index or a Series. - - Parameters - ---------- - left : {Series, Index} - right : object - - Returns - ------- - name : object - Usually a string - """ - # `left` is always a Series when called from within ops - if isinstance(right, (ABCSeries, ABCIndexClass)): - name = _maybe_match_name(left, right) - else: - name = left.name - return name - - -def _maybe_match_name(a, b): - """ - Try to find a name to attach to the result of an operation between - a and b. If only one of these has a `name` attribute, return that - name. Otherwise return a consensus name if they match of None if - they have different names. - - Parameters - ---------- - a : object - b : object - - Returns - ------- - name : str or None - - See Also - -------- - pandas.core.common.consensus_name_attr - """ - a_has = hasattr(a, "name") - b_has = hasattr(b, "name") - if a_has and b_has: - if a.name == b.name: - return a.name - else: - # TODO: what if they both have np.nan for their names? 
- return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - - -# ----------------------------------------------------------------------------- - - -def _get_frame_op_default_axis(name: str) -> Optional[str]: - """ - Only DataFrame cares about default_axis, specifically: - special methods have default_axis=None and flex methods - have default_axis='columns'. - - Parameters - ---------- - name : str - - Returns - ------- - default_axis: str or None - """ - if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]: - # bool methods - return "columns" - elif name.startswith("__"): - # __add__, __mul__, ... - return None - else: - # add, mul, ... - return "columns" - - -def _get_op_name(op, special: bool) -> str: - """ - Find the name to attach to this method according to conventions - for special and non-special methods. - - Parameters - ---------- - op : binary operator - special : bool - - Returns - ------- - op_name : str - """ - opname = op.__name__.strip("_") - if special: - opname = f"__{opname}__" - return opname - # ----------------------------------------------------------------------------- # Masking NA values and fallbacks for operations numpy does not support @@ -235,77 +125,13 @@ def fill_binop(left, right, fill_value): return left, right -# ----------------------------------------------------------------------------- -# Dispatch logic - - -def dispatch_to_series(left, right, func, axis: Optional[int] = None): - """ - Evaluate the frame operation func(left, right) by evaluating - column-by-column, dispatching to the Series implementation. - - Parameters - ---------- - left : DataFrame - right : scalar, Series, or DataFrame - func : arithmetic or comparison operator - axis : {None, 0, 1} - - Returns - ------- - DataFrame - """ - # Get the appropriate array-op to apply to each column/block's values. - array_op = get_array_op(func) - - right = lib.item_from_zerodim(right) - if not is_list_like(right): - # i.e. 
scalar, faster than checking np.ndim(right) == 0 - bm = left._mgr.apply(array_op, right=right) - return type(left)(bm) - - elif isinstance(right, ABCDataFrame): - assert left.index.equals(right.index) - assert left.columns.equals(right.columns) - # TODO: The previous assertion `assert right._indexed_same(left)` - # fails in cases with empty columns reached via - # _frame_arith_method_with_reindex - - bm = left._mgr.operate_blockwise(right._mgr, array_op) - return type(left)(bm) - - elif isinstance(right, ABCSeries) and axis == 1: - # axis=1 means we want to operate row-by-row - assert right.index.equals(left.columns) - - right = right._values - # maybe_align_as_frame ensures we do not have an ndarray here - assert not isinstance(right, np.ndarray) - - arrays = [array_op(l, r) for l, r in zip(left._iter_column_arrays(), right)] - - elif isinstance(right, ABCSeries): - assert right.index.equals(left.index) # Handle other cases later - right = right._values - - arrays = [array_op(l, right) for l in left._iter_column_arrays()] - - else: - # Remaining cases have less-obvious dispatch rules - raise NotImplementedError(right) - - return type(left)._from_arrays( - arrays, left.columns, left.index, verify_integrity=False - ) - - # ----------------------------------------------------------------------------- # Series -def _align_method_SERIES(left: "Series", right, align_asobject: bool = False): +def align_method_SERIES(left: "Series", right, align_asobject: bool = False): """ align lhs and rhs Series """ - # ToDo: Different from _align_method_FRAME, list, tuple and ndarray + # ToDo: Different from align_method_FRAME, list, tuple and ndarray # are not coerced here # because Series has inconsistencies described in #13637 @@ -323,84 +149,9 @@ def _align_method_SERIES(left: "Series", right, align_asobject: bool = False): return left, right -def _arith_method_SERIES(cls, op, special): - """ - Wrapper function for Series arithmetic operations, to avoid - code duplication. - """ - assert special # non-special uses _flex_method_SERIES - op_name = _get_op_name(op, special) - - @unpack_zerodim_and_defer(op_name) - def wrapper(left, right): - - left, right = _align_method_SERIES(left, right) - res_name = get_op_result_name(left, right) - - lvalues = extract_array(left, extract_numpy=True) - rvalues = extract_array(right, extract_numpy=True) - result = arithmetic_op(lvalues, rvalues, op) - - return left._construct_result(result, name=res_name) - - wrapper.__name__ = op_name - return wrapper - - -def _comp_method_SERIES(cls, op, special): - """ - Wrapper function for Series arithmetic operations, to avoid - code duplication. - """ - assert special # non-special uses _flex_method_SERIES - op_name = _get_op_name(op, special) - - @unpack_zerodim_and_defer(op_name) - def wrapper(self, other): - - res_name = get_op_result_name(self, other) - - if isinstance(other, ABCSeries) and not self._indexed_same(other): - raise ValueError("Can only compare identically-labeled Series objects") - - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - - res_values = comparison_op(lvalues, rvalues, op) - - return self._construct_result(res_values, name=res_name) - - wrapper.__name__ = op_name - return wrapper - - -def _bool_method_SERIES(cls, op, special): - """ - Wrapper function for Series arithmetic operations, to avoid - code duplication. 
- """ - assert special # non-special uses _flex_method_SERIES - op_name = _get_op_name(op, special) - - @unpack_zerodim_and_defer(op_name) - def wrapper(self, other): - self, other = _align_method_SERIES(self, other, align_asobject=True) - res_name = get_op_result_name(self, other) - - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - - res_values = logical_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) - - wrapper.__name__ = op_name - return wrapper - - -def _flex_method_SERIES(cls, op, special): - assert not special # "special" also means "not flex" - name = _get_op_name(op, special) - doc = _make_flex_doc(name, "series") +def flex_method_SERIES(op): + name = op.__name__.strip("_") + doc = make_flex_doc(name, "series") @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -408,13 +159,17 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): if axis is not None: self._get_axis_number(axis) + res_name = get_op_result_name(self, other) + if isinstance(other, ABCSeries): return self._binop(other, op, level=level, fill_value=fill_value) elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError("Lengths must be equal") other = self._constructor(other, self.index) - return self._binop(other, op, level=level, fill_value=fill_value) + result = self._binop(other, op, level=level, fill_value=fill_value) + result.name = res_name + return result else: if fill_value is not None: self = self.fillna(fill_value) @@ -429,7 +184,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _align_method_FRAME( +def align_method_FRAME( left, right, axis, flex: Optional[bool] = False, level: Level = None ): """ @@ -498,6 +253,11 @@ def to_series(right): ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): + # GH 36702. Raise when attempting arithmetic with list of array-like. + if any(is_array_like(el) for el in right): + raise ValueError( + f"Unable to coerce list of {type(right[0])} to Series/DataFrame" + ) # GH17901 right = to_series(right) @@ -512,6 +272,18 @@ def to_series(right): elif isinstance(right, ABCSeries): # axis=1 is default for DataFrame-with-Series op axis = left._get_axis_number(axis) if axis is not None else 1 + + if not flex: + if not left.axes[axis].equals(right.index): + warnings.warn( + "Automatic reindexing on DataFrame vs Series comparisons " + "is deprecated and will raise ValueError in a future version. " + "Do `left, right = left.align(right, axis=1, copy=False)` " + "before e.g. `left == right`", + FutureWarning, + stacklevel=5, + ) + left, right = left.align( right, join="outer", axis=axis, level=level, copy=False ) @@ -520,7 +292,7 @@ def to_series(right): return left, right -def _should_reindex_frame_op( +def should_reindex_frame_op( left: "DataFrame", right, op, axis, default_axis, fill_value, level ) -> bool: """ @@ -538,13 +310,18 @@ def _should_reindex_frame_op( if fill_value is None and level is None and axis is default_axis: # TODO: any other cases we should handle here? 
cols = left.columns.intersection(right.columns) - if not (cols.equals(left.columns) and cols.equals(right.columns)): + + # Intersection is always unique so we have to check the unique columns + left_uniques = left.columns.unique() + right_uniques = right.columns.unique() + if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)): + # TODO: is there a shortcut available when len(cols) == 0? return True return False -def _frame_arith_method_with_reindex( +def frame_arith_method_with_reindex( left: "DataFrame", right: "DataFrame", op ) -> "DataFrame": """ @@ -562,18 +339,32 @@ def _frame_arith_method_with_reindex( DataFrame """ # GH#31623, only operate on shared columns - cols = left.columns.intersection(right.columns) + cols, lcols, rcols = left.columns.join( + right.columns, how="inner", level=None, return_indexers=True + ) - new_left = left[cols] - new_right = right[cols] + new_left = left.iloc[:, lcols] + new_right = right.iloc[:, rcols] result = op(new_left, new_right) - # Do the join on the columns instead of using _align_method_FRAME + # Do the join on the columns instead of using align_method_FRAME # to avoid constructing two potentially large/sparse DataFrames join_columns, _, _ = left.columns.join( right.columns, how="outer", level=None, return_indexers=True ) - return result.reindex(join_columns, axis=1) + + if result.columns.has_duplicates: + # Avoid reindexing with a duplicate axis. + # https://github.com/pandas-dev/pandas/issues/35194 + indexer, _ = result.columns.get_indexer_non_unique(join_columns) + indexer = algorithms.unique1d(indexer) + result = result._reindex_with_indexers( + {1: [join_columns, indexer]}, allow_dups=True + ) + else: + result = result.reindex(join_columns, axis=1) + + return result def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int): @@ -599,26 +390,20 @@ def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int return type(frame)(rvalues, index=frame.index, columns=frame.columns) -def _arith_method_FRAME(cls: Type["DataFrame"], op, special: bool): - # This is the only function where `special` can be either True or False - op_name = _get_op_name(op, special) - default_axis = _get_frame_op_default_axis(op_name) +def flex_arith_method_FRAME(op): + op_name = op.__name__.strip("_") + default_axis = "columns" na_op = get_array_op(op) - - if op_name in _op_descriptions: - # i.e. include "add" but not "__add__" - doc = _make_flex_doc(op_name, "dataframe") - else: - doc = _arith_doc_FRAME % op_name + doc = make_flex_doc(op_name, "dataframe") @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): - if _should_reindex_frame_op( + if should_reindex_frame_op( self, other, op, axis, default_axis, fill_value, level ): - return _frame_arith_method_with_reindex(self, other, op) + return frame_arith_method_with_reindex(self, other, op) if isinstance(other, ABCSeries) and fill_value is not None: # TODO: We could allow this in cases where we end up going @@ -627,22 +412,20 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): axis = self._get_axis_number(axis) if axis is not None else 1 - # TODO: why are we passing flex=True instead of flex=not special? 
- # 15 tests fail if we pass flex=not special instead - self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) + self, other = align_method_FRAME(self, other, axis, flex=True, level=level) if isinstance(other, ABCDataFrame): # Another DataFrame new_data = self._combine_frame(other, na_op, fill_value) elif isinstance(other, ABCSeries): - new_data = dispatch_to_series(self, other, op, axis=axis) + new_data = self._dispatch_frame_op(other, op, axis=axis) else: # in this case we always have `np.ndim(other) == 0` if fill_value is not None: self = self.fillna(fill_value) - new_data = dispatch_to_series(self, other, op) + new_data = self._dispatch_frame_op(other, op) return self._construct_result(new_data) @@ -651,11 +434,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): return f -def _flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): - assert not special # "special" also means "not flex" - op_name = _get_op_name(op, special) - default_axis = _get_frame_op_default_axis(op_name) - assert default_axis == "columns", default_axis # because we are not "special" +def flex_comp_method_FRAME(op): + op_name = op.__name__.strip("_") + default_axis = "columns" # because we are "flex" doc = _flex_comp_doc_FRAME.format( op_name=op_name, desc=_op_descriptions[op_name]["desc"] @@ -665,28 +446,9 @@ def _flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): def f(self, other, axis=default_axis, level=None): axis = self._get_axis_number(axis) if axis is not None else 1 - self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) - - new_data = dispatch_to_series(self, other, op, axis=axis) - return self._construct_result(new_data) - - f.__name__ = op_name - - return f - - -def _comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): - assert special # "special" also means "not flex" - op_name = _get_op_name(op, special) - - @Appender(f"Wrapper for comparison method {op_name}") - def f(self, other): - axis = 1 # only relevant for Series other case - - self, other = _align_method_FRAME(self, other, axis, level=None, flex=False) + self, other = align_method_FRAME(self, other, axis, flex=True, level=level) - # See GH#4537 for discussion of scalar op behavior - new_data = dispatch_to_series(self, other, op, axis=axis) + new_data = self._dispatch_frame_op(other, op, axis=axis) return self._construct_result(new_data) f.__name__ = op_name diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 3379ee56b6ad0..41d539564d91e 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -5,13 +5,13 @@ from datetime import timedelta from functools import partial import operator -from typing import Any, Tuple +from typing import Any import warnings import numpy as np from pandas._libs import Timedelta, Timestamp, lib, ops as libops -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Shape from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -23,12 +23,14 @@ is_bool_dtype, is_integer_dtype, is_list_like, + is_numeric_v_string_like, is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna +from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch 
from pandas.core.ops.invalid import invalid_comparison @@ -39,13 +41,11 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) - if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): - # Note: these checks can be for ABCIndex and not ABCIndexClass - # because that is the only object-dtype class. + if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)): if not is_object_dtype(y.dtype): y = y.astype(np.object_) - if isinstance(y, (ABCSeries, ABCIndex)): + if isinstance(y, (ABCSeries, ABCIndexClass)): y = y._values if x.shape != y.shape: @@ -56,7 +56,7 @@ def comp_method_OBJECT_ARRAY(op, x, y): return result.reshape(x.shape) -def masked_arith_op(x: np.ndarray, y, op): +def _masked_arith_op(x: np.ndarray, y, op): """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). @@ -115,7 +115,7 @@ def masked_arith_op(x: np.ndarray, y, op): return result -def na_arithmetic_op(left, right, op, is_cmp: bool = False): +def _na_arithmetic_op(left, right, op, is_cmp: bool = False): """ Return the result of evaluating op on the passed in values. @@ -146,7 +146,7 @@ def na_arithmetic_op(left, right, op, is_cmp: bool = False): # In this case we do not fall back to the masked op, as that # will handle complex numbers incorrectly, see GH#32047 raise - result = masked_arith_op(left, right, op) + result = _masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise @@ -176,9 +176,9 @@ def arithmetic_op(left: ArrayLike, right: Any, op): # NB: We assume that extract_array has already been called # on `left` and `right`. - lvalues = maybe_upcast_datetimelike_array(left) - rvalues = maybe_upcast_datetimelike_array(right) - rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) + lvalues = ensure_wrapped_if_datetimelike(left) + rvalues = ensure_wrapped_if_datetimelike(right) + rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape) if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): # Timedelta is included because numexpr will fail on it, see GH#31457 @@ -186,7 +186,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op): else: with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op) + res_values = _na_arithmetic_op(lvalues, rvalues, op) return res_values @@ -207,7 +207,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right - lvalues = maybe_upcast_datetimelike_array(left) + lvalues = ensure_wrapped_if_datetimelike(left) rvalues = right rvalues = lib.item_from_zerodim(rvalues) @@ -235,6 +235,10 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: else: res_values = np.zeros(lvalues.shape, dtype=bool) + elif is_numeric_v_string_like(lvalues, rvalues): + # GH#36377 going through the numexpr path would incorrectly raise + return invalid_comparison(lvalues, rvalues, op) + elif is_object_dtype(lvalues.dtype): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) @@ -243,7 +247,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # suppress warnings from numpy about element-wise comparison warnings.simplefilter("ignore", DeprecationWarning) with np.errstate(all="ignore"): - res_values = na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) + res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) 
return res_values

@@ -328,7 +332,7 @@ def fill_bool(x, left=None):
             right = construct_1d_object_array_from_listlike(right)

     # NB: We assume extract_array has already been called on left and right
-    lvalues = maybe_upcast_datetimelike_array(left)
+    lvalues = ensure_wrapped_if_datetimelike(left)
     rvalues = right

     if should_extension_dispatch(lvalues, rvalues):
@@ -349,7 +353,8 @@ def fill_bool(x, left=None):
         filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool

         res_values = na_logical_op(lvalues, rvalues, op)
-        res_values = filler(res_values)  # type: ignore
+        # error: Cannot call function of unknown type
+        res_values = filler(res_values)  # type: ignore[operator]

     return res_values

@@ -396,32 +401,7 @@ def get_array_op(op):
         raise NotImplementedError(op_name)


-def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike:
-    """
-    If we have an ndarray that is either datetime64 or timedelta64, wrap in EA.
-
-    Parameters
-    ----------
-    obj : ndarray or ExtensionArray
-
-    Returns
-    -------
-    ndarray or ExtensionArray
-    """
-    if isinstance(obj, np.ndarray):
-        if obj.dtype.kind == "m":
-            from pandas.core.arrays import TimedeltaArray
-
-            return TimedeltaArray._from_sequence(obj)
-        if obj.dtype.kind == "M":
-            from pandas.core.arrays import DatetimeArray
-
-            return DatetimeArray._from_sequence(obj)
-
-    return obj
-
-
-def maybe_upcast_for_op(obj, shape: Tuple[int, ...]):
+def _maybe_upcast_for_op(obj, shape: Shape):
     """
     Cast non-pandas objects to pandas types to unify behavior of arithmetic
     and comparison operations.
diff --git a/pandas/core/ops/common.py b/pandas/core/ops/common.py
index 515a0a5198d74..a6bcab44e5519 100644
--- a/pandas/core/ops/common.py
+++ b/pandas/core/ops/common.py
@@ -65,3 +65,60 @@ def new_method(self, other):
         return method(self, other)

     return new_method
+
+
+def get_op_result_name(left, right):
+    """
+    Find the appropriate name to pin to an operation result. This result
+    should always be either an Index or a Series.
+
+    Parameters
+    ----------
+    left : {Series, Index}
+    right : object
+
+    Returns
+    -------
+    name : object
+        Usually a string
+    """
+    if isinstance(right, (ABCSeries, ABCIndexClass)):
+        name = _maybe_match_name(left, right)
+    else:
+        name = left.name
+    return name
+
+
+def _maybe_match_name(a, b):
+    """
+    Try to find a name to attach to the result of an operation between
+    a and b. If only one of these has a `name` attribute, return that
+    name. Otherwise return a consensus name if they match or None if
+    they have different names.
+
+    Parameters
+    ----------
+    a : object
+    b : object
+
+    Returns
+    -------
+    name : str or None
+
+    See Also
+    --------
+    pandas.core.common.consensus_name_attr
+    """
+    a_has = hasattr(a, "name")
+    b_has = hasattr(b, "name")
+    if a_has and b_has:
+        if a.name == b.name:
+            return a.name
+        else:
+            # TODO: what if they both have np.nan for their names?
+ return None + elif a_has: + return a.name + elif b_has: + return b.name + return None diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 4ace873f029ae..06ed321327e06 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -4,7 +4,7 @@ from typing import Dict, Optional -def _make_flex_doc(op_name, typ): +def make_flex_doc(op_name: str, typ: str) -> str: """ Make the appropriate substitutions for the given operation and class-typ into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring @@ -22,16 +22,20 @@ def _make_flex_doc(op_name, typ): op_name = op_name.replace("__", "") op_desc = _op_descriptions[op_name] + op_desc_op = op_desc["op"] + assert op_desc_op is not None # for mypy if op_name.startswith("r"): - equiv = "other " + op_desc["op"] + " " + typ + equiv = "other " + op_desc_op + " " + typ + elif op_name == "divmod": + equiv = f"{op_name}({typ}, other)" else: - equiv = typ + " " + op_desc["op"] + " other" + equiv = typ + " " + op_desc_op + " other" if typ == "series": base_doc = _flex_doc_SERIES if op_desc["reverse"]: base_doc += _see_also_reverse_SERIES.format( - reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"], + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"] ) doc_no_examples = base_doc.format( desc=op_desc["desc"], @@ -39,8 +43,9 @@ def _make_flex_doc(op_name, typ): equiv=equiv, series_returns=op_desc["series_returns"], ) - if op_desc["series_examples"]: - doc = doc_no_examples + op_desc["series_examples"] + ser_example = op_desc["series_examples"] + if ser_example: + doc = doc_no_examples + ser_example else: doc = doc_no_examples elif typ == "dataframe": @@ -159,6 +164,25 @@ def _make_flex_doc(op_name, typ): """ ) +_divmod_example_SERIES = ( + _common_examples_algebra_SERIES + + """ +>>> a.divmod(b, fill_value=0) +(a 1.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64, + a 0.0 + b NaN + c NaN + d 0.0 + e NaN + dtype: float64) +""" +) + _mod_example_SERIES = ( _common_examples_algebra_SERIES + """ @@ -329,7 +353,7 @@ def _make_flex_doc(op_name, typ): "op": "divmod", "desc": "Integer division and modulo", "reverse": "rdivmod", - "series_examples": None, + "series_examples": _divmod_example_SERIES, "series_returns": _returns_tuple, "df_examples": None, }, @@ -424,33 +448,6 @@ def _make_flex_doc(op_name, typ): Series.{reverse} : {see_also_desc}. """ -_arith_doc_FRAME = """ -Binary operator %s with support to substitute a fill_value for missing data in -one of the inputs - -Parameters ----------- -other : Series, DataFrame, or constant -axis : {0, 1, 'index', 'columns'} - For Series input, axis to match Series index on -fill_value : None or float value, default None - Fill existing missing (NaN) values, and any new element needed for - successful DataFrame alignment, with this value before computation. - If data in both corresponding DataFrame locations is missing - the result will be missing -level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - -Returns -------- -result : DataFrame - -Notes ------ -Mismatched indices will be unioned together -""" - _flex_doc_FRAME = """ Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). @@ -611,7 +608,7 @@ def _make_flex_doc(op_name, typ): Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. 
-Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis +Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. Parameters diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index a4694a6e5134f..4866905d32b83 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -7,16 +7,13 @@ from pandas.core.ops.roperator import ( radd, - rand_, rdivmod, rfloordiv, rmod, rmul, - ror_, rpow, rsub, rtruediv, - rxor, ) @@ -33,101 +30,23 @@ def _get_method_wrappers(cls): ------- arith_flex : function or None comp_flex : function or None - arith_special : function - comp_special : function - bool_special : function - - Notes - ----- - None is only returned for SparseArray """ # TODO: make these non-runtime imports once the relevant functions # are no longer in __init__ from pandas.core.ops import ( - _arith_method_FRAME, - _arith_method_SERIES, - _bool_method_SERIES, - _comp_method_FRAME, - _comp_method_SERIES, - _flex_comp_method_FRAME, - _flex_method_SERIES, + flex_arith_method_FRAME, + flex_comp_method_FRAME, + flex_method_SERIES, ) if issubclass(cls, ABCSeries): # Just Series - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SERIES - comp_special = _comp_method_SERIES - bool_special = _bool_method_SERIES + arith_flex = flex_method_SERIES + comp_flex = flex_method_SERIES elif issubclass(cls, ABCDataFrame): - arith_flex = _arith_method_FRAME - comp_flex = _flex_comp_method_FRAME - arith_special = _arith_method_FRAME - comp_special = _comp_method_FRAME - bool_special = _arith_method_FRAME - return arith_flex, comp_flex, arith_special, comp_special, bool_special - - -def add_special_arithmetic_methods(cls): - """ - Adds the full suite of special arithmetic methods (``__add__``, - ``__sub__``, etc.) to the class. 
- - Parameters - ---------- - cls : class - special methods will be defined and pinned to this class - """ - _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) - new_methods = _create_methods( - cls, arith_method, comp_method, bool_method, special=True - ) - # inplace operators (I feel like these should get passed an `inplace=True` - # or just be removed - - def _wrap_inplace_method(method): - """ - return an inplace wrapper for this method - """ - - def f(self, other): - result = method(self, other) - # Delete cacher - self._reset_cacher() - # this makes sure that we are aligned like the input - # we are updating inplace so we want to ignore is_copy - self._update_inplace( - result.reindex_like(self, copy=False), verify_is_copy=False - ) - - return self - - name = method.__name__.strip("__") - f.__name__ = f"__i{name}__" - return f - - new_methods.update( - dict( - __iadd__=_wrap_inplace_method(new_methods["__add__"]), - __isub__=_wrap_inplace_method(new_methods["__sub__"]), - __imul__=_wrap_inplace_method(new_methods["__mul__"]), - __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), - __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), - __imod__=_wrap_inplace_method(new_methods["__mod__"]), - __ipow__=_wrap_inplace_method(new_methods["__pow__"]), - ) - ) - - new_methods.update( - dict( - __iand__=_wrap_inplace_method(new_methods["__and__"]), - __ior__=_wrap_inplace_method(new_methods["__or__"]), - __ixor__=_wrap_inplace_method(new_methods["__xor__"]), - ) - ) - - _add_methods(cls, new_methods=new_methods) + arith_flex = flex_arith_method_FRAME + comp_flex = flex_comp_method_FRAME + return arith_flex, comp_flex def add_flex_arithmetic_methods(cls): @@ -140,16 +59,14 @@ def add_flex_arithmetic_methods(cls): cls : class flex methods will be defined and pinned to this class """ - flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) - new_methods = _create_methods( - cls, flex_arith_method, flex_comp_method, bool_method=None, special=False - ) + flex_arith_method, flex_comp_method = _get_method_wrappers(cls) + new_methods = _create_methods(cls, flex_arith_method, flex_comp_method) new_methods.update( - dict( - multiply=new_methods["mul"], - subtract=new_methods["sub"], - divide=new_methods["div"], - ) + { + "multiply": new_methods["mul"], + "subtract": new_methods["sub"], + "divide": new_methods["div"], + } ) # opt out of bool flex methods for now assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) @@ -157,68 +74,52 @@ def add_flex_arithmetic_methods(cls): _add_methods(cls, new_methods=new_methods) -def _create_methods(cls, arith_method, comp_method, bool_method, special): - # creates actual methods based upon arithmetic, comp and bool method +def _create_methods(cls, arith_method, comp_method): + # creates actual flex methods based upon arithmetic, and comp method # constructors. 
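The rewritten `_create_methods` above takes single-argument factories (`arith_method(operator.add)` instead of the old `(cls, op, special)` triples) and fills a plain dict of methods that is later pinned onto the class by `_add_methods`. A minimal standalone sketch of that factory pattern, using hypothetical `make_flex_method`/`Toy` names that are not part of pandas:

```python
import operator

def make_flex_method(op):
    # Toy stand-in for arith_method: wrap a binary operator so it can be
    # attached to a class under a flex name ("add", "truediv", ...).
    def flex(self, other):
        return op(self._value, other)
    flex.__name__ = op.__name__
    return flex

class Toy:
    def __init__(self, value):
        self._value = value

# Build the method table from factories, mirroring the dict-literal style
# the diff switches to.
methods = {
    "add": make_flex_method(operator.add),
    "truediv": make_flex_method(operator.truediv),
}
methods["div"] = methods["truediv"]  # alias, like div -> truediv above

for name, method in methods.items():
    setattr(Toy, name, method)

assert Toy(6).add(2) == 8
assert Toy(6).div(2) == 3.0
```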
have_divmod = issubclass(cls, ABCSeries) # divmod is available for Series - new_methods = dict( - add=arith_method(cls, operator.add, special), - radd=arith_method(cls, radd, special), - sub=arith_method(cls, operator.sub, special), - mul=arith_method(cls, operator.mul, special), - truediv=arith_method(cls, operator.truediv, special), - floordiv=arith_method(cls, operator.floordiv, special), - # Causes a floating point exception in the tests when numexpr enabled, - # so for now no speedup - mod=arith_method(cls, operator.mod, special), - pow=arith_method(cls, operator.pow, special), - # not entirely sure why this is necessary, but previously was included - # so it's here to maintain compatibility - rmul=arith_method(cls, rmul, special), - rsub=arith_method(cls, rsub, special), - rtruediv=arith_method(cls, rtruediv, special), - rfloordiv=arith_method(cls, rfloordiv, special), - rpow=arith_method(cls, rpow, special), - rmod=arith_method(cls, rmod, special), + new_methods = {} + + new_methods.update( + { + "add": arith_method(operator.add), + "radd": arith_method(radd), + "sub": arith_method(operator.sub), + "mul": arith_method(operator.mul), + "truediv": arith_method(operator.truediv), + "floordiv": arith_method(operator.floordiv), + "mod": arith_method(operator.mod), + "pow": arith_method(operator.pow), + "rmul": arith_method(rmul), + "rsub": arith_method(rsub), + "rtruediv": arith_method(rtruediv), + "rfloordiv": arith_method(rfloordiv), + "rpow": arith_method(rpow), + "rmod": arith_method(rmod), + } ) new_methods["div"] = new_methods["truediv"] new_methods["rdiv"] = new_methods["rtruediv"] if have_divmod: # divmod doesn't have an op that is supported by numexpr - new_methods["divmod"] = arith_method(cls, divmod, special) - new_methods["rdivmod"] = arith_method(cls, rdivmod, special) + new_methods["divmod"] = arith_method(divmod) + new_methods["rdivmod"] = arith_method(rdivmod) new_methods.update( - dict( - eq=comp_method(cls, operator.eq, special), - ne=comp_method(cls, operator.ne, special), - lt=comp_method(cls, operator.lt, special), - gt=comp_method(cls, operator.gt, special), - le=comp_method(cls, operator.le, special), - ge=comp_method(cls, operator.ge, special), - ) + { + "eq": comp_method(operator.eq), + "ne": comp_method(operator.ne), + "lt": comp_method(operator.lt), + "gt": comp_method(operator.gt), + "le": comp_method(operator.le), + "ge": comp_method(operator.ge), + } ) - if bool_method: - new_methods.update( - dict( - and_=bool_method(cls, operator.and_, special), - or_=bool_method(cls, operator.or_, special), - xor=bool_method(cls, operator.xor, special), - rand_=bool_method(cls, rand_, special), - ror_=bool_method(cls, ror_, special), - rxor=bool_method(cls, rxor, special), - ) - ) - - if special: - dunderize = lambda x: f"__{x.strip('_')}__" - else: - dunderize = lambda x: x - new_methods = {dunderize(k): v for k, v in new_methods.items()} + new_methods = {k.strip("_"): v for k, v in new_methods.items()} return new_methods diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bfdfc65723433..afd189ad16b5d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -21,12 +21,18 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.aggregation import aggregate import pandas.core.algorithms as algos -from pandas.core.base import DataError, ShallowMixin +from pandas.core.base import DataError from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.groupby.base import GroupByMixin +from 
pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby +from pandas.core.groupby.groupby import ( + BaseGroupBy, + GroupBy, + _pipe_template, + get_groupby, +) from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import Index @@ -37,10 +43,10 @@ from pandas.tseries.frequencies import is_subperiod, is_superperiod from pandas.tseries.offsets import DateOffset, Day, Nano, Tick -_shared_docs_kwargs: Dict[str, str] = dict() +_shared_docs_kwargs: Dict[str, str] = {} -class Resampler(_GroupBy, ShallowMixin): +class Resampler(BaseGroupBy, ShallowMixin): """ Class for resampling datetimelike data, a groupby-like operation. See aggregate, transform, and apply functions on this object. @@ -88,7 +94,10 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.as_index = True self.exclusions = set() self.binner = None - self.grouper = None + # pandas\core\resample.py:96: error: Incompatible types in assignment + # (expression has type "None", variable has type "BaseGrouper") + # [assignment] + self.grouper = None # type: ignore[assignment] if self.groupby is not None: self.groupby._set_grouper(self._convert_obj(obj), sort=True) @@ -125,7 +134,7 @@ def __iter__(self): See Also -------- - GroupBy.__iter__ + GroupBy.__iter__ : Generator yielding sequence for each group. """ self._set_binner() return super().__iter__() @@ -203,7 +212,6 @@ def _assure_grouper(self): @Substitution( klass="Resampler", - versionadded=".. versionadded:: 0.23.0", examples=""" >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, ... index=pd.date_range('2012-08-02', periods=4)) @@ -230,9 +238,12 @@ def pipe(self, func, *args, **kwargs): """ See Also -------- - DataFrame.groupby.aggregate - DataFrame.resample.transform - DataFrame.aggregate + DataFrame.groupby.aggregate : Aggregate using callable, string, dict, + or list of string/callables. + DataFrame.resample.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate: Aggregate using one or more + operations over the specified axis. 
""" ) @@ -278,14 +289,13 @@ def pipe(self, func, *args, **kwargs): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="DataFrame", axis="", ) def aggregate(self, func, *args, **kwargs): self._set_binner() - result, how = self._aggregate(func, *args, **kwargs) + result, how = aggregate(self, func, *args, **kwargs) if result is None: how = func grouper = None @@ -365,8 +375,9 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped._aggregate_item_by_item(how, *args, **kwargs) else: result = grouped.aggregate(how, *args, **kwargs) - except DataError: + except (DataError, AttributeError, KeyError): # we have a non-reducing function; try to evaluate + # alternatively we want to evaluate only a column of the input result = grouped.apply(how, *args, **kwargs) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -402,14 +413,21 @@ def _apply_loffset(self, result): result : Series or DataFrame the result of resample """ + # pandas\core\resample.py:409: error: Cannot determine type of + # 'loffset' [has-type] needs_offset = ( - isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64)) + isinstance( + self.loffset, # type: ignore[has-type] + (DateOffset, timedelta, np.timedelta64), + ) and isinstance(result.index, DatetimeIndex) and len(result.index) > 0 ) if needs_offset: - result.index = result.index + self.loffset + # pandas\core\resample.py:415: error: Cannot determine type of + # 'loffset' [has-type] + result.index = result.index + self.loffset # type: ignore[has-type] self.loffset = None return result @@ -449,8 +467,8 @@ def pad(self, limit=None): See Also -------- - Series.fillna - DataFrame.fillna + Series.fillna: Fill NA/NaN values using the specified method. + DataFrame.fillna: Fill NA/NaN values using the specified method. """ return self._upsample("pad", limit=limit) @@ -795,7 +813,7 @@ def interpolate( """ Interpolate values according to different methods. """ - result = self._upsample(None) + result = self._upsample("asfreq") return result.interpolate( method=method, axis=axis, @@ -824,8 +842,8 @@ def asfreq(self, fill_value=None): See Also -------- - Series.asfreq - DataFrame.asfreq + Series.asfreq: Convert TimeSeries to specified frequency. + DataFrame.asfreq: Convert TimeSeries to specified frequency. """ return self._upsample("asfreq", fill_value=fill_value) @@ -844,7 +862,9 @@ def std(self, ddof=1, *args, **kwargs): Standard deviation of values within each group. """ nv.validate_resampler_func("std", args, kwargs) - return self._downsample("std", ddof=ddof) + # pandas\core\resample.py:850: error: Unexpected keyword argument + # "ddof" for "_downsample" [call-arg] + return self._downsample("std", ddof=ddof) # type: ignore[call-arg] def var(self, ddof=1, *args, **kwargs): """ @@ -861,7 +881,9 @@ def var(self, ddof=1, *args, **kwargs): Variance of values within each group. """ nv.validate_resampler_func("var", args, kwargs) - return self._downsample("var", ddof=ddof) + # pandas\core\resample.py:867: error: Unexpected keyword argument + # "ddof" for "_downsample" [call-arg] + return self._downsample("var", ddof=ddof) # type: ignore[call-arg] @doc(GroupBy.size) def size(self): @@ -911,14 +933,24 @@ def quantile(self, q=0.5, **kwargs): See Also -------- Series.quantile + Return a series, where the index is q and the values are the quantiles. DataFrame.quantile + Return a DataFrame, where the columns are the columns of self, + and the values are the quantiles. 
DataFrameGroupBy.quantile
+ Return a DataFrame, where the columns are groupby columns,
+ and the values are its quantiles.
"""
- return self._downsample("quantile", q=q, **kwargs)
+ # pandas\core\resample.py:920: error: Unexpected keyword argument "q"
+ # for "_downsample" [call-arg]
+
+ # pandas\core\resample.py:920: error: Too many arguments for
+ # "_downsample" [call-arg]
+ return self._downsample("quantile", q=q, **kwargs) # type: ignore[call-arg]
# downsample methods
-for method in ["sum", "prod"]:
+for method in ["sum", "prod", "min", "max", "first", "last"]:
def f(self, _method=method, min_count=0, *args, **kwargs):
nv.validate_resampler_func(_method, args, kwargs)
@@ -929,7 +961,7 @@ def f(self, _method=method, min_count=0, *args, **kwargs):
# downsample methods
-for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]:
+for method in ["mean", "sem", "median", "ohlc"]:
def g(self, _method=method, *args, **kwargs):
nv.validate_resampler_func(_method, args, kwargs)
@@ -949,7 +981,7 @@ def h(self, _method=method):
setattr(Resampler, method, h)
-class _GroupByMixin(GroupByMixin):
+class _GroupByMixin(GotItemMixin):
"""
Provide the groupby facilities.
"""
@@ -966,8 +998,9 @@ def __init__(self, obj, *args, **kwargs):
for attr in self._attributes:
setattr(self, attr, kwargs.get(attr, getattr(parent, attr)))
- # error: Too many arguments for "__init__" of "object"
- super().__init__(None) # type: ignore
+ # pandas\core\resample.py:972: error: Too many arguments for "__init__"
+ # of "object" [call-arg]
+ super().__init__(None) # type: ignore[call-arg]
self._groupby = groupby
self._groupby.mutated = True
self._groupby.grouper.mutated = True
@@ -1032,7 +1065,12 @@ def _downsample(self, how, **kwargs):
# do we have a regular frequency
if ax.freq is not None or ax.inferred_freq is not None:
- if len(self.grouper.binlabels) > len(ax) and how is None:
+ # pandas\core\resample.py:1037: error: "BaseGrouper" has no
+ # attribute "binlabels" [attr-defined]
+ if (
+ len(self.grouper.binlabels) > len(ax) # type: ignore[attr-defined]
+ and how is None
+ ):
# let's do an asfreq
return self.asfreq()
@@ -1069,7 +1107,7 @@ def _upsample(self, method, limit=None, fill_value=None):
See Also
--------
- .fillna
+ .fillna: Fill NA/NaN values using the specified method.
"""
self._set_binner()
@@ -1088,7 +1126,11 @@ def _upsample(self, method, limit=None, fill_value=None):
res_index = self._adjust_binner_for_upsample(binner)
# if we have the same frequency as our axis, then we are equal sampling
- if limit is None and to_offset(ax.inferred_freq) == self.freq:
+ if (
+ limit is None
+ and to_offset(ax.inferred_freq) == self.freq
+ and len(obj) == len(res_index)
+ ):
result = obj.copy()
result.index = res_index
else:
@@ -1201,7 +1243,7 @@ def _upsample(self, method, limit=None, fill_value=None):
See Also
--------
- .fillna
+ .fillna: Fill NA/NaN values using the specified method.
""" # we may need to actually resample as if we are timestamps diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9e8fb643791f2..4a2629daf63d7 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,15 +3,27 @@ """ from collections import abc -from typing import TYPE_CHECKING, Iterable, List, Mapping, Union, overload +from typing import ( + TYPE_CHECKING, + Iterable, + List, + Mapping, + Optional, + Type, + Union, + cast, + overload, +) import numpy as np -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label +from pandas._typing import FrameOrSeriesUnion, Label from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna +import pandas.core.algorithms as algos from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, @@ -22,14 +34,15 @@ MultiIndex, all_indexes_same, ensure_index, - get_consensus_names, get_objs_combined_axis, + get_unanimous_names, ) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers if TYPE_CHECKING: - from pandas import DataFrame + from pandas import DataFrame, Series + from pandas.core.generic import NDFrame # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -53,7 +66,7 @@ def concat( @overload def concat( - objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], + objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -68,7 +81,7 @@ def concat( def concat( - objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], + objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], axis=0, join="outer", ignore_index: bool = False, @@ -121,7 +134,6 @@ def concat( This has no effect when ``join='inner'``, which already preserves the order of the non-concatenation axis. - .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed to not sort by default. 
@@ -294,7 +306,7 @@ class _Concatenator:
def __init__(
self,
- objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]],
+ objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]],
axis=0,
join: str = "outer",
keys=None,
@@ -359,13 +371,13 @@ def __init__(
raise TypeError(msg)
# consolidate
- obj._consolidate(inplace=True)
+ obj._consolidate_inplace()
ndims.add(obj.ndim)
# get the sample
# want the highest ndim that we have, and must be non-empty
# unless all objs are empty
- sample = None
+ sample: Optional["NDFrame"] = None
if len(ndims) > 1:
max_ndim = max(ndims)
for obj in objs:
@@ -435,6 +447,8 @@ def __init__(
# to line up
if self._is_frame and axis == 1:
name = 0
+ # mypy needs to know sample is not an NDFrame
+ sample = cast("FrameOrSeriesUnion", sample)
obj = sample._constructor({name: obj})
self.objs.append(obj)
@@ -454,14 +468,17 @@ def __init__(
self.new_axes = self._get_new_axes()
def get_result(self):
+ cons: Type[FrameOrSeriesUnion]
+ sample: FrameOrSeriesUnion
# series only
if self._is_series:
+ sample = cast("Series", self.objs[0])
# stack blocks
if self.bm_axis == 0:
name = com.consensus_name_attr(self.objs)
- cons = self.objs[0]._constructor
+ cons = sample._constructor
arrs = [ser._values for ser in self.objs]
@@ -474,7 +491,7 @@ def get_result(self):
data = dict(zip(range(len(self.objs)), self.objs))
# GH28330 Preserves subclassed objects through concat
- cons = self.objs[0]._constructor_expanddim
+ cons = sample._constructor_expanddim
index, columns = self.new_axes
df = cons(data, index=index)
@@ -483,6 +500,8 @@ def get_result(self):
# combine block managers
else:
+ sample = cast("DataFrame", self.objs[0])
+
mgrs_indexers = []
for obj in self.objs:
indexers = {}
@@ -495,17 +514,24 @@ def get_result(self):
# 1-ax to convert BlockManager axis to DataFrame axis
obj_labels = obj.axes[1 - ax]
if not new_labels.equals(obj_labels):
+ # We have to remove the duplicates from obj_labels
+ # in new_labels to make them unique, otherwise we would
+ # duplicate the duplicates again
+ if not obj_labels.is_unique:
+ new_labels = algos.make_duplicates_of_left_unique_in_right(
+ np.asarray(obj_labels), np.asarray(new_labels)
+ )
indexers[ax] = obj_labels.reindex(new_labels)[1]
mgrs_indexers.append((obj._mgr, indexers))
new_data = concatenate_block_managers(
- mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy,
+ mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
)
if not self.copy:
new_data._consolidate_inplace()
- cons = self.objs[0]._constructor
+ cons = sample._constructor
return cons(new_data).__finalize__(self, method="concat")
def _get_result_dim(self) -> int:
@@ -624,10 +650,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
for hlevel, level in zip(zipped, levels):
to_concat = []
for key, index in zip(hlevel, indexes):
- mask = level == key
+ # Find matching codes, include matching nan values as equal.
+ mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") - i = np.nonzero(level == key)[0][0] + i = np.nonzero(mask)[0][0] to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) @@ -653,7 +680,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde ) # also copies - names = names + get_consensus_names(indexes) + names = list(names) + list(get_unanimous_names(*indexes)) return MultiIndex( levels=levels, codes=codes_list, names=names, verify_integrity=False diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1ba6854a79265..f49aaee8bbc00 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -14,18 +14,15 @@ import pandas.core.common as com from pandas.core.indexes.api import Index, MultiIndex from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import _tile_compat +from pandas.core.reshape.util import tile_compat from pandas.core.shared_docs import _shared_docs from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa: F401 + from pandas import DataFrame, Series -@Appender( - _shared_docs["melt"] - % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") -) +@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"}) def melt( frame: "DataFrame", id_vars=None, @@ -45,7 +42,7 @@ def melt( if value_name in frame.columns: warnings.warn( "This dataframe has a column name that matches the 'value_name' column " - "name of the resultiing Dataframe. " + "name of the resulting Dataframe. " "In the future this will raise an error, please set the 'value_name' " "parameter of DataFrame.melt to a unique name.", FutureWarning, @@ -136,7 +133,7 @@ def melt( result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = _tile_compat(frame.index, K) + result.index = tile_compat(frame.index, K) return result @@ -144,14 +141,43 @@ def melt( @deprecate_kwarg(old_arg_name="label", new_arg_name=None) def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "DataFrame": """ - Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. + + Accepts a dictionary, ``groups``, in which each key is a new column name + and each value is a list of old column names that will be "melted" under + the new column name as part of the reshape. Parameters ---------- data : DataFrame + The wide-format DataFrame. groups : dict - {new_name : list_of_columns} - dropna : boolean, default True + {new_name : list_of_columns}. + dropna : bool, default True + Do not include columns whose entries are all NaN. + label : None + Not used. + + .. deprecated:: 1.0.0 + + Returns + ------- + DataFrame + Reshaped DataFrame. + + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. 
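Before the fuller worked example below, a compact sketch of what the `groups` mapping can do that plain `melt` cannot: several groups are stacked in parallel, one long-format column per key (the data and column names here are invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({
    "id": [1, 2],
    "wt_a": [60.0, 70.0], "wt_b": [61.5, 70.8],
    "ht_a": [1.70, 1.80], "ht_b": [1.71, 1.81],
})

# Both groups are melted in lockstep: the rows for wt_a/ht_a come first,
# then wt_b/ht_b, while the ungrouped "id" column is repeated alongside.
long = pd.lreshape(df, {"wt": ["wt_a", "wt_b"], "ht": ["ht_a", "ht_b"]})
print(long)  # 4 rows with columns id, wt, ht
```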
Examples -------- @@ -169,10 +195,6 @@ def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "Dat 1 Yankees 2007 573 2 Red Sox 2008 545 3 Yankees 2008 526 - - Returns - ------- - reshaped : DataFrame """ if isinstance(groups, dict): keys = list(groups.keys()) @@ -249,12 +271,10 @@ def wide_to_long( A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate - suffixes, for example, if your wide variables are of the form - A-one, B-two,.., and you have an unrelated column A-rating, you can - ignore the last one by specifying `suffix='(!?one|two)'`. - - .. versionchanged:: 0.23.0 - When all suffixes are numeric, they are cast to int64/float64. + suffixes, for example, if your wide variables are of the form A-one, + B-two,.., and you have an unrelated column A-rating, you can ignore the + last one by specifying `suffix='(!?one|two)'`. When all suffixes are + numeric, they are cast to int64/float64. Returns ------- @@ -262,6 +282,18 @@ def wide_to_long( A DataFrame that contains each stub name as a variable, with new index (i, j). + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + Notes ----- All extra variables are left untouched. This simply uses @@ -412,7 +444,7 @@ def wide_to_long( 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', - ... sep='_', suffix='\w+') + ... sep='_', suffix=r'\w+') >>> l ... 
# doctest: +NORMALIZE_WHITESPACE ht @@ -451,7 +483,7 @@ def melt_stub(df, stub: str, i, j, value_vars, sep: str): var_name=j, ) newdf[j] = Categorical(newdf[j]) - newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) # GH17627 Cast numerics suffixes to int/float newdf[j] = to_numeric(newdf[j], errors="ignore") diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 27b331babe692..2c6cdb846221f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -5,15 +5,15 @@ import copy import datetime from functools import partial +import hashlib import string -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Tuple, cast import warnings import numpy as np -from pandas._libs import Timedelta, hashtable as libhashtable, lib -import pandas._libs.join as libjoin -from pandas._typing import ArrayLike, FrameOrSeries +from pandas._libs import Timedelta, hashtable as libhashtable, join as libjoin, lib +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -43,7 +43,6 @@ from pandas import Categorical, Index, MultiIndex from pandas.core import groupby import pandas.core.algorithms as algos -from pandas.core.arrays.categorical import recode_for_categories import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc @@ -51,7 +50,8 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa:F401 + from pandas import DataFrame + from pandas.core.arrays import DatetimeArray @Substitution("\nleft : DataFrame") @@ -114,11 +114,8 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # if we can groupby the rhs # then we can get vastly better perf - - try: + if all(item in right.columns for item in by): rby = right.groupby(by, sort=False) - except KeyError: - pass for key, lhs in lby: @@ -140,9 +137,7 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # make sure join keys are in the merged # TODO, should merge_pieces do this? 
- for k in by: - if k in merged: - merged[k] = key + merged[by] = key pieces.append(merged) @@ -276,10 +271,20 @@ def _merger(x, y): if left_by is not None and right_by is not None: raise ValueError("Can only group either left or right frames") elif left_by is not None: + if isinstance(left_by, str): + left_by = [left_by] + check = set(left_by).difference(left.columns) + if len(check) != 0: + raise KeyError(f"{check} not found in left columns") result, _ = _groupby_and_merge( left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: + if isinstance(right_by, str): + right_by = [right_by] + check = set(right_by).difference(right.columns) + if len(check) != 0: + raise KeyError(f"{check} not found in right columns") result, _ = _groupby_and_merge( right_by, on, right, left, lambda x, y: _merger(y, x) ) @@ -575,8 +580,8 @@ class _MergeOperation: def __init__( self, - left: Union["Series", "DataFrame"], - right: Union["Series", "DataFrame"], + left: FrameOrSeriesUnion, + right: FrameOrSeriesUnion, how: str = "inner", on=None, left_on=None, @@ -644,6 +649,17 @@ def __init__( self._validate_specification() + cross_col = None + if self.how == "cross": + ( + self.left, + self.right, + self.how, + cross_col, + ) = self._create_cross_configuration(self.left, self.right) + self.left_on = self.right_on = [cross_col] + self._cross = cross_col + # note this function has side effects ( self.left_join_keys, @@ -691,7 +707,13 @@ def get_result(self): self._maybe_restore_index_levels(result) - return result + self._maybe_drop_cross_column(result, self._cross) + + return result.__finalize__(self, method="merge") + + def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): + if cross_col is not None: + result.drop(columns=cross_col, inplace=True) def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" @@ -832,12 +854,15 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer - # make sure to just use the right values - mask = left_indexer == -1 - if mask.all(): + # make sure to just use the right values or vice-versa + mask_left = left_indexer == -1 + mask_right = right_indexer == -1 + if mask_left.all(): key_col = rvals + elif mask_right.all(): + key_col = lvals else: - key_col = Index(lvals).where(~mask, rvals) + key_col = Index(lvals).where(~mask_left, rvals) if result._is_label_reference(name): result[name] = key_col @@ -859,7 +884,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): def _get_join_indexers(self): """ return the join indexers """ - return _get_join_indexers( + return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) @@ -964,7 +989,10 @@ def _get_merge_keys(self): """ left_keys = [] right_keys = [] - join_names = [] + # pandas\core\reshape\merge.py:966: error: Need type annotation for + # 'join_names' (hint: "join_names: List[] = ...") + # [var-annotated] + join_names = [] # type: ignore[var-annotated] right_drop = [] left_drop = [] @@ -1085,7 +1113,7 @@ def _maybe_coerce_merge_keys(self): # if either left or right is a categorical # then the must match exactly in categories & ordered if lk_is_cat and rk_is_cat: - if lk.is_dtype_equal(rk): + if lk._categories_match_up_to_permutation(rk): continue elif lk_is_cat or rk_is_cat: @@ -1195,21 +1223,62 @@ def _maybe_coerce_merge_keys(self): typ = rk.categories.dtype if rk_is_cat else object self.right 
= self.right.assign(**{name: self.right[name].astype(typ)}) + def _create_cross_configuration( + self, left, right + ) -> Tuple["DataFrame", "DataFrame", str, str]: + """ + Creates the configuration to dispatch the cross operation to inner join, + e.g. adding a join column and resetting parameters. Join column is added + to a new object, no inplace modification + + Parameters + ---------- + left: DataFrame + right DataFrame + + Returns + ------- + a tuple (left, right, how, cross_col) representing the adjusted + DataFrames with cross_col, the merge operation set to inner and the column + to join over. + """ + cross_col = f"_cross_{hashlib.md5().hexdigest()}" + how = "inner" + return ( + left.assign(**{cross_col: 1}), + right.assign(**{cross_col: 1}), + how, + cross_col, + ) + def _validate_specification(self): + if self.how == "cross": + if ( + self.left_index + or self.right_index + or self.right_on is not None + or self.left_on is not None + or self.on is not None + ): + raise MergeError( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + return # Hm, any way to make this logic less complicated?? - if self.on is None and self.left_on is None and self.right_on is None: + elif self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () elif self.left_index: - if self.right_on is None: - raise MergeError("Must pass right_on or right_index=True") + raise MergeError("Must pass right_on or right_index=True") elif self.right_index: - if self.left_on is None: - raise MergeError("Must pass left_on or left_index=True") + raise MergeError("Must pass left_on or left_index=True") else: # use the common columns - common_cols = self.left.columns.intersection(self.right.columns) + left_cols = self.left.columns + right_cols = self.right.columns + common_cols = left_cols.intersection(right_cols) if len(common_cols) == 0: raise MergeError( "No common columns to perform merge on. " @@ -1218,7 +1287,10 @@ def _validate_specification(self): f"left_index={self.left_index}, " f"right_index={self.right_index}" ) - if not common_cols.is_unique: + if ( + not left_cols.join(common_cols, how="inner").is_unique + or not right_cols.join(common_cols, how="inner").is_unique + ): raise MergeError(f"Data columns not unique: {repr(common_cols)}") self.left_on = self.right_on = common_cols elif self.on is not None: @@ -1227,8 +1299,19 @@ def _validate_specification(self): 'Can only pass argument "on" OR "left_on" ' 'and "right_on", not a combination of both.' ) + if self.left_index or self.right_index: + raise MergeError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' + ) self.left_on = self.right_on = self.on elif self.left_on is not None: + if self.left_index: + raise MergeError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + if not self.right_index and self.right_on is None: + raise MergeError('Must pass "right_on" OR "right_index".') n = len(self.left_on) if self.right_index: if len(self.left_on) != self.right.index.nlevels: @@ -1238,6 +1321,12 @@ def _validate_specification(self): ) self.right_on = [None] * n elif self.right_on is not None: + if self.right_index: + raise MergeError( + 'Can only pass argument "right_on" OR "right_index" not both.' 
+ ) + if not self.left_index and self.left_on is None: + raise MergeError('Must pass "left_on" OR "left_index".') n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: @@ -1246,7 +1335,7 @@ def _validate_specification(self): 'of levels in the index of "left"' ) self.left_on = [None] * n - if len(self.right_on) != len(self.left_on): + if self.how != "cross" and len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): @@ -1298,7 +1387,7 @@ def _validate(self, validate: str): raise ValueError("Not a valid argument for validate") -def _get_join_indexers( +def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ): """ @@ -1338,19 +1427,21 @@ def _get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how == "left": + if how in ("left", "right"): kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, "left": libjoin.left_outer_join, - "right": _right_outer_join, + "right": lambda x, y, count, **kwargs: libjoin.left_outer_join( + y, x, count, **kwargs + )[::-1], "outer": libjoin.full_outer_join, }[how] return join_func(lkey, rkey, count, **kwargs) -def _restore_dropped_levels_multijoin( +def restore_dropped_levels_multijoin( left: MultiIndex, right: MultiIndex, dropped_level_names, @@ -1504,7 +1595,7 @@ def get_result(self): ) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method=self._merge_type) + result = typ(result_data) self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -1667,7 +1758,7 @@ def _get_merge_keys(self): msg = ( f"incompatible tolerance {self.tolerance}, must be compat " - f"with type {repr(lk.dtype)}" + f"with type {repr(lt.dtype)}" ) if needs_i8_conversion(lt): @@ -1838,7 +1929,7 @@ def _get_single_indexer(join_key, index, sort: bool = False): def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): if len(join_keys) > 1: if not ( - (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) + isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels ): raise AssertionError( "If more than one join key is given then " @@ -1863,14 +1954,9 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = return left_ax, None, right_indexer -def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) - return left_indexer, right_indexer - - def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" -) -> Tuple[np.array, np.array, int]: +) -> Tuple[np.ndarray, np.ndarray, int]: """ Encode left and right keys as enumerated types. 
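The new `"right"` entry in `join_func` above relies on a symmetry: a right outer join is a left outer join with the operands swapped and the returned `(left_indexer, right_indexer)` pair reversed, which is exactly what the `[::-1]` does. A pure-Python illustration; `toy_left_outer_join` is a hypothetical stand-in for `libjoin.left_outer_join`:

```python
import numpy as np

def toy_left_outer_join(left_keys, right_keys):
    # For every "left" row, emit the positions of matching "right" rows,
    # or -1 when there is no match (like libjoin.left_outer_join).
    left_idx, right_idx = [], []
    for i, key in enumerate(left_keys):
        matches = np.flatnonzero(right_keys == key)
        if len(matches) == 0:
            left_idx.append(i)
            right_idx.append(-1)
        else:
            for j in matches:
                left_idx.append(i)
                right_idx.append(j)
    return np.array(left_idx), np.array(right_idx)

left = np.array([1, 2, 2])
right = np.array([2, 3])

# Right outer join of `left` with `right`: swap the operands, then reverse
# the returned pair so the indexers line up with the original arguments.
left_indexer, right_indexer = toy_left_outer_join(right, left)[::-1]
print(left_indexer)   # [ 1  2 -1] -> rows of `left`, -1 where unmatched
print(right_indexer)  # [0 0 1]    -> rows of `right`
```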
@@ -1928,29 +2014,27 @@ def _factorize_keys( if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared - lk, _ = lk._values_for_factorize() - rk, _ = rk._values_for_factorize() + lk = cast("DatetimeArray", lk)._ndarray + rk = cast("DatetimeArray", rk)._ndarray elif ( - is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) + is_categorical_dtype(lk.dtype) + and is_categorical_dtype(rk.dtype) + and is_dtype_equal(lk.dtype, rk.dtype) ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) - if lk.categories.equals(rk.categories): - # if we exactly match in categories, allow us to factorize on codes - rk = rk.codes - else: - # Same categories in different orders -> recode - rk = recode_for_categories(rk.codes, rk.categories, lk.categories) + # Cast rk to encoding so we can compare codes with lk + rk = lk._encode_with_my_categories(rk) lk = ensure_int64(lk.codes) - rk = ensure_int64(rk) + rk = ensure_int64(rk.codes) elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() - if is_integer_dtype(lk) and is_integer_dtype(rk): + if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer @@ -2012,8 +2096,11 @@ def _sort_labels(uniques: np.ndarray, left, right): def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow - pred = lambda i: not is_int64_overflow_possible(shape[:i]) - nlev = next(filter(pred, range(len(shape), 0, -1))) + nlev = next( + lev + for lev in range(len(shape), 0, -1) + if not is_int64_overflow_possible(shape[:lev]) + ) # get keys for the first `nlev` levels stride = np.prod(shape[1:nlev], dtype="i8") diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ea5916eff3afa..40496a5b8671b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -5,6 +5,7 @@ List, Optional, Sequence, + Set, Tuple, Union, cast, @@ -12,7 +13,7 @@ import numpy as np -from pandas._typing import Label +from pandas._typing import FrameOrSeriesUnion, Label from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -200,7 +201,7 @@ def pivot_table( def _add_margins( - table: Union["Series", "DataFrame"], + table: FrameOrSeriesUnion, data, values, rows, @@ -239,7 +240,7 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name, + table, data, values, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -267,19 +268,13 @@ def _add_margins( margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names - try: - # check the result column and leave floats - for dtype in set(result.dtypes): - cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].apply( - maybe_downcast_to_dtype, args=(dtype,) - ) - result = result.append(margin_dummy) - except TypeError: - - # we cannot reshape, so coerce the axis - result.index = result.index._to_safe_for_reshape() - result = result.append(margin_dummy) + # check the result column and leave floats + for dtype in 
set(result.dtypes): + cols = result.select_dtypes([dtype]).columns + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) + result = result.append(margin_dummy) result.index.names = row_names return result @@ -308,7 +303,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", + table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins @@ -327,17 +322,7 @@ def _all_key(key): # we are going to mutate this, so need to copy! piece = piece.copy() - try: - piece[all_key] = margin[key] - except TypeError: - - # we cannot reshape, so coerce the axis - piece.set_axis( - piece._get_axis(cat_axis)._to_safe_for_reshape(), - axis=cat_axis, - inplace=True, - ) - piece[all_key] = margin[key] + piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) @@ -451,10 +436,9 @@ def pivot( cols = com.convert_to_list_like(index) else: cols = [] - cols.extend(columns) append = index is None - indexed = data.set_index(cols, append=append) + indexed = data.set_index(cols + columns, append=append) else: if index is None: index = [Series(data.index, name=data.index.name)] @@ -580,29 +564,37 @@ def crosstab( b 0 1 0 c 0 0 0 """ + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) - rownames = _get_names(index, rownames, prefix="row") - colnames = _get_names(columns, colnames, prefix="col") - common_idx = None pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] if pass_objs: common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - data: Dict = {} - data.update(zip(rownames, index)) - data.update(zip(colnames, columns)) - - if values is None and aggfunc is not None: - raise ValueError("aggfunc cannot be used without values.") + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") - if values is not None and aggfunc is None: - raise ValueError("values cannot be used without an aggfunc.") + # duplicate names mapped to unique names for pivot op + ( + rownames_mapper, + unique_rownames, + colnames_mapper, + unique_colnames, + ) = _build_names_mapper(rownames, colnames) from pandas import DataFrame + data = { + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), + } df = DataFrame(data, index=common_idx) original_df_cols = df.columns @@ -615,8 +607,8 @@ def crosstab( table = df.pivot_table( ["__dummy__"], - index=rownames, - columns=colnames, + index=unique_rownames, + columns=unique_colnames, margins=margins, margins_name=margins_name, dropna=dropna, @@ -635,6 +627,9 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) + table = table.rename_axis(index=rownames_mapper, axis=0) + table = table.rename_axis(columns=colnames_mapper, axis=1) + return table @@ -670,12 +665,11 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # keep index and column of pivoted table table_index = table.index table_columns = table.columns + last_ind_or_col = table.iloc[-1, :].name - # check if margin name is in (for MI cases) or equal to last + # 
check if margin name is not in (for MI cases) and not equal to last
+ # index/column and save the column and index margin
- if (margins_name not in table.iloc[-1, :].name) | (
- margins_name != table.iloc[:, -1].name
- ):
+ if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
raise ValueError(f"{margins_name} not in pivoted DataFrame")
column_margin = table.iloc[:-1, -1]
index_margin = table.iloc[-1, :-1]
@@ -734,3 +728,57 @@ def _get_names(arrs, names, prefix: str = "row"):
names = list(names)
return names
+
+
+def _build_names_mapper(
+ rownames: List[str], colnames: List[str]
+) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]:
+ """
+ Given the names of a DataFrame's rows and columns, returns a set of unique row
+ and column names and mappers that convert to original names.
+
+ A row or column name is replaced if it is a duplicate among the rows of the
+ inputs, among the columns of the inputs or between the rows and the columns.
+
+ Parameters
+ ----------
+ rownames: list[str]
+ colnames: list[str]
+
+ Returns
+ -------
+ Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
+
+ rownames_mapper: dict[str, str]
+ a dictionary with new row names as keys and original rownames as values
+ unique_rownames: list[str]
+ a list of rownames with duplicate names replaced by dummy names
+ colnames_mapper: dict[str, str]
+ a dictionary with new column names as keys and original column names as values
+ unique_colnames: list[str]
+ a list of column names with duplicate names replaced by dummy names
+
+ """
+
+ def get_duplicates(names):
+ # Return the names that appear more than once.
+ seen: Set = set()
+ duplicates = set()
+ for name in names:
+ if name in seen:
+ duplicates.add(name)
+ seen.add(name)
+ return duplicates
+
+ shared_names = set(rownames).intersection(set(colnames))
+ dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
+
+ rownames_mapper = {
+ f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
+ }
+ unique_rownames = [
+ f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
+ ]
+
+ colnames_mapper = {
+ f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
+ }
+ unique_colnames = [
+ f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
+ ]
+
+ return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 391313fbb5283..c197e142fecbc 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -81,9 +81,7 @@ class _Unstacker:
unstacked : DataFrame
"""
- def __init__(
- self, index: MultiIndex, level=-1, constructor=None,
- ):
+ def __init__(self, index: MultiIndex, level=-1, constructor=None):
if constructor is None:
constructor = DataFrame
@@ -139,7 +137,7 @@ def _indexer_and_to_sort(self):
@cache_readonly
def sorted_labels(self):
indexer, to_sort = self._indexer_and_to_sort
- return [l.take(indexer) for l in to_sort]
+ return [line.take(indexer) for line in to_sort]
def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
indexer, _ = self._indexer_and_to_sort
@@ -401,6 +399,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
def unstack(obj, level, fill_value=None):
+
if isinstance(level, (tuple, list)):
if len(level) != 1:
# _unstack_multiple only handles MultiIndexes,
@@ -418,11 +417,18 @@ def unstack(obj, level, fill_value=None):
return _unstack_frame(obj, level, fill_value=fill_value)
else:
return obj.T.stack(dropna=False)
+ elif not isinstance(obj.index, MultiIndex):
+ # GH 36113
+ # Give nicer error messages when unstacking a Series whose
+ # Index is not a MultiIndex.
+ raise ValueError(
+ f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
+ )
else:
if is_extension_array_dtype(obj.dtype):
return _unstack_extension_series(obj, level, fill_value)
unstacker = _Unstacker(
- obj.index, level=level, constructor=obj._constructor_expanddim,
+ obj.index, level=level, constructor=obj._constructor_expanddim
)
return unstacker.get_result(
obj.values, value_columns=None, fill_value=fill_value
@@ -436,7 +442,7 @@ def _unstack_frame(obj, level, fill_value=None):
return obj._constructor(mgr)
else:
return _Unstacker(
- obj.index, level=level, constructor=obj._constructor,
+ obj.index, level=level, constructor=obj._constructor
).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value)
@@ -515,7 +521,7 @@ def factorize(index):
verify_integrity=False,
)
- if frame._is_homogeneous_type:
+ if not frame.empty and frame._is_homogeneous_type:
# For homogeneous EAs, frame._values will coerce to object. So
# we concatenate instead.
dtypes = list(frame.dtypes._values)
@@ -588,19 +594,15 @@ def _stack_multi_columns(frame, level_num=-1, dropna=True):
def _convert_level_number(level_num, columns):
"""
Logic for converting the level number to something we can safely pass
- to swaplevel:
+ to swaplevel.
- We generally want to convert the level number into a level name, except
- when columns do not have names, in which case we must leave as a level
- number
+ If `level_num` matches a column name, return the name from
+ position `level_num`; otherwise return `level_num`.
"""
if level_num in columns.names:
return columns.names[level_num]
- else:
- if columns.names[level_num] is None:
- return level_num
- else:
- return columns.names[level_num]
+
+ return level_num
this = frame.copy()
@@ -766,8 +768,6 @@ def get_dummies(
dtype : dtype, default np.uint8
Data type for new columns. Only a single dtype is allowed.
- .. versionadded:: 0.23.0
-
Returns
-------
DataFrame
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index f7723bee532ff..4c5347bd16e8b 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -84,8 +84,6 @@ def cut(
Whether the first interval should be left-inclusive or not.
duplicates : {default 'raise', 'drop'}, optional
If bin edges are not unique, raise ValueError or drop non-uniques.
-
- .. versionadded:: 0.23.0
ordered : bool, default True
Whether the labels are ordered or not. Applies to returned types
Categorical and Series (with Categorical dtype).
If True, @@ -381,7 +379,7 @@ def _bins_to_cuts( duplicates: str = "raise", ordered: bool = True, ): - if not ordered and not labels: + if not ordered and labels is None: raise ValueError("'labels' must be provided if 'ordered = False'") if duplicates not in ["raise", "drop"]: diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 6949270317f7c..d2c08712abacd 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -39,6 +39,9 @@ def cartesian_product(X): lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) + if np.any(cumprodX < 0): + raise ValueError("Product space too large to allocate arrays!") + a = np.roll(cumprodX, 1) a[0] = 1 @@ -48,10 +51,10 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) - return [_tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] + return [tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] -def _tile_compat(arr, num: int): +def tile_compat(arr, num: int): """ Index compat for np.tile. diff --git a/pandas/core/series.py b/pandas/core/series.py index 9a633079b8c1d..0e9476285c258 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -25,12 +25,14 @@ from pandas._libs import lib, properties, reshape, tslibs from pandas._libs.lib import no_default from pandas._typing import ( + AggFuncType, ArrayLike, Axis, DtypeObj, FrameOrSeriesUnion, IndexKeyFunc, Label, + StorageOptions, ValueKeyFunc, ) from pandas.compat.numpy import function as nv @@ -54,6 +56,7 @@ is_list_like, is_object_dtype, is_scalar, + validate_all_hashable, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -64,14 +67,15 @@ remove_na_arraylike, ) -import pandas as pd -from pandas.core import algorithms, base, generic, nanops, ops +from pandas.core import algorithms, base, generic, missing, nanops, ops from pandas.core.accessor import CachedAccessor +from pandas.core.aggregation import aggregate, transform from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.construction import ( + array as pd_array, create_series_with_explicit_dtype, extract_array, is_empty_data, @@ -80,14 +84,21 @@ from pandas.core.generic import NDFrame from pandas.core.indexers import deprecate_ndim_indexing, unpack_1tuple from pandas.core.indexes.accessors import CombinedDatetimelikeProperties -from pandas.core.indexes.api import Float64Index, Index, MultiIndex, ensure_index +from pandas.core.indexes.api import ( + CategoricalIndex, + Float64Index, + Index, + MultiIndex, + ensure_index, +) import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager -from pandas.core.sorting import ensure_key_mapped +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -100,22 +111,21 @@ __all__ = ["Series"] -_shared_doc_kwargs = dict( - axes="index", - klass="Series", - axes_single_arg="{0 or 'index'}", - axis="""axis : {0 or 'index'} 
+_shared_doc_kwargs = {
+ "axes": "index",
+ "klass": "Series",
+ "axes_single_arg": "{0 or 'index'}",
+ "axis": """axis : {0 or 'index'}
Parameter needed for compatibility with DataFrame.""",
- inplace="""inplace : boolean, default False
+ "inplace": """inplace : boolean, default False
If True, performs operation inplace and returns None.""",
- unique="np.ndarray",
- duplicated="Series",
- optional_by="",
- optional_mapper="",
- optional_labels="",
- optional_axis="",
- versionadded_to_excel="\n .. versionadded:: 0.20.0\n",
-)
+ "unique": "np.ndarray",
+ "duplicated": "Series",
+ "optional_by": "",
+ "optional_mapper": "",
+ "optional_labels": "",
+ "optional_axis": "",
+}
def _coerce_method(converter):
@@ -153,18 +163,14 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
Parameters
----------
data : array-like, Iterable, dict, or scalar value
- Contains data stored in Series.
-
- .. versionchanged:: 0.23.0
- If data is a dict, argument order is maintained for Python 3.6
- and later.
-
+ Contains data stored in Series. If data is a dict, argument order is
+ maintained.
index : array-like or Index (1d)
Values must be hashable and have the same length as `data`.
Non-unique index values are allowed. Will default to
- RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index
- sequence are used, the index will override the keys found in the
- dict.
+ RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like
+ and index is None, then the keys in the data are used as the index.
+ If index is not None, the resulting Series is reindexed with the
+ index values.
dtype : str, numpy.dtype, or ExtensionDtype, optional
Data type for the output Series. If not specified, this will be
inferred from `data`.
@@ -176,14 +182,15 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
"""
_typ = "series"
+ _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
_name: Label
_metadata: List[str] = ["name"]
_internal_names_set = {"index"} | generic.NDFrame._internal_names_set
_accessors = {"dt", "cat", "str", "sparse"}
- _deprecations = (
- base.IndexOpsMixin._deprecations
- | generic.NDFrame._deprecations
+ _hidden_attrs = (
+ base.IndexOpsMixin._hidden_attrs
+ | generic.NDFrame._hidden_attrs
| frozenset(["compress", "ptp"])
)
@@ -191,6 +198,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
hasnans = property(
base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__
)
+ __hash__ = generic.NDFrame.__hash__
_mgr: SingleBlockManager
div: Callable[["Series", Any], "Series"]
rdiv: Callable[["Series", Any], "Series"]
@@ -354,15 +362,19 @@ def _init_dict(self, data, index=None, dtype=None):
# Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
# raises KeyError), so we iterate the entire dict, and align
if data:
- keys, values = zip(*data.items())
- values = list(values)
+ # GH#34717: extracting the keys and values with zip(*data.items())
+ # went through a generator, which hurt performance.
+ # Pull the keys and values directly off the dict instead.
+
+ keys = tuple(data.keys())
+ values = list(data.values()) # generating a list of values is faster
elif index is not None:
# fastpath for Series(data=None). Just use broadcasting a scalar
# instead of reindexing.
values = na_value_for_dtype(dtype) keys = index else: - keys, values = [], [] + keys, values = (), [] # Input is now list-like, so rely on "standard" construction: @@ -390,7 +402,7 @@ def _constructor_expanddim(self) -> Type["DataFrame"]: # types @property - def _can_hold_na(self): + def _can_hold_na(self) -> bool: return self._mgr._can_hold_na _index = None @@ -405,14 +417,26 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: if not fastpath: labels = ensure_index(labels) - is_all_dates = labels.is_all_dates - if is_all_dates: - if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if labels._is_all_dates: + deep_labels = labels + if isinstance(labels, CategoricalIndex): + deep_labels = labels.categories + + if not isinstance( + deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) + ): try: labels = DatetimeIndex(labels) # need to set here because we changed the index if fastpath: self._mgr.set_axis(axis, labels) + warnings.warn( + "Automatically casting object-dtype Index of datetimes to " + "DatetimeIndex is deprecated and will be removed in a " + "future version. Explicitly cast to DatetimeIndex instead.", + FutureWarning, + stacklevel=3, + ) except (tslibs.OutOfBoundsDatetime, ValueError): # labels may exceeds datetime bounds, # or not be a DatetimeIndex @@ -491,8 +515,7 @@ def name(self) -> Label: @name.setter def name(self, value: Label) -> None: - if not is_hashable(value): - raise TypeError("Series.name must be a hashable type") + validate_all_hashable(value, error_name=f"{type(self).__name__}.name") object.__setattr__(self, "_name", value) @property @@ -571,7 +594,8 @@ def _values(self): """ return self._mgr.internal_values() - @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore + # error: Decorated property not supported + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def array(self) -> ExtensionArray: return self._mgr._block.array_values() @@ -672,81 +696,6 @@ def view(self, dtype=None) -> "Series": # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) - def __array_ufunc__( - self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any - ): - # TODO: handle DataFrame - cls = type(self) - - # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - # Determine if we should defer. - no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - # align all the inputs. - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - types = tuple(type(x) for x in inputs) - # TODO: dataframe - alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] - - if len(alignable) > 1: - # This triggers alignment. - # At the moment, there aren't any ufuncs with more than two inputs - # so this ends up just being x1.index | x2.index, but we write - # it to handle *args. 
- index = alignable[0].index - for s in alignable[1:]: - index |= s.index - inputs = tuple( - x.reindex(index) if issubclass(t, Series) else x - for x, t in zip(inputs, types) - ) - else: - index = self.index - - inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - - name = names[0] if len(set(names)) == 1 else None - - def construct_return(result): - if lib.is_scalar(result): - return result - elif result.ndim > 1: - # e.g. np.subtract.outer - if method == "outer": - # GH#27198 - raise NotImplementedError - return result - return self._constructor(result, index=index, name=name, copy=False) - - if type(result) is tuple: - # multiple return values - return tuple(construct_return(x) for x in result) - elif method == "at": - # no return value - return None - else: - return construct_return(result) - def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. @@ -826,7 +775,7 @@ def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series": FutureWarning, stacklevel=2, ) - nv.validate_take(tuple(), kwargs) + nv.validate_take((), kwargs) indices = ensure_platform_int(indices) new_index = self.index.take(indices) @@ -881,21 +830,19 @@ def __getitem__(self, key): elif key_is_scalar: return self._get_value(key) - if ( - isinstance(key, tuple) - and is_hashable(key) - and isinstance(self.index, MultiIndex) - ): + if is_hashable(key): # Otherwise index.get_value will raise InvalidIndexError try: + # For labels that don't resolve as scalars like tuples and frozensets result = self._get_value(key) return result - except KeyError: - # We still have the corner case where this tuple is a key - # in the first level of our MultiIndex - return self._get_values_tuple(key) + except (KeyError, TypeError): + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # We still have the corner case where a tuple is a key + # in the first level of our MultiIndex + return self._get_values_tuple(key) if is_iterator(key): key = list(key) @@ -955,21 +902,22 @@ def _get_values_tuple(self, key): return result if not isinstance(self.index, MultiIndex): - raise ValueError("Can only tuple-index with a MultiIndex") + raise KeyError("key of type tuple not found and not a MultiIndex") # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) return self._constructor(self._values[indexer], index=new_index).__finalize__( - self, + self ) def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self,) + return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) except ValueError: # mpl compat if we look up e.g. 
ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack - return self._values[indexer] + # the asarray is needed to avoid returning a 2D DatetimeArray + return np.asarray(self._values[indexer]) def _get_value(self, label, takeable: bool = False): """ @@ -1006,12 +954,14 @@ def __setitem__(self, key, value): # positional setter values[key] = value else: - # GH#12862 adding an new key to the Series + # GH#12862 adding a new key to the Series self.loc[key] = value - except TypeError as e: + except TypeError as err: if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): - raise ValueError("Can only tuple-index with a MultiIndex") from e + raise KeyError( + "key of type tuple not found and not a MultiIndex" + ) from err if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) @@ -1037,10 +987,8 @@ def _set_with_engine(self, key, value): def _set_with(self, key, value): # other: fancy integer or otherwise if isinstance(key, slice): - # extract_array so that if we set e.g. ser[-5:] = ser[:5] - # we get the first five values, and not 5 NaNs indexer = self.index._convert_slice_indexer(key, kind="getitem") - self.iloc[indexer] = extract_array(value, extract_numpy=True) + return self._set_values(indexer, value) else: assert not isinstance(key, tuple) @@ -1058,12 +1006,28 @@ def _set_with(self, key, value): # should be caught by the is_bool_indexer check in __setitem__ if key_type == "integer": if not self.index._should_fallback_to_positional(): - self.loc[key] = value + self._set_labels(key, value) else: - self.iloc[key] = value + self._set_values(key, value) else: self.loc[key] = value + def _set_labels(self, key, value): + key = com.asarray_tuplesafe(key) + indexer: np.ndarray = self.index.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise KeyError(f"{key[mask]} not in index") + self._set_values(indexer, value) + + def _set_values(self, key, value): + if isinstance(key, Series): + key = key._values + self._mgr = self._mgr.setitem( # type: ignore[assignment] + indexer=key, value=value + ) + self._maybe_update_cacher() + def _set_value(self, label, value, takeable: bool = False): """ Quickly set single value at passed label. @@ -1150,7 +1114,7 @@ def repeat(self, repeats, axis=None) -> "Series": 2 c dtype: object """ - nv.validate_repeat(tuple(), dict(axis=axis)) + nv.validate_repeat((), {"axis": axis}) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) return self._constructor(new_values, index=new_index).__finalize__( @@ -1182,7 +1146,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): Returns ------- - Series or DataFrame + Series or DataFrame or None When `drop` is False (the default), a DataFrame is returned. The newly created columns will come first in the DataFrame, followed by the original Series values. @@ -1403,6 +1367,7 @@ def to_string( @doc( klass=_shared_doc_kwargs["klass"], + storage_options=generic._shared_docs["storage_options"], examples=dedent( """ Examples @@ -1419,7 +1384,12 @@ def to_string( ), ) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + self, + buf: Optional[IO[str]] = None, + mode: str = "wt", + index: bool = True, + storage_options: StorageOptions = None, + **kwargs, ) -> Optional[str]: """ Print {klass} in Markdown-friendly format. @@ -1431,7 +1401,15 @@ def to_markdown( buf : str, Path or StringIO-like, optional, default None Buffer to write to. If None, the output is returned as a string. 
mode : str, optional - Mode in which file is opened. + Mode in which file is opened, "wt" by default. + index : bool, optional, default True + Add index (row) labels. + + .. versionadded:: 1.1.0 + {storage_options} + + .. versionadded:: 1.2.0 + **kwargs These parameters will be passed to `tabulate \ <https://pypi.org/project/tabulate>`_. Returns ------- str {klass} in Markdown-friendly format. + Notes + ----- + Requires the `tabulate <https://pypi.org/project/tabulate>`_ package. + Examples -------- >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") >>> s.to_markdown() | | animal | |---:|:---------| | 0 | elk | | 1 | pig | | 2 | dog | | 3 | quetzal | +----+----------+ """ - return self.to_frame().to_markdown(buf, mode, **kwargs) + return self.to_frame().to_markdown( + buf, mode, index, storage_options=storage_options, **kwargs + ) # ---------------------------------------------------------------------- @@ -1756,12 +1740,17 @@ def count(self, level=None): """ if level is None: return notna(self.array).sum() + elif not isinstance(self.index, MultiIndex): + raise ValueError("Series.count level is only valid with a MultiIndex") + + index = self.index + assert isinstance(index, MultiIndex) # for mypy if isinstance(level, str): - level = self.index._get_level_number(level) + level = index._get_level_number(level) - lev = self.index.levels[level] - level_codes = np.array(self.index.codes[level], subok=False, copy=True) + lev = index.levels[level] + level_codes = np.array(index.codes[level], subok=False, copy=True) mask = level_codes == -1 if mask.any(): @@ -1776,7 +1765,9 @@ def count(self, level=None): def mode(self, dropna=True) -> "Series": """ - Return the mode(s) of the dataset. + Return the mode(s) of the Series. + + The mode is the value that appears most often. There can be multiple modes. Always returns Series even if only one value is returned. @@ -1877,8 +1868,8 @@ def drop_duplicates(self, keep="first", inplace=False) -> Optional["Series"]: Returns ------- - Series - Series with duplicates dropped. + Series or None + Series with duplicates dropped or None if ``inplace=True``. See Also -------- @@ -2018,7 +2009,9 @@ def duplicated(self, keep="first") -> "Series": 4 True dtype: bool """ - return super().duplicated(keep=keep) + res = base.IndexOpsMixin.duplicated(self, keep=keep) + result = self._constructor(res, index=self.index) + return result.__finalize__(self, method="duplicated") def idxmin(self, axis=0, skipna=True, *args, **kwargs): """ @@ -2753,7 +2746,8 @@ def _construct_result( out.name = name return out - @Appender( + @doc( + generic._shared_docs["compare"], """ Returns ------- @@ -2813,9 +2807,9 @@ def _construct_result( 2 c c 3 d b 4 e e -""" +""", + klass=_shared_doc_kwargs["klass"], ) - @Appender(generic._shared_docs["compare"] % _shared_doc_kwargs) def compare( self, other: "Series", @@ -3095,8 +3089,8 @@ def sort_values( Returns ------- - Series - Series ordered by values. + Series or None + Series ordered by values or None if ``inplace=True``.
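Reviewer note, not part of the patch: the new guard in Series.count above turns level-based counting on a flat index into an explicit ValueError; a minimal sketch of both paths.

    import pandas as pd

    s = pd.Series([1, 2, None], index=["a", "b", "c"])
    print(s.count())  # 2 -- number of non-NA values
    try:
        s.count(level=0)  # flat Index, not a MultiIndex
    except ValueError as err:
        print(err)  # Series.count level is only valid with a MultiIndex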
See Also -------- @@ -3228,29 +3222,6 @@ def sort_values( "sort in-place you must create a copy" ) - def _try_kind_sort(arr): - arr = ensure_key_mapped(arr, key) - arr = getattr(arr, "_values", arr) - - # easier to ask forgiveness than permission - try: - # if kind==mergesort, it can fail for object dtype - return arr.argsort(kind=kind) - except TypeError: - # stable sort not available for object dtype - # uses the argsort default quicksort - return arr.argsort(kind="quicksort") - - arr = self._values - sorted_index = np.empty(len(self), dtype=np.int32) - - bad = isna(arr) - - good = ~bad - idx = ibase.default_index(len(self)) - - argsorted = _try_kind_sort(self[good]) - if is_list_like(ascending): if len(ascending) != 1: raise ValueError( @@ -3261,21 +3232,16 @@ def _try_kind_sort(arr): if not is_bool(ascending): raise ValueError("ascending must be boolean") - if not ascending: - argsorted = argsorted[::-1] - - if na_position == "last": - n = good.sum() - sorted_index[:n] = idx[good][argsorted] - sorted_index[n:] = idx[bad] - elif na_position == "first": - n = bad.sum() - sorted_index[n:] = idx[good][argsorted] - sorted_index[:n] = idx[bad] - else: + if na_position not in ["first", "last"]: raise ValueError(f"invalid na_position: {na_position}") - result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) + # GH 35922. Make sorting stable by leveraging nargsort + values_to_sort = ensure_key_mapped(self, key)._values if key else self._values + sorted_index = nargsort(values_to_sort, kind, ascending, na_position) + + result = self._constructor( + self._values[sorted_index], index=self.index[sorted_index] + ) if ignore_index: result.index = ibase.default_index(len(sorted_index)) @@ -3341,8 +3307,8 @@ def sort_index( Returns ------- - Series - The original Series sorted by the labels. + Series or None + The original Series sorted by the labels or None if ``inplace=True``. 
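Reviewer note, not part of the patch: the GH 35922 rewrite above routes Series.sort_values through nargsort; a small sketch of the user-facing knobs that now flow through a single code path (key, ascending, na_position).

    import numpy as np
    import pandas as pd

    s = pd.Series([3.0, np.nan, 1.0, 2.0])
    print(s.sort_values(na_position="first"))  # NaN first, then 1, 2, 3
    print(s.sort_values(key=lambda x: -x))     # sort by negated values: 3, 2, 1, NaN last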
See Also -------- @@ -3432,59 +3398,17 @@ def sort_index( dtype: int64 """ - # TODO: this can be combined with DataFrame.sort_index impl as - # almost identical - inplace = validate_bool_kwarg(inplace, "inplace") - # Validate the axis parameter - self._get_axis_number(axis) - index = ensure_key_mapped(self.index, key, levels=level) - - if level is not None: - new_index, indexer = index.sortlevel( - level, ascending=ascending, sort_remaining=sort_remaining - ) - - elif isinstance(index, MultiIndex): - from pandas.core.sorting import lexsort_indexer - - labels = index._sort_levels_monotonic() - - indexer = lexsort_indexer( - labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position, - ) - else: - from pandas.core.sorting import nargsort - - # Check monotonic-ness before sort an index - # GH11080 - if (ascending and index.is_monotonic_increasing) or ( - not ascending and index.is_monotonic_decreasing - ): - if inplace: - return - else: - return self.copy() - - indexer = nargsort( - index, kind=kind, ascending=ascending, na_position=na_position - ) - - indexer = ensure_platform_int(indexer) - new_index = self.index.take(indexer) - new_index = new_index._sort_levels_monotonic() - - new_values = self._values.take(indexer) - result = self._constructor(new_values, index=new_index) - - if ignore_index: - result.index = ibase.default_index(len(result)) - - if inplace: - self._update_inplace(result) - else: - return result.__finalize__(self, method="sort_index") + return super().sort_index( + axis, + level, + ascending, + inplace, + kind, + na_position, + sort_remaining, + ignore_index, + key, + ) def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ @@ -3799,10 +3723,11 @@ def explode(self, ignore_index: bool = False) -> "Series": Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of elements in + the output will be non-deterministic when exploding sets. Examples -------- @@ -4014,7 +3939,6 @@ def _gotitem(self, key, ndim, subset=None) -> "Series": axis=_shared_doc_kwargs["axis"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="\n.. 
versionadded:: 0.20.0\n", ) def aggregate(self, func=None, axis=0, *args, **kwargs): # Validate the axis parameter @@ -4024,7 +3948,7 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if func is None: func = dict(kwargs.items()) - result, how = self._aggregate(func, *args, **kwargs) + result, how = aggregate(self, func, *args, **kwargs) if result is None: # we can be called from an inner function which @@ -4050,14 +3974,14 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): agg = aggregate @doc( - NDFrame.transform, + _shared_docs["transform"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform(self, func, axis=0, *args, **kwargs): - # Validate the axis parameter - self._get_axis_number(axis) - return super().transform(func, *args, **kwargs) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> FrameOrSeriesUnion: + return transform(self, func, axis, *args, **kwargs) def apply(self, func, convert_dtype=True, args=(), **kwds): """ @@ -4193,14 +4117,22 @@ def f(x): if len(mapped) and isinstance(mapped[0], Series): # GH 25959 use pd.array instead of tolist # so extension arrays can be used - return self._constructor_expanddim(pd.array(mapped), index=self.index) + return self._constructor_expanddim(pd_array(mapped), index=self.index) else: return self._constructor(mapped, index=self.index).__finalize__( self, method="apply" ) def _reduce( - self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + self, + op, + name: str, + *, + axis=0, + skipna=True, + numeric_only=None, + filter_type=None, + **kwds, ): """ Perform a reduction operation. @@ -4312,8 +4244,8 @@ def rename( Returns ------- - Series - Series with index labels or name altered. + Series or None + Series with index labels or name altered or None if ``inplace=True``. See Also -------- @@ -4426,8 +4358,8 @@ def drop( Returns ------- - Series - Series with specified index labels removed. + Series or None + Series with specified index labels removed or None if ``inplace=True``. 
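Reviewer note, not part of the patch: the explode Notes above now mention sets; a minimal sketch (set element order in the output is non-deterministic, as the Notes say).

    import pandas as pd

    s = pd.Series([[1, 2], [], {"a", "b"}, 3])
    print(s.explode())  # list/set elements fan out to rows; [] becomes NaN; scalars pass through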
Raises ------ @@ -4560,6 +4492,31 @@ def replace( method=method, ) + def _replace_single(self, to_replace, method, inplace, limit): + """ + Replaces values in a Series using the fill method specified when no + replacement value is given in the replace method + """ + + orig_dtype = self.dtype + result = self if inplace else self.copy() + fill_f = missing.get_fill_func(method) + + mask = missing.mask_missing(result.values, to_replace) + values = fill_f(result.values, limit=limit, mask=mask) + + if values.dtype == orig_dtype and inplace: + return + + result = self._constructor(values, index=self.index, dtype=self.dtype) + result = result.__finalize__(self) + + if inplace: + self._update_inplace(result) + return + + return result + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( @@ -4613,7 +4570,7 @@ def memory_usage(self, index=True, deep=False): >>> s.memory_usage() 144 >>> s.memory_usage(deep=True) - 260 + 244 """ v = super().memory_usage(deep=deep) if index: @@ -4673,7 +4630,7 @@ def isin(self, values) -> "Series": 5 False Name: animal, dtype: bool """ - result = algorithms.isin(self, values) + result = algorithms.isin(self._values, values) return self._constructor(result, index=self.index).__finalize__( self, method="isin" ) @@ -4762,6 +4719,7 @@ def _convert_dtypes( convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, + convert_floating: bool = True, ) -> "Series": input_series = self if infer_objects: @@ -4769,9 +4727,13 @@ def _convert_dtypes( if is_object_dtype(input_series): input_series = input_series.copy() - if convert_string or convert_integer or convert_boolean: + if convert_string or convert_integer or convert_boolean or convert_floating: inferred_dtype = convert_dtypes( - input_series._values, convert_string, convert_integer, convert_boolean + input_series._values, + convert_string, + convert_integer, + convert_boolean, + convert_floating, ) try: result = input_series.astype(inferred_dtype) @@ -4783,7 +4745,7 @@ def _convert_dtypes( @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> "Series": - return super().isna() + return generic.NDFrame.isna(self) @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isnull(self) -> "Series": @@ -4815,8 +4777,8 @@ def dropna(self, axis=0, inplace=False, how=None): Returns ------- - Series - Series with NA entries dropped from it. + Series or None + Series with NA entries dropped from it or None if ``inplace=True``. 
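Reviewer note, not part of the patch: the behavior the new _replace_single helper above implements, exercised through the public Series.replace API.

    import pandas as pd

    s = pd.Series([0, 1, 2, 1, 4])
    print(s.replace(1, method="pad"))    # each 1 takes the previous value: 0, 0, 2, 2, 4
    print(s.replace(1, method="bfill"))  # each 1 takes the next value: 0, 2, 2, 4, 4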
See Also -------- @@ -4912,7 +4874,7 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_index = self.index.to_timestamp(freq=freq, how=how) # type: ignore + new_index = self.index.to_timestamp(freq=freq, how=how) return self._constructor(new_values, index=new_index).__finalize__( self, method="to_timestamp" ) @@ -4969,10 +4931,44 @@ def to_period(self, freq=None, copy=True) -> "Series": # Add plotting methods to Series hist = pandas.plotting.hist_series + # ---------------------------------------------------------------------- + # Template-Based Arithmetic/Comparison Methods + + def _cmp_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + + if isinstance(other, Series) and not self._indexed_same(other): + raise ValueError("Can only compare identically-labeled Series objects") + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + res_values = ops.comparison_op(lvalues, rvalues, op) + + return self._construct_result(res_values, name=res_name) + + def _logical_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + self, other = ops.align_method_SERIES(self, other, align_asobject=True) + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + res_values = ops.logical_op(lvalues, rvalues, op) + return self._construct_result(res_values, name=res_name) + + def _arith_method(self, other, op): + res_name = ops.get_op_result_name(self, other) + self, other = ops.align_method_SERIES(self, other) + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + result = ops.arithmetic_op(lvalues, rvalues, op) + + return self._construct_result(result, name=res_name) + Series._add_numeric_operations() -Series._add_series_or_dataframe_operations() # Add arithmetic! ops.add_flex_arithmetic_methods(Series) -ops.add_special_arithmetic_methods(Series) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index b81942f062b19..3aeb3b664b27f 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -1,118 +1,390 @@ from typing import Dict -_shared_docs: Dict[str, str] = dict() +_shared_docs: Dict[str, str] = {} +_shared_docs[ + "aggregate" +] = """ +Aggregate using one or more operations over the specified axis. + +Parameters +---------- +func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. +{see_also} +Notes +----- +`agg` is an alias for `aggregate`. Use the alias. + +A passed user-defined-function will be passed a Series for evaluation. 
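Reviewer note, not part of the patch: the accepted `func` combinations listed in the aggregate template above, shown on a Series.

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3, 4])
    print(s.agg("min"))             # string function name -> scalar
    print(s.agg([np.sum, "mean"]))  # list of functions -> Series of results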
+{examples}""" + +_shared_docs[ + "compare" +] = """ +Compare to another {klass} and show the differences. + +.. versionadded:: 1.1.0 + +Parameters +---------- +other : {klass} + Object to compare with. + +align_axis : {{0 or 'index', 1 or 'columns'}}, default 1 + Determine which axis to align the comparison on. + + * 0, or 'index' : Resulting differences are stacked vertically + with rows drawn alternately from self and other. + * 1, or 'columns' : Resulting differences are aligned horizontally + with columns drawn alternately from self and other. + +keep_shape : bool, default False + If true, all rows and columns are kept. + Otherwise, only the ones with different values are kept. + +keep_equal : bool, default False + If true, the result keeps values that are equal. + Otherwise, equal values are shown as NaNs. +""" + +_shared_docs[ + "groupby" +] = """ +Group %(klass)s using a mapper or by a Series of columns. + +A groupby operation involves some combination of splitting the +object, applying a function, and combining the results. This can be +used to group large amounts of data and compute operations on these +groups. + +Parameters +---------- +by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If an ndarray is passed, the + values are used as-is to determine the groups. A label or list of + labels may be passed to group by the columns in ``self``. Notice + that a tuple is interpreted as a (single) key. +axis : {0 or 'index', 1 or 'columns'}, default 0 + Split along rows (0) or columns (1). +level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. +as_index : bool, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. +sort : bool, default True + Sort group keys. Get better performance by turning this off. + Note this does not influence the order of observations within each + group. Groupby preserves the order of rows within each group. +group_keys : bool, default True + When calling apply, add group keys to index to identify pieces. +squeeze : bool, default False + Reduce the dimensionality of the return type if possible, + otherwise return a consistent type. + + .. deprecated:: 1.1.0 + +observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. +dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. + If False, NA values will also be treated as the key in groups. + + .. versionadded:: 1.1.0 + +Returns +------- +%(klass)sGroupBy + Returns a groupby object that contains information about the groups. + +See Also +-------- +resample : Convenience method for frequency conversion and resampling + of time series. + +Notes +----- +See the `user guide +<https://pandas.pydata.org/pandas-docs/stable/groupby.html>`_ for more. +""" _shared_docs[ "melt" ] = """ - Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
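Reviewer note, not part of the patch: a minimal sketch of the compare template documented above, using keep_shape.

    import pandas as pd

    s1 = pd.Series(["a", "b", "c"])
    s2 = pd.Series(["a", "x", "c"])
    print(s1.compare(s2))                   # only the differing row (index 1)
    print(s1.compare(s2, keep_shape=True))  # all rows; equal values shown as NaN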
- - This function is useful to massage a DataFrame into a format where one - or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to - the row axis, leaving just two non-identifier columns, 'variable' and - 'value'. - %(versionadded)s - Parameters - ---------- - id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. - var_name : scalar - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. - value_name : scalar, default 'value' - Name to use for the 'value' column. - col_level : int or str, optional - If columns are a MultiIndex then use this level to melt. - ignore_index : bool, default True - If True, original index is ignored. If False, the original index is retained. - Index labels will be repeated as necessary. - - .. versionadded:: 1.1.0 - - Returns - ------- - DataFrame - Unpivoted DataFrame. - - See Also - -------- - %(other)s : Identical method. - pivot_table : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. - DataFrame.explode : Explode a DataFrame from list-like - columns to long format. - - Examples - -------- - >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, - ... 'B': {0: 1, 1: 3, 2: 5}, - ... 'C': {0: 2, 1: 4, 2: 6}}) - >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)sid_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 - - The names of 'variable' and 'value' columns can be customized: - - >>> %(caller)sid_vars=['A'], value_vars=['B'], - ... var_name='myVarname', value_name='myValname') - A myVarname myValname - 0 a B 1 - 1 b B 3 - 2 c B 5 - - Original index values can be kept around: - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 0 a C 2 - 1 b C 4 - 2 c C 6 - - If you have multi-index columns: - - >>> df.columns = [list('ABC'), list('DEF')] - >>> df - A B C - D E F - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) - (A, D) variable_0 variable_1 value - 0 a B E 1 - 1 b B E 3 - 2 c B E 5 - """ +Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + +This function is useful to massage a DataFrame into a format where one +or more columns are identifier variables (`id_vars`), while all other +columns, considered measured variables (`value_vars`), are "unpivoted" to +the row axis, leaving just two non-identifier columns, 'variable' and +'value'. + +Parameters +---------- +id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. +value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. +var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. +value_name : scalar, default 'value' + Name to use for the 'value' column. 
+col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. +ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. + + .. versionadded:: 1.1.0 + +Returns +------- +DataFrame + Unpivoted DataFrame. + +See Also +-------- +%(other)s : Identical method. +pivot_table : Create a spreadsheet-style pivot table as a DataFrame. +DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. +DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + +Examples +-------- +>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, +... 'B': {0: 1, 1: 3, 2: 5}, +... 'C': {0: 2, 1: 4, 2: 6}}) +>>> df + A B C +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +3 a C 2 +4 b C 4 +5 c C 6 + +The names of 'variable' and 'value' columns can be customized: + +>>> %(caller)sid_vars=['A'], value_vars=['B'], +... var_name='myVarname', value_name='myValname') + A myVarname myValname +0 a B 1 +1 b B 3 +2 c B 5 + +Original index values can be kept around: + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +0 a C 2 +1 b C 4 +2 c C 6 + +If you have multi-index columns: + +>>> df.columns = [list('ABC'), list('DEF')] +>>> df + A B C + D E F +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value +0 a B E 1 +1 b B E 3 +2 c B E 5 +""" + +_shared_docs[ + "transform" +] = """ +Call ``func`` on self producing a {klass} with transformed values. + +Produced {klass} will have same axis length as self. + +Parameters +---------- +func : function, str, list-like or dict-like + Function to use for transforming the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. If func + is both list-like and dict-like, dict-like behavior takes precedence. + + Accepted combinations are: + + - function + - string function name + - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` + - dict-like of axis labels -> functions, function names or list-like of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +{klass} + A {klass} that must have the same length as self. + +Raises +------ +ValueError : If the returned {klass} has a different length than self. + +See Also +-------- +{klass}.agg : Only perform aggregating type operations. +{klass}.apply : Invoke function on a {klass}. + +Examples +-------- +>>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) +>>> df + A B +0 0 1 +1 1 2 +2 2 3 +>>> df.transform(lambda x: x + 1) + A B +0 1 2 +1 2 3 +2 3 4 + +Even though the resulting {klass} must have the same length as the +input {klass}, it is possible to provide several input functions: + +>>> s = pd.Series(range(3)) +>>> s +0 0 +1 1 +2 2 +dtype: int64 +>>> s.transform([np.sqrt, np.exp]) + sqrt exp +0 0.000000 1.000000 +1 1.000000 2.718282 +2 1.414214 7.389056 + +You can call transform on a GroupBy object: + +>>> df = pd.DataFrame({{ +... "Date": [ +... 
"2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05", +... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"], +... "Data": [5, 8, 6, 1, 50, 100, 60, 120], +... }}) +>>> df + Date Data +0 2015-05-08 5 +1 2015-05-07 8 +2 2015-05-06 6 +3 2015-05-05 1 +4 2015-05-08 50 +5 2015-05-07 100 +6 2015-05-06 60 +7 2015-05-05 120 +>>> df.groupby('Date')['Data'].transform('sum') +0 55 +1 108 +2 66 +3 121 +4 55 +5 108 +6 66 +7 121 +Name: Data, dtype: int64 + +>>> df = pd.DataFrame({{ +... "c": [1, 1, 1, 2, 2, 2, 2], +... "type": ["m", "n", "o", "m", "m", "n", "n"] +... }}) +>>> df + c type +0 1 m +1 1 n +2 1 o +3 2 m +4 2 m +5 2 n +6 2 n +>>> df['size'] = df.groupby('c')['type'].transform(len) +>>> df + c type size +0 1 m 3 +1 1 n 3 +2 1 o 3 +3 2 m 4 +4 2 m 4 +5 2 n 4 +6 2 n 4 +""" + +_shared_docs[ + "storage_options" +] = """storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a non-fsspec URL. + See the fsspec and backend storage implementation docs for the set of + allowed keys and values.""" diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ee73aa42701b0..0a1cbc6de1cda 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,15 +1,26 @@ """ miscellaneous sorting / groupby utilities """ -from typing import Callable, Optional +from collections import defaultdict +from typing import ( + TYPE_CHECKING, + Callable, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, +) import numpy as np from pandas._libs import algos, hashtable, lib from pandas._libs.hashtable import unique_label_indices +from pandas._typing import IndexKeyFunc from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, - is_categorical_dtype, is_extension_array_dtype, ) from pandas.core.dtypes.generic import ABCMultiIndex @@ -18,9 +29,66 @@ import pandas.core.algorithms as algorithms from pandas.core.construction import extract_array +if TYPE_CHECKING: + from pandas import MultiIndex + from pandas.core.indexes.base import Index + _INT64_MAX = np.iinfo(np.int64).max +def get_indexer_indexer( + target: "Index", + level: Union[str, int, List[str], List[int]], + ascending: bool, + kind: str, + na_position: str, + sort_remaining: bool, + key: IndexKeyFunc, +) -> Optional[np.array]: + """ + Helper method that return the indexer according to input parameters for + the sort_index method of DataFrame and Series. + + Parameters + ---------- + target : Index + level : int or level name or list of ints or list of level names + ascending : bool or list of bools, default True + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + na_position : {'first', 'last'}, default 'last' + sort_remaining : bool, default True + key : callable, optional + + Returns + ------- + Optional[ndarray] + The indexer for the new index. 
+ """ + + target = ensure_key_mapped(target, key, levels=level) + target = target._sort_levels_monotonic() + + if level is not None: + _, indexer = target.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) + elif isinstance(target, ABCMultiIndex): + indexer = lexsort_indexer( + target._get_codes_for_sorting(), orders=ascending, na_position=na_position + ) + else: + # Check monotonic-ness before sort an index (GH 11080) + if (ascending and target.is_monotonic_increasing) or ( + not ascending and target.is_monotonic_decreasing + ): + return None + + indexer = nargsort( + target, kind=kind, ascending=ascending, na_position=na_position + ) + return indexer + + def get_group_index(labels, shape, sort: bool, xnull: bool): """ For the particular label_list, gets the offsets into the hypothetical list @@ -227,13 +295,7 @@ def lexsort_indexer( keys = [ensure_key_mapped(k, key) for k in keys] for k, order in zip(keys, orders): - # we are already a Categorical - if is_categorical_dtype(k): - cat = k - - # create the Categorical - else: - cat = Categorical(k, ordered=True) + cat = Categorical(k, ordered=True) if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") @@ -267,6 +329,7 @@ def nargsort( ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, + mask: Optional[np.ndarray] = None, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. @@ -281,19 +344,27 @@ def nargsort( ascending : bool, default True na_position : {'first', 'last'}, default 'last' key : Optional[Callable], default None + mask : Optional[np.ndarray], default None + Passed when called by ExtensionArray.argsort. """ if key is not None: items = ensure_key_mapped(items, key) return nargsort( - items, kind=kind, ascending=ascending, na_position=na_position, key=None + items, + kind=kind, + ascending=ascending, + na_position=na_position, + key=None, + mask=mask, ) items = extract_array(items) - mask = np.asarray(isna(items)) + if mask is None: + mask = np.asarray(isna(items)) if is_extension_array_dtype(items): - items = items._values_for_argsort() + return items.argsort(ascending=ascending, kind=kind, na_position=na_position) else: items = np.asanyarray(items) @@ -346,7 +417,9 @@ def nargminmax(values, method: str): return non_nan_idx[func(non_nans)] -def ensure_key_mapped_multiindex(index, key: Callable, level=None): +def _ensure_key_mapped_multiindex( + index: "MultiIndex", key: Callable, level=None +) -> "MultiIndex": """ Returns a new MultiIndex in which key has been applied to all levels specified in level (or all levels if level @@ -372,7 +445,6 @@ def ensure_key_mapped_multiindex(index, key: Callable, level=None): labels : MultiIndex Resulting MultiIndex with modified levels. 
""" - from pandas.core.indexes.api import MultiIndex if level is not None: if isinstance(level, (str, int)): @@ -391,7 +463,7 @@ def ensure_key_mapped_multiindex(index, key: Callable, level=None): for level in range(index.nlevels) ] - labels = MultiIndex.from_arrays(mapped) + labels = type(index).from_arrays(mapped) return labels @@ -415,7 +487,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return values if isinstance(values, ABCMultiIndex): - return ensure_key_mapped_multiindex(values, key, level=levels) + return _ensure_key_mapped_multiindex(values, key, level=levels) result = key(values.copy()) if len(result) != len(values): @@ -440,48 +512,39 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -class _KeyMapper: - """ - Map compressed group id -> key tuple. - """ - - def __init__(self, comp_ids, ngroups: int, levels, labels): - self.levels = levels - self.labels = labels - self.comp_ids = comp_ids.astype(np.int64) - - self.k = len(labels) - self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] - - self._populate_tables() - - def _populate_tables(self): - for labs, table in zip(self.labels, self.tables): - table.map(self.comp_ids, labs.astype(np.int64)) - - def get_key(self, comp_id): - return tuple( - level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels) - ) - - -def get_flattened_iterator(comp_ids, ngroups, levels, labels): - # provide "flattened" iterator for multi-group setting - mapper = _KeyMapper(comp_ids, ngroups, levels, labels) - return [mapper.get_key(i) for i in range(ngroups)] - - -def get_indexer_dict(label_list, keys): +def get_flattened_list( + comp_ids: np.ndarray, + ngroups: int, + levels: Iterable["Index"], + labels: Iterable[np.ndarray], +) -> List[Tuple]: + """Map compressed group id -> key tuple.""" + comp_ids = comp_ids.astype(np.int64, copy=False) + arrays: DefaultDict[int, List[int]] = defaultdict(list) + for labs, level in zip(labels, levels): + table = hashtable.Int64HashTable(ngroups) + table.map(comp_ids, labs.astype(np.int64, copy=False)) + for i in range(ngroups): + arrays[i].append(level[table.get_item(i)]) + return [tuple(array) for array in arrays.values()] + + +def get_indexer_dict( + label_list: List[np.ndarray], keys: List["Index"] +) -> Dict[Union[str, Tuple], np.ndarray]: """ Returns ------- - dict + dict: Labels mapped to indexers. """ shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) + if np.all(group_index == -1): + # When all keys are nan and dropna=True, indices_fast can't handle this + # and the return is empty anyway + return {} ngroups = ( ((group_index.size and group_index.max()) + 1) if is_int64_overflow_possible(shape) @@ -531,7 +594,7 @@ def compress_group_index(group_index, sort: bool = True): space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). 
""" - size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) + size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT) table = hashtable.Int64HashTable(size_hint) group_index = ensure_int64(group_index) @@ -542,7 +605,7 @@ def compress_group_index(group_index, sort: bool = True): if sort and len(obs_group_ids) > 0: obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - return comp_ids, obs_group_ids + return ensure_int64(comp_ids), ensure_int64(obs_group_ids) def _reorder_by_uniques(uniques, labels): diff --git a/pandas/core/strings.py b/pandas/core/strings.py deleted file mode 100644 index a1db7742916de..0000000000000 --- a/pandas/core/strings.py +++ /dev/null @@ -1,3653 +0,0 @@ -import codecs -from functools import wraps -import re -import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype, Scalar -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_extension_array_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_object_dtype, - is_re, - is_scalar, - is_string_dtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.algorithms import take_1d -from pandas.core.base import NoNewAttributesMixin -from pandas.core.construction import extract_array - -if TYPE_CHECKING: - from pandas.arrays import StringArray - -_cpython_optimized_encoders = ( - "utf-8", - "utf8", - "latin-1", - "latin1", - "iso-8859-1", - "mbcs", - "ascii", -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") - -_shared_docs: Dict[str, str] = dict() - - -def cat_core(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat` - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - if sep == "": - # no need to interleave sep if it is empty - arr_of_cols = np.asarray(list_of_columns, dtype=object) - return np.sum(arr_of_cols, axis=0) - list_with_sep = [sep] * (2 * len(list_of_columns) - 1) - list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep, dtype=object) - return np.sum(arr_with_sep, axis=0) - - -def cat_safe(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat`. - - Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the arrays in list_of columns have the wrong dtypes or content. - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. 
- """ - try: - result = cat_core(list_of_columns, sep) - except TypeError: - # if there are any non-string values (wrong dtype or hidden behind - # object dtype), np.sum will fail; catch and return with better message - for column in list_of_columns: - dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ["string", "empty"]: - raise TypeError( - "Concatenation requires list-likes containing only " - "strings (or missing values). Offending values found in " - f"column {dtype}" - ) from None - return result - - -def _na_map(f, arr, na_result=None, dtype=np.dtype(object)): - if is_extension_array_dtype(arr.dtype): - if na_result is None: - na_result = libmissing.NA - # just StringDtype - arr = extract_array(arr) - return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) - if na_result is None: - na_result = np.nan - return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) - - -def _map_stringarray( - func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype -) -> ArrayLike: - """ - Map a callable over valid elements of a StringArray. - - Parameters - ---------- - func : Callable[[str], Any] - Apply to each valid element. - arr : StringArray - na_value : Any - The value to use for missing values. By default, this is - the original value (NA). - dtype : Dtype - The result dtype to use. Specifying this avoids an intermediate - object-dtype allocation. - - Returns - ------- - ArrayLike - An ExtensionArray for integer or string dtypes, otherwise - an ndarray. - - """ - from pandas.arrays import IntegerArray, StringArray, BooleanArray - - mask = isna(arr) - - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - func, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, func, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, func, mask.view("uint8")) - - -def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=np.dtype(object)): - if not len(arr): - return np.ndarray(0, dtype=dtype) - - if isinstance(arr, ABCSeries): - arr = arr._values # TODO: extract_array? - if not isinstance(arr, np.ndarray): - arr = np.asarray(arr, dtype=object) - if na_mask: - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" 
- ) - - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return _map_object(g, arr, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - else: - return lib.map_infer(arr, f) - - -def str_count(arr, pat, flags=0): - """ - Count occurrences of pattern in each string of the Series/Index. - - This function is used to count the number of times a particular regex - pattern is repeated in each of the string elements of the - :class:`~pandas.Series`. - - Parameters - ---------- - pat : str - Valid regular expression. - flags : int, default 0, meaning no flags - Flags for the `re` module. For a complete list, `see here - `_. - **kwargs - For compatibility with other string methods. Not used. - - Returns - ------- - Series or Index - Same type as the calling object containing the integer counts. - - See Also - -------- - re : Standard library module for regular expressions. - str.count : Standard library version, without regular expression support. - - Notes - ----- - Some characters need to be escaped when passing in `pat`. - eg. ``'$'`` has a special meaning in regex and must be escaped when - finding this literal character. - - Examples - -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0.0 - 1 0.0 - 2 2.0 - 3 2.0 - 4 NaN - 5 0.0 - 6 1.0 - dtype: float64 - - Escape ``'$'`` to find the literal dollar sign. - - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int64 - - This is also available on Index - - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') - """ - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype="int64") - - -def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): - """ - Test if pattern or regex is contained within a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern or regex is - contained within a string of a Series or Index. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Flags to pass through to the re module, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - regex : bool, default True - If True, assumes the pat is a regular expression. - - If False, treats the pat as a literal string. - - Returns - ------- - Series or Index of boolean values - A Series or Index of boolean values indicating whether the - given pattern is contained within the string of each element - of the Series or Index. - - See Also - -------- - match : Analogous, but stricter, relying on re.match instead of re.search. - Series.str.startswith : Test if the start of each string element matches a - pattern. - Series.str.endswith : Same as startswith, but tests the end of string. - - Examples - -------- - Returning a Series of booleans using only a literal pattern. 
- - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 NaN - dtype: object - - Returning an Index of booleans using only a literal pattern. - - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) - >>> ind.str.contains('23', regex=False) - Index([False, False, False, True, nan], dtype='object') - - Specifying case sensitivity using `case`. - - >>> s1.str.contains('oG', case=True, regex=True) - 0 False - 1 False - 2 False - 3 False - 4 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN` replaces NaN values - with `False`. If Series or Index does not contain NaN values - the resultant dtype will be `bool`, otherwise, an `object` dtype. - - >>> s1.str.contains('og', na=False, regex=True) - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 NaN - dtype: object - - Ignoring case sensitivity using `flags` with regex. - - >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) - 0 False - 1 False - 2 True - 3 False - 4 NaN - dtype: object - - Returning any digit using regular expression. - - >>> s1.str.contains('\\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 NaN - dtype: object - - Ensure `pat` is a not a literal pattern when `regex` is set to True. - Note in the following example one might expect only `s2[1]` and `s2[3]` to - return `True`. However, '.0' as a regex matches any character - followed by a 0. - - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - """ - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x - uppered = _na_map(lambda x: x.upper(), arr) - return _na_map(f, uppered, na, dtype=np.dtype(bool)) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_startswith(arr, pat, na=np.nan): - """ - Test if the start of each string element matches a pattern. - - Equivalent to :meth:`str.startswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the start of each string element. - - See Also - -------- - str.startswith : Python standard library string method. - Series.str.endswith : Same as startswith, but tests the end of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 NaN - dtype: object - - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. 
- - >>> s.str.startswith('b', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.startswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_endswith(arr, pat, na=np.nan): - """ - Test if the end of each string element matches a pattern. - - Equivalent to :meth:`str.endswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the end of each string element. - - See Also - -------- - str.endswith : Python standard library string method. - Series.str.startswith : Same as endswith, but tests the start of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) - >>> s - 0 bat - 1 bear - 2 caT - 3 NaN - dtype: object - - >>> s.str.endswith('t') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.endswith('t', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.endswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): - r""" - Replace each occurrence of pattern/regex in the Series/Index. - - Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. - - Parameters - ---------- - pat : str or compiled regex - String can be a character sequence or regular expression. - repl : str or callable - Replacement string or a callable. The callable is passed the regex - match object and must return a replacement string to be used. - See :func:`re.sub`. - n : int, default -1 (all) - Number of replacements to make from start. - case : bool, default None - Determines if replace is case sensitive: - - - If True, case sensitive (the default if `pat` is a string) - - Set to False for case insensitive - - Cannot be set if `pat` is a compiled regex. - - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled - regex. - regex : bool, default True - Determines if assumes the passed-in pattern is a regular expression: - - - If True, assumes the passed-in pattern is a regular expression. - - If False, treats the pattern as a literal string - - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. - - .. versionadded:: 0.23.0 - - Returns - ------- - Series or Index of object - A copy of the object with all matching occurrences of `pat` replaced by - `repl`. - - Raises - ------ - ValueError - * if `regex` is False and `repl` is a callable or `pat` is a compiled - regex - * if `pat` is a compiled regex and `case` or `flags` is set - - Notes - ----- - When `pat` is a compiled regex, all flags should be included in the - compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled - regex will raise an error. - - Examples - -------- - When `pat` is a string and `regex` is True (the default), the given `pat` - is compiled as a regex. When `repl` is a string, it replaces matching - regex patterns as with :meth:`re.sub`. 
NaN value(s) in the Series are - left as is: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) - 0 bao - 1 baz - 2 NaN - dtype: object - - When `pat` is a string and `regex` is False, every `pat` is replaced with - `repl` as with :meth:`str.replace`: - - >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) - 0 bao - 1 fuz - 2 NaN - dtype: object - - When `repl` is a callable, it is called on every `pat` using - :func:`re.sub`. The callable should expect one positional argument - (a regex object) and return a string. - - To get the idea: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) - 0 <re.Match object; span=(0, 1), match='f'>oo - 1 <re.Match object; span=(0, 1), match='f'>uz - 2 NaN - dtype: object - - Reverse every lowercase alphabetic word: - - >>> repl = lambda m: m.group(0)[::-1] - >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) - 0 oof 123 - 1 rab zab - 2 NaN - dtype: object - - Using regex groups (extract second group and swap case): - - >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" - >>> repl = lambda m: m.group('two').swapcase() - >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) - 0 tWO - 1 bAR - dtype: object - - Using a compiled regex with flags - - >>> import re - >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') - 0 foo - 1 bar - 2 NaN - dtype: object - """ - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - - is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) - else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") - f = lambda x: x.replace(pat, repl, n) - - return _na_map(f, arr, dtype=str) - - -def str_repeat(arr, repeats): - """ - Duplicate each string in the Series or Index. - - Parameters - ---------- - repeats : int or sequence of int - Same value for all (int) or different value per (sequence). - - Returns - ------- - Series or Index of object - Series or Index of repeated string objects specified by - input parameter repeats.
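The dispatch inside `str_replace` above is worth spelling out: regex mode (or a callable `repl`) goes through `re.sub`, while a single-character, flag-free string pattern stays on the much faster `str.replace`, and the two APIs disagree on their "replace everything" sentinel. A per-element sketch covering only string patterns (`replace_elementwise` is an illustrative name; the compiled-regex input path is omitted):

```python
import re


def replace_elementwise(x, pat, repl, n=-1, case=None, flags=0, regex=True):
    """Per-element replacement dispatch in the spirit of str_replace."""
    if regex:
        if case is False:
            flags |= re.IGNORECASE
        if len(pat) > 1 or flags or callable(repl):
            count = n if n >= 0 else 0  # re.sub: count=0 means "replace all"
            return re.compile(pat, flags=flags).sub(repl, x, count=count)
    return x.replace(pat, repl, n)  # str.replace: n=-1 means "replace all"


print(replace_elementwise("foo", "f.", "ba"))               # 'bao'
print(replace_elementwise("f.o", "f.", "ba", regex=False))  # 'bao'
```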
- - Examples - -------- - >>> s = pd.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - - Single int repeats string in Series - - >>> s.str.repeat(repeats=2) - 0 aa - 1 bb - 2 cc - dtype: object - - Sequence of int repeats corresponding string in Series - - >>> s.str.repeat(repeats=[1, 2, 3]) - 0 a - 1 bb - 2 ccc - dtype: object - """ - if is_scalar(repeats): - - def scalar_rep(x): - try: - return bytes.__mul__(x, repeats) - except TypeError: - return str.__mul__(x, repeats) - - return _na_map(scalar_rep, arr, dtype=str) - else: - - def rep(x, r): - if x is libmissing.NA: - return x - try: - return bytes.__mul__(x, r) - except TypeError: - return str.__mul__(x, r) - - repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(arr), repeats, rep) - return result - - -def str_match( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string starts with a match of a regular expression. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - fullmatch : Stricter matching that requires the entire string to match. - contains : Analogous, but less strict, relying on re.search instead of - re.match. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.match(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_fullmatch( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string entirely matches a regular expression. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - match : Similar, but also returns `True` when only a *prefix* of the string - matches the regular expression. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.fullmatch(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def _get_single_group_name(rx): - try: - return list(rx.groupindex.keys()).pop() - except IndexError: - return None - - -def _groups_or_na_fun(regex): - """Used in both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - return f - - -def _result_dtype(arr): - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. 
- if arr.dtype.name == "string": - return "string" - else: - return object - - -def _str_extract_noexpand(arr, pat, flags=0): - """ - Find groups in each string in the Series using passed regular - expression. This function is called from - str_extract(expand=False), and can return Series, DataFrame, or - Index. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - - if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) - name = _get_single_group_name(regex) - else: - if isinstance(arr, ABCIndexClass): - raise ValueError("only one regex group is supported with Index") - name = None - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - if arr.empty: - result = DataFrame(columns=columns, dtype=object) - else: - dtype = _result_dtype(arr) - result = DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=arr.index, - dtype=dtype, - ) - return result, name - - -def _str_extract_frame(arr, pat, flags=0): - """ - For each subject string in the Series, extract groups from the - first match of regular expression pat. This function is called from - str_extract(expand=True), and always returns a DataFrame. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - - if len(arr) == 0: - return DataFrame(columns=columns, dtype=object) - try: - result_index = arr.index - except AttributeError: - result_index = None - dtype = _result_dtype(arr) - return DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=result_index, - dtype=dtype, - ) - - -def str_extract(arr, pat, flags=0, expand=True): - r""" - Extract capture groups in the regex `pat` as columns in a DataFrame. - - For each subject string in the Series, extract groups from the - first match of regular expression `pat`. - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that - modify regular expression matching for things like case, - spaces, etc. For more details, see :mod:`re`. - expand : bool, default True - If True, return DataFrame with one column per capture group. - If False, return a Series/Index if there is one capture group - or DataFrame if there are multiple capture groups. - - Returns - ------- - DataFrame or Series or Index - A DataFrame with one row for each subject string, and one - column for each group. Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. The dtype of each result - column is always object, even when no match is found. If - ``expand=False`` and pat has only one capture group, then - return a Series (if subject is a Series) or Index (if subject - is an Index). - - See Also - -------- - extractall : Returns all matches (not just the first match). - - Examples - -------- - A pattern with two groups will return a DataFrame with two columns. - Non-matches will be NaN. - - >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern may contain optional groups. 
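Both extract paths lean on `_groups_or_na_fun` (defined earlier): each element becomes one row, either the groups of the first match or all-NaN. A standalone sketch of that row builder (`first_match_row` is an illustrative name):

```python
import re

import numpy as np


def first_match_row(x, regex):
    """One output row per element: first-match groups, or all-NaN."""
    if not isinstance(x, str):
        return [np.nan] * regex.groups
    m = regex.search(x)
    if m is None:
        return [np.nan] * regex.groups
    # unmatched optional groups come back as None -> NaN
    return [np.nan if g is None else g for g in m.groups()]


rx = re.compile(r"(?P<letter>[ab])?(?P<digit>\d)")
print([first_match_row(v, rx) for v in ["a1", "c3", "xyz", np.nan]])
# [['a', '1'], [nan, '3'], [nan, nan], [nan, nan]]
```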
- - >>> s.str.extract(r'([ab])?(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN 3 - - Named groups will become column names in the result. - - >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)') - letter digit - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern with one group will return a DataFrame with one column - if expand=True. - - >>> s.str.extract(r'[ab](\d)', expand=True) - 0 - 0 1 - 1 2 - 2 NaN - - A pattern with one group will return a Series if expand=False. - - >>> s.str.extract(r'[ab](\d)', expand=False) - 0 1 - 1 2 - 2 NaN - dtype: object - """ - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - if expand: - return _str_extract_frame(arr._orig, pat, flags=flags) - else: - result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand) - - -def str_extractall(arr, pat, flags=0): - r""" - Extract capture groups in the regex `pat` as columns in DataFrame. - - For each subject string in the Series, extract groups from all - matches of regular expression pat. When each subject string in the - Series has exactly one match, extractall(pat).xs(0, level='match') - is the same as extract(pat). - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - A ``re`` module flag, for example ``re.IGNORECASE``. These allow - to modify regular expression matching for things like case, spaces, - etc. Multiple flags can be combined with the bitwise OR operator, - for example ``re.IGNORECASE | re.MULTILINE``. - - Returns - ------- - DataFrame - A ``DataFrame`` with one row for each match, and one column for each - group. Its rows have a ``MultiIndex`` with first levels that come from - the subject ``Series``. The last level is named 'match' and indexes the - matches in each item of the ``Series``. Any capture group names in - regular expression pat will be used for column names; otherwise capture - group numbers will be used. - - See Also - -------- - extract : Returns first match only (not all matches). - - Examples - -------- - A pattern with one group will return a DataFrame with one column. - Indices with no matches will not appear in the result. - - >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) - >>> s.str.extractall(r"[ab](\d)") - 0 - match - A 0 1 - 1 2 - B 0 1 - - Capture group names are used for column names of the result. - - >>> s.str.extractall(r"[ab](?P<digit>\d)") - digit - match - A 0 1 - 1 2 - B 0 1 - - A pattern with two groups will return a DataFrame with two columns. - - >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - - Optional groups that do not match are NaN in the result. - - >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - C 0 NaN 1 - """ - regex = re.compile(pat, flags=flags) - # the regex must contain capture groups.
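The `str_extractall` body that follows accumulates one output row per match, keyed by the subject's label plus a running match number. A standalone sketch of that accumulation, assuming a plain Series input:

```python
import re

import numpy as np
import pandas as pd

s = pd.Series(["a1a2", "b1"], index=["A", "B"])
rx = re.compile(r"[ab](?P<digit>\d)")

rows, keys = [], []
for key, subject in s.items():
    for match_i, groups in enumerate(rx.findall(subject)):
        if isinstance(groups, str):  # findall flattens single-group matches
            groups = (groups,)
        rows.append([np.nan if g == "" else g for g in groups])
        keys.append((key, match_i))  # subject label + running match number

index = pd.MultiIndex.from_tuples(keys, names=[None, "match"])
print(pd.DataFrame(rows, index=index, columns=["digit"]))
```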
- if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - - if isinstance(arr, ABCIndexClass): - arr = arr.to_series().reset_index(drop=True) - - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - match_list = [] - index_list = [] - is_mi = arr.index.nlevels > 1 - - for subject_key, subject in arr.items(): - if isinstance(subject, str): - - if not is_mi: - subject_key = (subject_key,) - - for match_i, match_tuple in enumerate(regex.findall(subject)): - if isinstance(match_tuple, str): - match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] - match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i,)) - index_list.append(result_key) - - from pandas import MultiIndex - - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - dtype = _result_dtype(arr) - - result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=dtype - ) - return result - - -def str_get_dummies(arr, sep="|"): - """ - Return DataFrame of dummy/indicator variables for Series. - - Each string in Series is split by sep and returned as a DataFrame - of dummy/indicator variables. - - Parameters - ---------- - sep : str, default "|" - String to split on. - - Returns - ------- - DataFrame - Dummy variables corresponding to values of the Series. - - See Also - -------- - get_dummies : Convert categorical variable into dummy/indicator - variables. - - Examples - -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 1 0 0 - 2 1 0 1 - - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - """ - arr = arr.fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep - - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) - - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags - - -def str_join(arr, sep): - """ - Join lists contained as elements in the Series/Index with passed delimiter. - - If the elements of a Series are lists themselves, join the content of these - lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - - Parameters - ---------- - sep : str - Delimiter to use between list entries. - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of the - delimiter. - - Raises - ------ - AttributeError - If the supplied Series contains neither strings nor lists. - - See Also - -------- - str.join : Standard library version of this method. - Series.str.split : Split strings around given separator/delimiter. - - Notes - ----- - If any of the list items is not a string object, the result of the join - will be `NaN`. - - Examples - -------- - Example with a list that contains non-string elements. - - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) - >>> s - 0 [lion, elephant, zebra] - 1 [1.1, 2.2, 3.3] - 2 [cat, nan, dog] - 3 [cow, 4.5, goat] - 4 [duck, [swan, fish], guppy] - dtype: object - - Join all lists using a '-'. 
The lists containing object(s) of types other - than str will produce a NaN. - - >>> s.str.join('-') - 0 lion-elephant-zebra - 1 NaN - 2 NaN - 3 NaN - 4 NaN - dtype: object - """ - return _na_map(sep.join, arr, dtype=str) - - -def str_findall(arr, pat, flags=0): - """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. - - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 - Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which - means no flags). - - Returns - ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. - - See Also - -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. - - Examples - -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - """ - regex = re.compile(pat, flags=flags) - return _na_map(regex.findall, arr) - - -def str_find(arr, sub, start=0, end=None, side="left"): - """ - Return indexes in each strings in the Series/Index where the - substring is fully contained between [start:end]. Return -1 on failure. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - side : {'left', 'right'}, default 'left' - Specifies a starting side, equivalent to ``find`` or ``rfind``. - - Returns - ------- - Series or Index - Indexes where substring is found. 
- """ - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_index(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "index" - elif side == "right": - method = "rindex" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_pad(arr, width, side="left", fillchar=" "): - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with character defined in `fillchar`. - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - fillchar : str, default ' ' - Additional character for filling, default is whitespace. - - Returns - ------- - Series or Index of object - Returns Series or Index with minimum number of char in object. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='left')``. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='right')``. - Series.str.center : Fills boths sides of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' - character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. 
- - Examples - -------- - >>> s = pd.Series(["caribou", "tiger"]) - >>> s - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - - return _na_map(f, arr, dtype=str) - - -def str_split(arr, pat=None, n=None): - - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - res = _na_map(f, arr) - return res - - -def str_rsplit(arr, pat=None, n=None): - - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - res = _na_map(f, arr) - return res - - -def str_slice(arr, start=None, stop=None, step=None): - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. - step : int, optional - Step size for slice operation. - - Returns - ------- - Series or Index of object - Series or Index from sliced substring from original string object. - - See Also - -------- - Series.str.slice_replace : Replace a slice with a string. - Series.str.get : Return element at position. - Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` - being the position. - - Examples - -------- - >>> s = pd.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - - Equivalent behaviour to: - - >>> s.str[0:5:3] - 0 kl - 1 f - 2 cm - dtype: object - """ - obj = slice(start, stop, step) - f = lambda x: x[obj] - return _na_map(f, arr, dtype=str) - - -def str_slice_replace(arr, start=None, stop=None, repl=None): - """ - Replace a positional slice of a string with another value. - - Parameters - ---------- - start : int, optional - Left index position to use for the slice. If not specified (None), - the slice is unbounded on the left, i.e. slice from the start - of the string. - stop : int, optional - Right index position to use for the slice. If not specified (None), - the slice is unbounded on the right, i.e. slice until the - end of the string. - repl : str, optional - String for replacement. If not specified (None), the sliced region - is replaced with an empty string. 
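`str_split` above special-cases its pattern: no pattern or a single character goes to `str.split`, anything longer is compiled and split with `re`, and the two APIs use opposite "no limit" sentinels. A per-element sketch of that dispatch (`split_elementwise` is an illustrative name):

```python
import re


def split_elementwise(x, pat=None, n=None):
    """Dispatch between str.split and re.split, as str_split does."""
    if pat is None or len(pat) == 1:
        # str.split: -1 means "no limit on splits"
        return x.split(pat, -1 if n is None or n == 0 else n)
    # re.split: maxsplit=0 means "no limit on splits"
    return re.compile(pat).split(x, maxsplit=0 if n is None or n == -1 else n)


print(split_elementwise("this is a regular sentence", n=2))
# ['this', 'is', 'a regular sentence']
print(split_elementwise("1+1=2", r"\+|="))
# ['1', '1', '2']
```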
- - Returns - ------- - Series or Index - Same type as the original object. - - See Also - -------- - Series.str.slice : Just slicing without replacement. - - Examples - -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the end of the - string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the start of the string to `stop` is replaced - with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` to `stop` is - replaced with `repl`. Everything before or after `start` and `stop` is - included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return _na_map(f, arr, dtype=str) - - -def str_strip(arr, to_strip=None, side="both"): - """ - Strip whitespace (including newlines) from each string in the - Series/Index. - - Parameters - ---------- - to_strip : str or unicode - side : {'left', 'right', 'both'}, default 'both' - - Returns - ------- - Series or Index - """ - if side == "both": - f = lambda x: x.strip(to_strip) - elif side == "left": - f = lambda x: x.lstrip(to_strip) - elif side == "right": - f = lambda x: x.rstrip(to_strip) - else: # pragma: no cover - raise ValueError("Invalid side") - return _na_map(f, arr, dtype=str) - - -def str_wrap(arr, width, **kwargs): - r""" - Wrap strings in Series/Index at specified line width. - - This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. - - Parameters - ---------- - width : int - Maximum line width. - expand_tabs : bool, optional - If True, tab characters will be expanded to spaces (default: True). - replace_whitespace : bool, optional - If True, each whitespace character (as defined by string.whitespace) - remaining after tab expansion will be replaced by a single space - (default: True). - drop_whitespace : bool, optional - If True, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True). - break_long_words : bool, optional - If True, then words longer than width will be broken in order to ensure - that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width (default: True). - break_on_hyphens : bool, optional - If True, wrapping will occur preferably on whitespace and right after - hyphens in compound words, as it is customary in English. If false, - only whitespaces will be considered as potentially good places for line - breaks, but you need to set break_long_words to false if you want truly - insecable words (default: True). - - Returns - ------- - Series or Index - - Notes - ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with - default settings. 
To achieve behavior matching R's stringr library str_wrap - function, use the arguments: - - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - kwargs["width"] = width - - tw = textwrap.TextWrapper(**kwargs) - - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) - - -def str_translate(arr, table): - """ - Map all characters in the string through the given mapping table. - - Equivalent to standard :meth:`str.translate`. - - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or - None. Unmapped characters are left untouched. - Characters mapped to None are deleted. :meth:`str.maketrans` is a - helper function for making translation tables. - - Returns - ------- - Series or Index - """ - return _na_map(lambda x: x.translate(table), arr, dtype=str) - - -def str_get(arr, i): - """ - Extract element from each component at specified position. - - Extract element from lists, tuples, or strings in each element in the - Series/Index. - - Parameters - ---------- - i : int - Position of element to extract. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) - >>> s - 0 String - 1 (1, 2, 3) - 2 [a, b, c] - 3 123 - 4 -456 - 5 {1: 'Hello', '2': 'World'} - dtype: object - - >>> s.str.get(1) - 0 t - 1 2 - 2 b - 3 NaN - 4 NaN - 5 Hello - dtype: object - - >>> s.str.get(-1) - 0 g - 1 3 - 2 c - 3 NaN - 4 NaN - 5 None - dtype: object - """ - - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return np.nan - - return _na_map(f, arr) - - -def str_decode(arr, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return _na_map(f, arr) - - -def str_encode(arr, encoding, errors="strict"): - """ - Encode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.encode`. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - encoded : Series/Index of objects - """ - if encoding in _cpython_optimized_encoders: - # CPython optimized implementation - f = lambda x: x.encode(encoding, errors) - else: - encoder = codecs.getencoder(encoding) - f = lambda x: encoder(x, errors)[0] - return _na_map(f, arr) - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. - However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. 
This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. - - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _noarg_wrapper( - f, - name=None, - docstring=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper(self): - result = _na_map(f, self._parent, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - wrapper.__name__ = f.__name__ if name is None else name - if docstring is not None: - wrapper.__doc__ = docstring - else: - raise ValueError("Provide docstring") - - return wrapper - - -def _pat_wrapper( - f, - flags=False, - na=False, - name=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper1(self, pat): - result = f(self._parent, pat) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper2(self, pat, flags=0, **kwargs): - result = f(self._parent, pat, flags=flags, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper3(self, pat, na=np.nan): - result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string) - - wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - - wrapper.__name__ = f.__name__ if name is None else name - if f.__doc__: - wrapper.__doc__ = f.__doc__ - - return wrapper - - -def copy(source): - """Copy a docstring from another source function (if present)""" - - def do_copy(target): - if source.__doc__: - target.__doc__ = source.__doc__ - return target - - return 
do_copy - - -class StringMethods(NoNewAttributesMixin): - """ - Vectorized string functions for Series and Index. - - NAs stay NA unless handled otherwise by a particular method. - Patterned after Python's string methods, with some inspiration from - R's stringr package. - - Examples - -------- - >>> s = pd.Series(["A_Str_Series"]) - >>> s - 0 A_Str_Series - dtype: object - - >>> s.str.split("_") - 0 [A, Str, Series] - dtype: object - - >>> s.str.replace("_", "") - 0 AStrSeries - dtype: object - """ - - def __init__(self, data): - self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" - - # ._values.categories works for both Series/Index - self._parent = data._values.categories if self._is_categorical else data - # save orig to blow up categoricals to the right type - self._orig = data - self._freeze() - - @staticmethod - def _validate(data): - """ - Auxiliary function for StringMethods, infers and checks dtype of data. - - This is a "first line of defence" at the creation of the StringMethods- - object (see _make_accessor), and just checks that the dtype is in the - *union* of the allowed types over all string methods below; this - restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types (more info in the corresponding docstring). - - This really should exclude all series/index with any non-string values, - but that isn't practical for performance reasons until we have a str - dtype (GH 9343 / 13877) - - Parameters - ---------- - data : The content of the Series - - Returns - ------- - dtype : inferred dtype of data - """ - from pandas import StringDtype - - if isinstance(data, ABCMultiIndex): - raise AttributeError( - "Can only use .str accessor with Index, not MultiIndex" - ) - - # see _libs/lib.pyx for list of inferred types - allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal - - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None - - if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") - return inferred_dtype - - def __getitem__(self, key): - if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) - else: - return self.get(key) - - def __iter__(self): - warnings.warn( - "Columnar iteration over characters will be deprecated in future releases.", - FutureWarning, - stacklevel=2, - ) - i = 0 - g = self.get(i) - while g.notna().any(): - yield g - i += 1 - g = self.get(i) - - def _wrap_result( - self, - result, - use_codes=True, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, - ): - - from pandas import Index, Series, MultiIndex - - # for category, we do the stuff on the categories, so blow it up - # to the full series again - # But for some operations, we have to do the stuff on the full values, - # so make it possible to skip this step as the method already did this - # before the transformation... 
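The `forbid_nonstring_types` decorator above works by subtracting the forbidden inferred dtypes from the union that `_validate` accepts and rejecting calls whose data falls outside. A self-contained sketch of that gate (`forbid_types` and the `Demo` class are illustrative stand-ins):

```python
from functools import wraps

ALL_ALLOWED = {"string", "empty", "bytes", "mixed", "mixed-integer"}


def forbid_types(forbidden):
    """Reject calls whose inferred dtype is in `forbidden`."""
    allowed = ALL_ALLOWED - set(forbidden or [])

    def decorator(func):
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if self._inferred_dtype not in allowed:
                raise TypeError(
                    f"Cannot use .str.{func.__name__} with values of "
                    f"inferred dtype '{self._inferred_dtype}'."
                )
            return func(self, *args, **kwargs)

        return wrapper

    return decorator


class Demo:
    _inferred_dtype = "bytes"

    @forbid_types(["bytes"])
    def upper(self):
        return "..."


try:
    Demo().upper()
except TypeError as exc:
    print(exc)  # Cannot use .str.upper with values of inferred dtype 'bytes'
```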
- if use_codes and self._is_categorical: - # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d( - result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value - ) - - if not hasattr(result, "ndim") or not hasattr(result, "dtype"): - return result - assert result.ndim < 3 - - # We can be wrapping a string / object / categorical result, in which - # case we'll want to return the same dtype as the input. - # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. - if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - - if expand is None: - # infer from ndim if expand is not specified - expand = result.ndim != 1 - - elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] - - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - - if expand is False: - # if expand is False, result should have the same name - # as the original otherwise specified - if name is None: - name = getattr(result, "name", None) - if name is None: - # do not use logical or, _orig may be a DataFrame - # which has "name" column - name = self._orig.name - - # Wait until we are sure result is a Series or Index before - # checking attributes (GH 12180) - if isinstance(self._orig, ABCIndexClass): - # if result is a boolean np.array, return the np.array - # instead of wrapping it into a boolean Index (GH 8875) - if is_bool_dtype(result): - return result - - if expand: - result = list(result) - out = MultiIndex.from_tuples(result, names=name) - if out.nlevels == 1: - # We had all tuples of length-one, which are - # better represented as a regular Index. - out = out.get_level_values(0) - return out - else: - return Index(result, name=name) - else: - index = self._orig.index - if expand: - cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) - else: - # Must be a Series - cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) - return result - - def _get_series_list(self, others): - """ - Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into a list of Series (elements without an index must match the length - of the calling Series/Index). - - Parameters - ---------- - others : Series, DataFrame, np.ndarray, list-like or list-like of - Objects that are either Series, Index or np.ndarray (1-dim). - - Returns - ------- - list of Series - Others transformed into list of Series. - """ - from pandas import Series, DataFrame - - # self._orig is either Series or Index - idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - - # Generally speaking, all objects without an index inherit the index - # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 
- if isinstance(others, ABCSeries): - return [others] - elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=idx)] - elif isinstance(others, ABCDataFrame): - return [others[x] for x in others] - elif isinstance(others, np.ndarray) and others.ndim == 2: - others = DataFrame(others, index=idx) - return [others[x] for x in others] - elif is_list_like(others, allow_sets=False): - others = list(others) # ensure iterators do not get read twice etc - - # in case of list-like `others`, all elements must be - # either Series/Index/np.ndarray (1-dim)... - if all( - isinstance(x, (ABCSeries, ABCIndexClass)) - or (isinstance(x, np.ndarray) and x.ndim == 1) - for x in others - ): - los = [] - while others: # iterate through list and append each element - los = los + self._get_series_list(others.pop(0)) - return los - # ... or just strings - elif all(not is_list_like(x) for x in others): - return [Series(others, index=idx)] - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarray " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) - - @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join="left"): - """ - Concatenate strings in the Series/Index with given separator. - - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not passed, then all values in the Series/Index are - concatenated into a single string with a given `sep`. - - Parameters - ---------- - others : Series, Index, DataFrame, np.ndarray or list-like - Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and - other list-likes of strings must have the same length as the - calling Series/Index, with the exception of indexed objects (i.e. - Series/Index/DataFrame) if `join` is not None. - - If others is a list-like that contains a combination of Series, - Index or np.ndarray (1-dim), then all elements will be unpacked and - must satisfy the above criteria individually. - - If others is None, the method returns the concatenation of all - strings in the calling Series/Index. - sep : str, default '' - The separator between the different elements/columns. By default - the empty string `''` is used. - na_rep : str or None, default None - Representation that is inserted for all missing values: - - - If `na_rep` is None, and `others` is None, missing values in the - Series/Index are omitted from the result. - - If `na_rep` is None, and `others` is not None, a row containing a - missing value in any of the columns (before concatenation) will - have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - Determines the join-style between the calling Series/Index and any - Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). To disable - alignment, use `.values` on any Series/Index/DataFrame in `others`. - - .. versionadded:: 0.23.0 - .. versionchanged:: 1.0.0 - Changed default of `join` from None to `'left'`. - - Returns - ------- - str, Series or Index - If `others` is None, `str` is returned, otherwise a `Series/Index` - (same type as caller) of objects is returned. - - See Also - -------- - split : Split each string in the Series/Index. - join : Join lists contained as elements in the Series/Index. 
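`_get_series_list` above normalizes every accepted shape of `others` to a list of Series so that `cat` can treat them uniformly. A sketch of the 2-D ndarray case, which is wrapped in a DataFrame on the caller's index and unpacked column by column:

```python
import numpy as np
import pandas as pd

s = pd.Series(["a", "b"])
idx = s.index  # objects without their own index inherit the caller's

others = np.array([["x", "y"], ["z", "w"]])  # 2-dim ndarray input
as_frame = pd.DataFrame(others, index=idx)
series_list = [as_frame[col] for col in as_frame]  # one Series per column
print([list(x) for x in series_list])
# [['x', 'z'], ['y', 'w']]
```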
- - Examples - -------- - When not passing `others`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If `others` is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 NaN - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using `na_rep` - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If `sep` is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - - Series with different indexes can be aligned before concatenation. The - `join`-keyword works as in other methods. - - >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='outer', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - 4 -e - dtype: object - >>> - >>> s.str.cat(t, join='inner', na_rep='-') - 0 aa - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='right', na_rep='-') - 3 dd - 0 aa - 4 -e - 2 -c - dtype: object - - For more examples, see :ref:`here `. - """ - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) - - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." 
- ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. - dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) - return result - - _shared_docs[ - "str_split" - ] = r""" - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. - n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, - starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. - str.split : Standard library version for split. - str.rsplit : Standard library version for rsplit. - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - Examples - -------- - >>> s = pd.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan - ... ] - ... 
) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 NaN - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - Without the `n` parameter, the outputs of `rsplit` and `split` - are identical. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `n` parameter can be used to limit the number of splits on the - delimiter. The outputs of `split` and `rsplit` are different. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, 3, tutorial, index... - 2 NaN - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout - the columns during the split. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN - - For slightly more complex use cases like splitting the html document name - from a url, a combination of parameter settings can be used. - - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence None - 1 https://docs.python.org/3/tutorial index.html - 2 NaN NaN - - Remember to escape special characters when explicitly using regular - expressions. - - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 - """ - - @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) - @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) - @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): - result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - _shared_docs[ - "str_partition" - ] = """ - Split the string at the %(side)s occurrence of `sep`. - - This method splits the string at the %(side)s occurrence of `sep`, - and returns 3 elements containing the part before the separator, - the separator itself, and the part after the separator. - If the separator is not found, return %(return)s. - - Parameters - ---------- - sep : str, default whitespace - String to split on. - expand : bool, default True - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - DataFrame/MultiIndex or Series/Index of objects - - See Also - -------- - %(also)s - Series.str.split : Split strings around given separators. - str.partition : Standard library version. 
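The `split`/`rsplit` methods above share one parametrized docstring: `_shared_docs` holds a template and `Appender` attaches a `%`-substituted copy to each method. A minimal sketch of that templating (`append_doc` stands in for pandas' `Appender` decorator, which also handles indentation):

```python
_shared = "Split strings from the %(side)s. Equivalent to str.%(method)s."


def append_doc(doc):
    """Attach a pre-substituted docstring to the decorated function."""
    def decorator(func):
        func.__doc__ = doc
        return func
    return decorator


@append_doc(_shared % {"side": "beginning", "method": "split"})
def split(self, pat=None, n=-1, expand=False):
    ...


print(split.__doc__)
# Split strings from the beginning. Equivalent to str.split.
```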
- - Examples - -------- - - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by the last space instead of the first one: - - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - To return a Series containing tuples instead of a DataFrame: - - >>> s.str.partition('-', expand=False) - 0 (Linda van der Berg, , ) - 1 (George Pitt, -, Rivers) - dtype: object - - Also available on indices: - - >>> idx = pd.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - Or an index with tuples with ``expand=False``: - - >>> idx.str.partition(expand=False) - Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """ - - @Appender( - _shared_docs["str_partition"] - % { - "side": "first", - "return": "3 elements containing the string itself, followed by two " - "empty strings", - "also": "rpartition : Split the string at the last occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def partition(self, sep=" ", expand=True): - f = lambda x: x.partition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender( - _shared_docs["str_partition"] - % { - "side": "last", - "return": "3 elements containing two empty strings, followed by the " - "string itself", - "also": "partition : Split the string at the first occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rpartition(self, sep=" ", expand=True): - f = lambda x: x.rpartition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @copy(str_get) - def get(self, i): - result = str_get(self._parent, i) - return self._wrap_result(result) - - @copy(str_join) - @forbid_nonstring_types(["bytes"]) - def join(self, sep): - result = str_join(self._parent, sep) - return self._wrap_result(result) - - @copy(str_contains) - @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains( - self._parent, pat, case=case, flags=flags, na=na, regex=regex - ) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_match) - @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=np.nan): - result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_fullmatch) - @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=np.nan): - result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_replace) - @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace( - self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) - - @copy(str_repeat) - @forbid_nonstring_types(["bytes"]) - def repeat(self, repeats): - result = str_repeat(self._parent, repeats) - return 
self._wrap_result(result) - - @copy(str_pad) - @forbid_nonstring_types(["bytes"]) - def pad(self, width, side="left", fillchar=" "): - result = str_pad(self._parent, width, side=side, fillchar=fillchar) - return self._wrap_result(result) - - _shared_docs[ - "str_pad" - ] = """ - Pad %(side)s side of strings in the Series/Index. - - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with ``fillchar``. - fillchar : str - Additional character for filling, default is whitespace. - - Returns - ------- - filled : Series/Index of objects. - """ - - @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) - @forbid_nonstring_types(["bytes"]) - def center(self, width, fillchar=" "): - return self.pad(width, side="both", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) - @forbid_nonstring_types(["bytes"]) - def ljust(self, width, fillchar=" "): - return self.pad(width, side="right", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) - @forbid_nonstring_types(["bytes"]) - def rjust(self, width, fillchar=" "): - return self.pad(width, side="left", fillchar=fillchar) - - @forbid_nonstring_types(["bytes"]) - def zfill(self, width): - """ - Pad strings in the Series/Index by prepending '0' characters. - - Strings in the Series/Index are padded with '0' characters on the - left of the string to reach a total string length `width`. Strings - in the Series/Index with length greater or equal to `width` are - unchanged. - - Parameters - ---------- - width : int - Minimum length of resulting string; strings with length less - than `width` be prepended with '0' characters. - - Returns - ------- - Series/Index of objects. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. - Series.str.pad : Fills the specified sides of strings with an arbitrary - character. - Series.str.center : Fills boths sides of strings with an arbitrary - character. - - Notes - ----- - Differs from :meth:`str.zfill` which has special handling - for '+'/'-' in the string. - - Examples - -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) - >>> s - 0 -1 - 1 1 - 2 1000 - 3 10 - 4 NaN - dtype: object - - Note that ``10`` and ``NaN`` are not strings, therefore they are - converted to ``NaN``. The minus sign in ``'-1'`` is treated as a - regular character and the zero is added to the left of it - (:meth:`str.zfill` would have moved it to the left). ``1000`` - remains unchanged as it is longer than `width`. - - >>> s.str.zfill(3) - 0 0-1 - 1 001 - 2 1000 - 3 NaN - 4 NaN - dtype: object - """ - result = str_pad(self._parent, width, side="left", fillchar="0") - return self._wrap_result(result) - - @copy(str_slice) - def slice(self, start=None, stop=None, step=None): - result = str_slice(self._parent, start, stop, step) - return self._wrap_result(result) - - @copy(str_slice_replace) - @forbid_nonstring_types(["bytes"]) - def slice_replace(self, start=None, stop=None, repl=None): - result = str_slice_replace(self._parent, start, stop, repl) - return self._wrap_result(result) - - @copy(str_decode) - def decode(self, encoding, errors="strict"): - # need to allow bytes here - result = str_decode(self._parent, encoding, errors) - # TODO: Not sure how to handle this. 
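# Rough illustration of `decode` (hypothetical data; output spacing abridged to match the surrounding doctest style): bytes elements are decoded element-wise, giving an object-dtype result. # # >>> import pandas as pd # >>> pd.Series([b"abc", b"123"]).str.decode("ascii") # 0 abc # 1 123 # dtype: object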
- return self._wrap_result(result, returns_string=False) - - @copy(str_encode) - @forbid_nonstring_types(["bytes"]) - def encode(self, encoding, errors="strict"): - result = str_encode(self._parent, encoding, errors) - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "str_strip" - ] = r""" - Remove %(position)s characters. - - Strip whitespaces (including newlines) or a set of specified characters - from each string in the Series/Index from %(side)s. - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will be stripped. - If None then whitespaces are removed. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.strip : Remove leading and trailing characters in Series/Index. - Series.str.lstrip : Remove leading characters in Series/Index. - Series.str.rstrip : Remove trailing characters in Series/Index. - - Examples - -------- - >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 NaN - dtype: object - - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 NaN - dtype: object - - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 NaN - dtype: object - - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 NaN - dtype: object - - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 NaN - dtype: object - """ - - @Appender( - _shared_docs["str_strip"] - % dict( - side="left and right sides", method="strip", position="leading and trailing" - ) - ) - @forbid_nonstring_types(["bytes"]) - def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="both") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="left side", method="lstrip", position="leading") - ) - @forbid_nonstring_types(["bytes"]) - def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="left") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="right side", method="rstrip", position="trailing") - ) - @forbid_nonstring_types(["bytes"]) - def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="right") - return self._wrap_result(result) - - @copy(str_wrap) - @forbid_nonstring_types(["bytes"]) - def wrap(self, width, **kwargs): - result = str_wrap(self._parent, width, **kwargs) - return self._wrap_result(result) - - @copy(str_get_dummies) - @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep="|"): - # we need to cast to Series of strings as only that has all - # methods available for making the dummies... 
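# Sketch of what the code below produces (illustrative data): each `sep`-separated token becomes an indicator column. # # >>> import pandas as pd # >>> pd.Series(["a|b", "a", "b|c"]).str.get_dummies() # a b c # 0 1 1 0 # 1 1 0 0 # 2 0 1 1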
- data = self._orig.astype(str) if self._is_categorical else self._parent - result, name = str_get_dummies(data, sep) - return self._wrap_result( - result, - use_codes=(not self._is_categorical), - name=name, - expand=True, - returns_string=False, - ) - - @copy(str_translate) - @forbid_nonstring_types(["bytes"]) - def translate(self, table): - result = str_translate(self._parent, table) - return self._wrap_result(result) - - count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False) - startswith = _pat_wrapper( - str_startswith, na=True, name="startswith", returns_string=False - ) - endswith = _pat_wrapper( - str_endswith, na=True, name="endswith", returns_string=False - ) - findall = _pat_wrapper( - str_findall, flags=True, name="findall", returns_string=False - ) - - @copy(str_extract) - @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): - return str_extract(self, pat, flags=flags, expand=expand) - - @copy(str_extractall) - @forbid_nonstring_types(["bytes"]) - def extractall(self, pat, flags=0): - return str_extractall(self._orig, pat, flags=flags) - - _shared_docs[ - "find" - ] = """ - Return %(side)s indexes in each strings in the Series/Index. - - Each of returned indexes corresponds to the position where the - substring is fully contained between [start:end]. Return -1 on - failure. Equivalent to standard :meth:`str.%(method)s`. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - - Returns - ------- - Series or Index of int. - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["find"] - % dict( - side="lowest", - method="find", - also="rfind : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def find(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["find"] - % dict( - side="highest", - method="rfind", - also="find : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rfind(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def normalize(self, form): - """ - Return the Unicode normal form for the strings in the Series/Index. - - For more information on the forms, see the - :func:`unicodedata.normalize`. - - Parameters - ---------- - form : {'NFC', 'NFKC', 'NFD', 'NFKD'} - Unicode form. - - Returns - ------- - normalized : Series/Index of objects - """ - import unicodedata - - f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent, dtype=str) - return self._wrap_result(result) - - _shared_docs[ - "index" - ] = """ - Return %(side)s indexes in each string in Series/Index. - - Each of the returned indexes corresponds to the position where the - substring is fully contained between [start:end]. This is the same - as ``str.%(similar)s`` except instead of returning -1, it raises a - ValueError when the substring is not found. Equivalent to standard - ``str.%(method)s``. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. 
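# Quick illustration of the find/rfind docstrings above (hypothetical data); `rfind` searches from the right, and -1 signals "not found". # # >>> import pandas as pd # >>> s = pd.Series(["hello"]) # >>> s.str.find("l") # 0 2 # dtype: int64 # >>> s.str.rfind("l") # 0 3 # dtype: int64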
- - Returns - ------- - Series or Index of object - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["index"] - % dict( - side="lowest", - similar="find", - method="index", - also="rindex : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["index"] - % dict( - side="highest", - similar="rfind", - method="rindex", - also="index : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "len" - ] = """ - Compute the length of each element in the Series/Index. - - The element may be a sequence (such as a string, tuple or list) or a collection - (such as a dictionary). - - Returns - ------- - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. - - See Also - -------- - str.len : Python built-in function returning the length of an object. - Series.size : Returns the length of the Series. - - Examples - -------- - Returns the length (number of characters) in a string. Returns the - number of entries for dictionaries, lists or tuples. - - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) - >>> s - 0 dog - 1 - 2 5 - 3 {'foo': 'bar'} - 4 [2, 3, 5, 7] - 5 (one, two, three) - dtype: object - >>> s.str.len() - 0 3.0 - 1 0.0 - 2 NaN - 3 1.0 - 4 4.0 - 5 3.0 - dtype: float64 - """ - len = _noarg_wrapper( - len, - docstring=_shared_docs["len"], - forbidden_types=None, - dtype=np.dtype("int64"), - returns_string=False, - ) - - _shared_docs[ - "casemethods" - ] = """ - Convert strings in the Series/Index to %(type)s. - %(version)s - Equivalent to :meth:`str.%(method)s`. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.lower : Converts all characters to lowercase. - Series.str.upper : Converts all characters to uppercase. - Series.str.title : Converts first character of each word to uppercase and - remaining to lowercase. - Series.str.capitalize : Converts first character to uppercase and - remaining to lowercase. - Series.str.swapcase : Converts uppercase to lowercase and lowercase to - uppercase. - Series.str.casefold: Removes all case distinctions in the string. 
- - Examples - -------- - >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - - # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = dict(type="lowercase", method="lower", version="") - _doc_args["upper"] = dict(type="uppercase", method="upper", version="") - _doc_args["title"] = dict(type="titlecase", method="title", version="") - _doc_args["capitalize"] = dict( - type="be capitalized", method="capitalize", version="" - ) - _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") - _doc_args["casefold"] = dict( - type="be casefolded", - method="casefold", - version="\n .. versionadded:: 0.25.0\n", - ) - lower = _noarg_wrapper( - lambda x: x.lower(), - name="lower", - docstring=_shared_docs["casemethods"] % _doc_args["lower"], - dtype=str, - ) - upper = _noarg_wrapper( - lambda x: x.upper(), - name="upper", - docstring=_shared_docs["casemethods"] % _doc_args["upper"], - dtype=str, - ) - title = _noarg_wrapper( - lambda x: x.title(), - name="title", - docstring=_shared_docs["casemethods"] % _doc_args["title"], - dtype=str, - ) - capitalize = _noarg_wrapper( - lambda x: x.capitalize(), - name="capitalize", - docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], - dtype=str, - ) - swapcase = _noarg_wrapper( - lambda x: x.swapcase(), - name="swapcase", - docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], - dtype=str, - ) - casefold = _noarg_wrapper( - lambda x: x.casefold(), - name="casefold", - docstring=_shared_docs["casemethods"] % _doc_args["casefold"], - dtype=str, - ) - - _shared_docs[ - "ismethods" - ] = """ - Check whether all characters in each string are %(type)s. - - This is equivalent to running the Python string method - :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. - - See Also - -------- - Series.str.isalpha : Check whether all characters are alphabetic. - Series.str.isnumeric : Check whether all characters are numeric. - Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. - Series.str.isdecimal : Check whether all characters are decimal. - Series.str.isspace : Check whether all characters are whitespace. - Series.str.islower : Check whether all characters are lowercase. - Series.str.isupper : Check whether all characters are uppercase. - Series.str.istitle : Check whether all characters are titlecase. 
- - Examples - -------- - **Checks for Alphabetic and Numeric Characters** - - >>> s1 = pd.Series(['one', 'one1', '1', '']) - - >>> s1.str.isalpha() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s1.str.isnumeric() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - >>> s1.str.isalnum() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - Note that checks against characters mixed with any additional punctuation - or whitespace will evaluate to false for an alphanumeric check. - - >>> s2 = pd.Series(['A B', '1.5', '3,000']) - >>> s2.str.isalnum() - 0 False - 1 False - 2 False - dtype: bool - - **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. - - >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - - >>> s3.str.isdecimal() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. - - >>> s3.str.isdigit() - 0 True - 1 True - 2 False - 3 False - dtype: bool - - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - **Checks for Whitespace** - - >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) - >>> s4.str.isspace() - 0 True - 1 True - 2 False - dtype: bool - - **Checks for Character Case** - - >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - - >>> s5.str.islower() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s5.str.isupper() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - The ``s5.str.istitle`` method checks for whether all words are in title - case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters separated by - whitespace characters. 
- - >>> s5.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") - _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") - _doc_args["isdigit"] = dict(type="digits", method="isdigit") - _doc_args["isspace"] = dict(type="whitespace", method="isspace") - _doc_args["islower"] = dict(type="lowercase", method="islower") - _doc_args["isupper"] = dict(type="uppercase", method="isupper") - _doc_args["istitle"] = dict(type="titlecase", method="istitle") - _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") - _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") - # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) - isalnum = _noarg_wrapper( - lambda x: x.isalnum(), - name="isalnum", - docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], - returns_string=False, - dtype=np.dtype(bool), - ) - isalpha = _noarg_wrapper( - lambda x: x.isalpha(), - name="isalpha", - docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdigit = _noarg_wrapper( - lambda x: x.isdigit(), - name="isdigit", - docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], - returns_string=False, - dtype=np.dtype(bool), - ) - isspace = _noarg_wrapper( - lambda x: x.isspace(), - name="isspace", - docstring=_shared_docs["ismethods"] % _doc_args["isspace"], - returns_string=False, - dtype=np.dtype(bool), - ) - islower = _noarg_wrapper( - lambda x: x.islower(), - name="islower", - docstring=_shared_docs["ismethods"] % _doc_args["islower"], - returns_string=False, - dtype=np.dtype(bool), - ) - isupper = _noarg_wrapper( - lambda x: x.isupper(), - name="isupper", - docstring=_shared_docs["ismethods"] % _doc_args["isupper"], - returns_string=False, - dtype=np.dtype(bool), - ) - istitle = _noarg_wrapper( - lambda x: x.istitle(), - name="istitle", - docstring=_shared_docs["ismethods"] % _doc_args["istitle"], - returns_string=False, - dtype=np.dtype(bool), - ) - isnumeric = _noarg_wrapper( - lambda x: x.isnumeric(), - name="isnumeric", - docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdecimal = _noarg_wrapper( - lambda x: x.isdecimal(), - name="isdecimal", - docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], - returns_string=False, - dtype=np.dtype(bool), - ) - - @classmethod - def _make_accessor(cls, data): - cls._validate(data) - return cls(data) diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py new file mode 100644 index 0000000000000..243250f0360a0 --- /dev/null +++ b/pandas/core/strings/__init__.py @@ -0,0 +1,32 @@ +""" +Implementation of pandas.Series.str and its interface. + +* strings.accessor.StringMethods : Accessor for Series.str +* strings.base.BaseStringArrayMethods: Mixin ABC for EAs to implement str methods + +Most methods on the StringMethods accessor follow the pattern: + + 1. extract the array from the series (or index) + 2. Call that array's implementation of the string method + 3. Wrap the result (in a Series, index, or DataFrame) + +Pandas extension arrays implementing string methods should inherit from +pandas.core.strings.base.BaseStringArrayMethods. This is an ABC defining +the various string methods. To avoid namespace clashes and pollution, +these are prefixed with `_str_`. So ``Series.str.upper()`` calls +``Series.array._str_upper()``. 
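For illustration (assuming a build that includes this refactor), the dispatch is roughly: >>> import pandas as pd >>> s = pd.Series(["a", "b"]) >>> pd.Series(s.array._str_upper(), index=s.index) # ~ s.str.upper() 0 A 1 B dtype: object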
The interface isn't currently public +to other string extension arrays. +""" +# Pandas current implementation is in ObjectStringArrayMixin. This is designed +# to work on object-dtype ndarrays. +# +# BaseStringArrayMethods +# - ObjectStringArrayMixin +# - StringArray +# - PandasArray +# - Categorical + +from .accessor import StringMethods +from .base import BaseStringArrayMethods + +__all__ = ["StringMethods", "BaseStringArrayMethods"] diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py new file mode 100644 index 0000000000000..2713b76189157 --- /dev/null +++ b/pandas/core/strings/accessor.py @@ -0,0 +1,3110 @@ +import codecs +from functools import wraps +import re +from typing import Dict, List, Optional +import warnings + +import numpy as np + +import pandas._libs.lib as lib +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_integer, + is_list_like, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.base import NoNewAttributesMixin + +_shared_docs: Dict[str, str] = {} +_cpython_optimized_encoders = ( + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") + + +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. + + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. 
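Examples -------- A sketch of the enforced behaviour (illustrative data; `upper` forbids the 'bytes' type): >>> import pandas as pd >>> pd.Series([b"a", b"b"]).str.upper() Traceback (most recent call last): ... TypeError: Cannot use .str.upper with values of inferred dtype 'bytes'.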
+ """ + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." + ) + raise TypeError(msg) + return func(self, *args, **kwargs) + + wrapper.__name__ = func_name + return wrapper + + return _forbid_nonstring_types + + +def _map_and_wrap(name, docstring): + @forbid_nonstring_types(["bytes"], name=name) + def wrapper(self): + result = getattr(self._array, f"_str_{name}")() + return self._wrap_result(result) + + wrapper.__doc__ = docstring + return wrapper + + +class StringMethods(NoNewAttributesMixin): + """ + Vectorized string functions for Series and Index. + + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. + + Examples + -------- + >>> s = pd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: object + + >>> s.str.split("_") + 0 [A, Str, Series] + dtype: object + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: object + """ + + # Note: see the docstring in pandas.core.strings.__init__ + # for an explanation of the implementation. + # TODO: Dispatch all the methods + # Currently the following are not dispatched to the array + # * cat + # * extract + # * extractall + + def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + + self._inferred_dtype = self._validate(data) + self._is_categorical = is_categorical_dtype(data.dtype) + self._is_string = isinstance(data.dtype, StringDtype) + array = data.array + self._array = array + + self._index = self._name = None + if isinstance(data, ABCSeries): + self._index = data.index + self._name = data.name + + # ._values.categories works for both Series/Index + self._parent = data._values.categories if self._is_categorical else data + # save orig to blow up categoricals to the right type + self._orig = data + self._freeze() + + @staticmethod + def _validate(data): + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object, and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). 
+ + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + from pandas import StringDtype + + if isinstance(data, ABCMultiIndex): + raise AttributeError( + "Can only use .str accessor with Index, not MultiIndex" + ) + + # see _libs/lib.pyx for list of inferred types + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] + + values = getattr(data, "values", data) # Series / Index + values = getattr(values, "categories", values) # categorical / normal + + # explicitly allow StringDtype + if isinstance(values.dtype, StringDtype): + return "string" + + try: + inferred_dtype = lib.infer_dtype(values, skipna=True) + except ValueError: + # GH#27571 mostly occurs with ExtensionArray + inferred_dtype = None + + if inferred_dtype not in allowed_types: + raise AttributeError("Can only use .str accessor with string values!") + return inferred_dtype + + def __getitem__(self, key): + result = self._array._str_getitem(key) + return self._wrap_result(result) + + def __iter__(self): + warnings.warn( + "Columnar iteration over characters will be deprecated in future releases.", + FutureWarning, + stacklevel=2, + ) + i = 0 + g = self.get(i) + while g.notna().any(): + yield g + i += 1 + g = self.get(i) + + def _wrap_result( + self, + result, + name=None, + expand=None, + fill_value=np.nan, + returns_string=True, + ): + from pandas import Index, MultiIndex + + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): + if isinstance(result, ABCDataFrame): + result = result.__finalize__(self._orig, name="str") + return result + assert result.ndim < 3 + + # We can be wrapping a string / object / categorical result, in which + # case we'll want to return the same dtype as the input. + # Or we can be wrapping a numeric output, in which case we don't want + # to return a StringArray. + # Ideally the array method returns the right array type. 
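# For instance (illustrative data): a 2-dim result is expanded to a DataFrame, while a 1-dim result stays a Series of the input's kind. # # >>> import pandas as pd # >>> s = pd.Series(["a_b", "c_d"]) # >>> s.str.split("_", expand=True).shape # ndim == 2 -> DataFrame # (2, 2) # >>> s.str.split("_").dtype # ndim == 1 -> object Series # dtype('O')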
+ if expand is None: + # infer from ndim if expand is not specified + expand = result.ndim != 1 + + elif expand is True and not isinstance(self._orig, ABCIndexClass): + # required when expand=True is explicitly specified + # not needed when inferred + + def cons_row(x): + if is_list_like(x): + return x + else: + return [x] + + result = [cons_row(x) for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result + ] + + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + if expand is False: + # if expand is False, result should have the same name + # as the original otherwise specified + if name is None: + name = getattr(result, "name", None) + if name is None: + # do not use logical or, _orig may be a DataFrame + # which has "name" column + name = self._orig.name + + # Wait until we are sure result is a Series or Index before + # checking attributes (GH 12180) + if isinstance(self._orig, ABCIndexClass): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + + if expand: + result = list(result) + out = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. + out = out.get_level_values(0) + return out + else: + return Index(result, name=name) + else: + index = self._orig.index + # This is a mess. + dtype: Optional[str] + if self._is_string and returns_string: + dtype = "string" + else: + dtype = None + + if expand: + cons = self._orig._constructor_expanddim + result = cons(result, columns=name, index=index, dtype=dtype) + else: + # Must be a Series + cons = self._orig._constructor + result = cons(result, name=name, index=index) + result = result.__finalize__(self._orig, method="str") + if name is not None and result.ndim == 1: + # __finalize__ might copy over the original name, but we may + # want the new name (e.g. str.extract). + result.name = name + return result + + def _get_series_list(self, others): + """ + Auxiliary function for :meth:`str.cat`. Turn potentially mixed input + into a list of Series (elements without an index must match the length + of the calling Series/Index). + + Parameters + ---------- + others : Series, DataFrame, np.ndarray, list-like or list-like of + Objects that are either Series, Index or np.ndarray (1-dim). + + Returns + ------- + list of Series + Others transformed into list of Series. + """ + from pandas import DataFrame, Series + + # self._orig is either Series or Index + idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index + + # Generally speaking, all objects without an index inherit the index + # `idx` of the calling Series/Index - i.e. must have matching length. + # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 
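# E.g. (hypothetical data): a plain list of strings inherits the caller's index, so the concatenation is positional. # # >>> import pandas as pd # >>> s = pd.Series(["a", "b"], index=[10, 20]) # >>> s.str.cat(["x", "y"]) # 10 ax # 20 by # dtype: object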
+ if isinstance(others, ABCSeries): + return [others] + elif isinstance(others, ABCIndexClass): + return [Series(others._values, index=idx)] + elif isinstance(others, ABCDataFrame): + return [others[x] for x in others] + elif isinstance(others, np.ndarray) and others.ndim == 2: + others = DataFrame(others, index=idx) + return [others[x] for x in others] + elif is_list_like(others, allow_sets=False): + others = list(others) # ensure iterators do not get read twice etc + + # in case of list-like `others`, all elements must be + # either Series/Index/np.ndarray (1-dim)... + if all( + isinstance(x, (ABCSeries, ABCIndexClass)) + or (isinstance(x, np.ndarray) and x.ndim == 1) + for x in others + ): + los: List[Series] = [] + while others: # iterate through list and append each element + los = los + self._get_series_list(others.pop(0)) + return los + # ... or just strings + elif all(not is_list_like(x) for x in others): + return [Series(others, index=idx)] + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarray " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) + def cat(self, others=None, sep=None, na_rep=None, join="left"): + """ + Concatenate strings in the Series/Index with given separator. + + If `others` is specified, this function concatenates the Series/Index + and elements of `others` element-wise. + If `others` is not passed, then all values in the Series/Index are + concatenated into a single string with a given `sep`. + + Parameters + ---------- + others : Series, Index, DataFrame, np.ndarray or list-like + Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and + other list-likes of strings must have the same length as the + calling Series/Index, with the exception of indexed objects (i.e. + Series/Index/DataFrame) if `join` is not None. + + If others is a list-like that contains a combination of Series, + Index or np.ndarray (1-dim), then all elements will be unpacked and + must satisfy the above criteria individually. + + If others is None, the method returns the concatenation of all + strings in the calling Series/Index. + sep : str, default '' + The separator between the different elements/columns. By default + the empty string `''` is used. + na_rep : str or None, default None + Representation that is inserted for all missing values: + + - If `na_rep` is None, and `others` is None, missing values in the + Series/Index are omitted from the result. + - If `na_rep` is None, and `others` is not None, a row containing a + missing value in any of the columns (before concatenation) will + have a missing value in the result. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + Determines the join-style between the calling Series/Index and any + Series/Index/DataFrame in `others` (objects without an index need + to match the length of the calling Series/Index). To disable + alignment, use `.values` on any Series/Index/DataFrame in `others`. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + Changed default of `join` from None to `'left'`. + + Returns + ------- + str, Series or Index + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. + + See Also + -------- + split : Split each string in the Series/Index. + join : Join lists contained as elements in the Series/Index. 
+ + Examples + -------- + When not passing `others`, all values are concatenated into a single + string: + + >>> s = pd.Series(['a', 'b', np.nan, 'd']) + >>> s.str.cat(sep=' ') + 'a b d' + + By default, NA values in the Series are ignored. Using `na_rep`, they + can be given a representation: + + >>> s.str.cat(sep=' ', na_rep='?') + 'a b ? d' + + If `others` is specified, corresponding values are concatenated with + the separator. Result will be a Series of strings. + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + 0 a,A + 1 b,B + 2 NaN + 3 d,D + dtype: object + + Missing values will remain missing in the result, but can again be + represented using `na_rep` + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + 0 a,A + 1 b,B + 2 -,C + 3 d,D + dtype: object + + If `sep` is not specified, the values are concatenated without + separation. + + >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + 0 aA + 1 bB + 2 -C + 3 dD + dtype: object + + Series with different indexes can be aligned before concatenation. The + `join`-keyword works as in other methods. + + >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join='left', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='outer', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + 4 -e + dtype: object + >>> + >>> s.str.cat(t, join='inner', na_rep='-') + 0 aa + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='right', na_rep='-') + 3 dd + 0 aa + 4 -e + 2 -c + dtype: object + + For more examples, see :ref:`here <text.concatenate>`. + """ + # TODO: dispatch + from pandas import Index, Series, concat + + if isinstance(others, str): + raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = "" + + if isinstance(self._orig, ABCIndexClass): + data = Series(self._orig, index=self._orig) + else: # Series + data = self._orig + + # concatenate Series/Index with itself if no "others" + if others is None: + data = ensure_object(data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) + return sep.join(data) + + try: + # turn anything in "others" into lists of Series + others = self._get_series_list(others) + except ValueError as err: # do not catch TypeError raised by _get_series_list + raise ValueError( + "If `others` contains arrays or lists (or other " + "list-likes without an index), these must all be " + "of the same length as the calling Series/Index."
+ ) from err + + # align if required + if any(not data.index.equals(x.index) for x in others): + # Need to add keys for uniqueness in case of duplicate columns + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) + data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series + + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] + result = cat_safe(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_safe(all_cols, sep) + + if isinstance(self._orig, ABCIndexClass): + # add dtype for case that result is all-NA + result = Index(result, dtype=object, name=self._orig.name) + else: # Series + if is_categorical_dtype(self._orig.dtype): + # We need to infer the new categories. + dtype = None + else: + dtype = self._orig.dtype + result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) + result = result.__finalize__(self._orig, method="str_cat") + return result + + _shared_docs[ + "str_split" + ] = r""" + Split strings around given separator/delimiter. + + Splits the string in the Series/Index from the %(side)s, + at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. + n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the split strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + Examples + -------- + >>> s = pd.Series( + ... [ + ... "this is a regular sentence", + ... "https://docs.python.org/3/tutorial/index.html", + ... np.nan + ... ] + ... 
) + >>> s + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html + 2 NaN + dtype: object + + In the default setting, the string is split by whitespace. + + >>> s.str.split() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. + + >>> s.str.rsplit() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat="/") + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. + + >>> s.str.split(expand=True) + 0 1 2 3 4 + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html None None None None + 2 NaN NaN NaN NaN NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. + + >>> s.str.rsplit("/", n=1, expand=True) + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + + Remember to escape special characters when explicitly using regular + expressions. + + >>> s = pd.Series(["1+1=2"]) + >>> s + 0 1+1=2 + dtype: object + >>> s.str.split(r"\+|=", expand=True) + 0 1 2 + 0 1 1 2 + """ + + @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) + @forbid_nonstring_types(["bytes"]) + def split(self, pat=None, n=-1, expand=False): + result = self._array._str_split(pat, n, expand) + return self._wrap_result(result, returns_string=expand, expand=expand) + + @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) + @forbid_nonstring_types(["bytes"]) + def rsplit(self, pat=None, n=-1, expand=False): + result = self._array._str_rsplit(pat, n=n) + return self._wrap_result(result, expand=expand, returns_string=expand) + + _shared_docs[ + "str_partition" + ] = """ + Split the string at the %(side)s occurrence of `sep`. + + This method splits the string at the %(side)s occurrence of `sep`, + and returns 3 elements containing the part before the separator, + the separator itself, and the part after the separator. + If the separator is not found, return %(return)s. + + Parameters + ---------- + sep : str, default whitespace + String to split on. + expand : bool, default True + If True, return DataFrame/MultiIndex expanding dimensionality. + If False, return Series/Index. + + Returns + ------- + DataFrame/MultiIndex or Series/Index of objects + + See Also + -------- + %(also)s + Series.str.split : Split strings around given separators. + str.partition : Standard library version. 
+ + Examples + -------- + + >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) + >>> s + 0 Linda van der Berg + 1 George Pitt-Rivers + dtype: object + + >>> s.str.partition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by the last space instead of the first one: + + >>> s.str.rpartition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by something different than a space: + + >>> s.str.partition('-') + 0 1 2 + 0 Linda van der Berg + 1 George Pitt - Rivers + + To return a Series containing tuples instead of a DataFrame: + + >>> s.str.partition('-', expand=False) + 0 (Linda van der Berg, , ) + 1 (George Pitt, -, Rivers) + dtype: object + + Also available on indices: + + >>> idx = pd.Index(['X 123', 'Y 999']) + >>> idx + Index(['X 123', 'Y 999'], dtype='object') + + Which will create a MultiIndex: + + >>> idx.str.partition() + MultiIndex([('X', ' ', '123'), + ('Y', ' ', '999')], + ) + + Or an index with tuples with ``expand=False``: + + >>> idx.str.partition(expand=False) + Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') + """ + + @Appender( + _shared_docs["str_partition"] + % { + "side": "first", + "return": "3 elements containing the string itself, followed by two " + "empty strings", + "also": "rpartition : Split the string at the last occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def partition(self, sep=" ", expand=True): + result = self._array._str_partition(sep, expand) + return self._wrap_result(result, expand=expand, returns_string=expand) + + @Appender( + _shared_docs["str_partition"] + % { + "side": "last", + "return": "3 elements containing two empty strings, followed by the " + "string itself", + "also": "partition : Split the string at the first occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def rpartition(self, sep=" ", expand=True): + result = self._array._str_rpartition(sep, expand) + return self._wrap_result(result, expand=expand, returns_string=expand) + + def get(self, i): + """ + Extract element from each component at specified position. + + Extract element from lists, tuples, or strings in each element in the + Series/Index. + + Parameters + ---------- + i : int + Position of element to extract. + + Returns + ------- + Series or Index + + Examples + -------- + >>> s = pd.Series(["String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}]) + >>> s + 0 String + 1 (1, 2, 3) + 2 [a, b, c] + 3 123 + 4 -456 + 5 {1: 'Hello', '2': 'World'} + dtype: object + + >>> s.str.get(1) + 0 t + 1 2 + 2 b + 3 NaN + 4 NaN + 5 Hello + dtype: object + + >>> s.str.get(-1) + 0 g + 1 3 + 2 c + 3 NaN + 4 NaN + 5 None + dtype: object + """ + result = self._array._str_get(i) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def join(self, sep): + """ + Join lists contained as elements in the Series/Index with passed delimiter. + + If the elements of a Series are lists themselves, join the content of these + lists using the delimiter passed to the function. + This function is an equivalent to :meth:`str.join`. + + Parameters + ---------- + sep : str + Delimiter to use between list entries. + + Returns + ------- + Series/Index: object + The list entries concatenated by intervening occurrences of the + delimiter. + + Raises + ------ + AttributeError + If the supplied Series contains neither strings nor lists. + + See Also + -------- + str.join : Standard library version of this method. 
+ Series.str.split : Split strings around given separator/delimiter. + + Notes + ----- + If any of the list items is not a string object, the result of the join + will be `NaN`. + + Examples + -------- + Example with a list that contains non-string elements. + + >>> s = pd.Series([['lion', 'elephant', 'zebra'], + ... [1.1, 2.2, 3.3], + ... ['cat', np.nan, 'dog'], + ... ['cow', 4.5, 'goat'], + ... ['duck', ['swan', 'fish'], 'guppy']]) + >>> s + 0 [lion, elephant, zebra] + 1 [1.1, 2.2, 3.3] + 2 [cat, nan, dog] + 3 [cow, 4.5, goat] + 4 [duck, [swan, fish], guppy] + dtype: object + + Join all lists using a '-'. The lists containing object(s) of types other + than str will produce a NaN. + + >>> s.str.join('-') + 0 lion-elephant-zebra + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: object + """ + result = self._array._str_join(sep) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def contains(self, pat, case=True, flags=0, na=None, regex=True): + r""" + Test if pattern or regex is contained within a string of a Series or Index. + + Return boolean Series or Index based on whether a given pattern or regex is + contained within a string of a Series or Index. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Flags to pass through to the re module, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + regex : bool, default True + If True, assumes the pat is a regular expression. + + If False, treats the pat as a literal string. + + Returns + ------- + Series or Index of boolean values + A Series or Index of boolean values indicating whether the + given pattern is contained within the string of each element + of the Series or Index. + + See Also + -------- + match : Analogous, but stricter, relying on re.match instead of re.search. + Series.str.startswith : Test if the start of each string element matches a + pattern. + Series.str.endswith : Same as startswith, but tests the end of string. + + Examples + -------- + Returning a Series of booleans using only a literal pattern. + + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) + >>> s1.str.contains('og', regex=False) + 0 False + 1 True + 2 False + 3 False + 4 NaN + dtype: object + + Returning an Index of booleans using only a literal pattern. + + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) + >>> ind.str.contains('23', regex=False) + Index([False, False, False, True, nan], dtype='object') + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True, regex=True) + 0 False + 1 False + 2 False + 3 False + 4 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN` replaces NaN values + with `False`. If Series or Index does not contain NaN values + the resultant dtype will be `bool`, otherwise, an `object` dtype. + + >>> s1.str.contains('og', na=False, regex=True) + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 NaN + dtype: object + + Ignoring case sensitivity using `flags` with regex. 
+ + >>> import re + >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + 0 False + 1 False + 2 True + 3 False + 4 NaN + dtype: object + + Returning any digit using regular expression. + + >>> s1.str.contains('\\d', regex=True) + 0 False + 1 False + 2 False + 3 True + 4 NaN + dtype: object + + Ensure `pat` is not a literal pattern when `regex` is set to True. + Note in the following example one might expect only `s2[1]` and `s2[3]` to + return `True`. However, '.0' as a regex matches any character + followed by a 0. + + >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) + >>> s2.str.contains('.0', regex=True) + 0 True + 1 True + 2 False + 3 True + 4 False + dtype: bool + """ + result = self._array._str_contains(pat, case, flags, na, regex) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def match(self, pat, case=True, flags=0, na=None): + """ + Determine if each string starts with a match of a regular expression. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + fullmatch : Stricter matching that requires the entire string to match. + contains : Analogous, but less strict, relying on re.search instead of + re.match. + extract : Extract matched groups. + """ + result = self._array._str_match(pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case=True, flags=0, na=None): + """ + Determine if each string entirely matches a regular expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. + """ + result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): + r""" + Replace each occurrence of pattern/regex in the Series/Index. + + Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on + the regex value. + + Parameters + ---------- + pat : str or compiled regex + String can be a character sequence or regular expression. + repl : str or callable + Replacement string or a callable. The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + n : int, default -1 (all) + Number of replacements to make from start.
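# Contrast of the `match` and `fullmatch` methods above (illustrative data): `match` anchors only at the start, `fullmatch` at both ends. # # >>> import pandas as pd # >>> s = pd.Series(["cat", "catalog"]) # >>> s.str.match("cat") # 0 True # 1 True # dtype: bool # >>> s.str.fullmatch("cat") # 0 True # 1 False # dtype: bool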
+ case : bool, default None
+ Determines if replace is case sensitive:
+
+ - If True, case sensitive (the default if `pat` is a string)
+ - Set to False for case insensitive
+ - Cannot be set if `pat` is a compiled regex.
+
+ flags : int, default 0 (no flags)
+ Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
+ regex.
+ regex : bool, default True
+ Determines if the passed-in pattern is a regular expression:
+
+ - If True, assumes the passed-in pattern is a regular expression.
+ - If False, treats the pattern as a literal string
+ - Cannot be set to False if `pat` is a compiled regex or `repl` is
+ a callable.
+
+ .. versionadded:: 0.23.0
+
+ Returns
+ -------
+ Series or Index of object
+ A copy of the object with all matching occurrences of `pat` replaced by
+ `repl`.
+
+ Raises
+ ------
+ ValueError
+ * if `regex` is False and `repl` is a callable or `pat` is a compiled
+ regex
+ * if `pat` is a compiled regex and `case` or `flags` is set
+
+ Notes
+ -----
+ When `pat` is a compiled regex, all flags should be included in the
+ compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
+ regex will raise an error.
+
+ Examples
+ --------
+ When `pat` is a string and `regex` is True (the default), the given `pat`
+ is compiled as a regex. When `repl` is a string, it replaces matching
+ regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
+ left as is:
+
+ >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
+ 0 bao
+ 1 baz
+ 2 NaN
+ dtype: object
+
+ When `pat` is a string and `regex` is False, every `pat` is replaced with
+ `repl` as with :meth:`str.replace`:
+
+ >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
+ 0 bao
+ 1 fuz
+ 2 NaN
+ dtype: object
+
+ When `repl` is a callable, it is called on every `pat` using
+ :func:`re.sub`. The callable should expect one positional argument
+ (a regex match object) and return a string.
+
+ To get the idea:
+
+ >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr)
+ 0 <re.Match object; span=(0, 1), match='f'>oo
+ 1 <re.Match object; span=(0, 1), match='f'>uz
+ 2 NaN
+ dtype: object
+
+ Reverse every lowercase alphabetic word:
+
+ >>> repl = lambda m: m.group(0)[::-1]
+ >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl)
+ 0 oof 123
+ 1 rab zab
+ 2 NaN
+ dtype: object
+
+ Using regex groups (extract second group and swap case):
+
+ >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
+ >>> repl = lambda m: m.group('two').swapcase()
+ >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl)
+ 0 tWO
+ 1 bAR
+ dtype: object
+
+ Using a compiled regex with flags:
+
+ >>> import re
+ >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
+ >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar')
+ 0 foo
+ 1 bar
+ 2 NaN
+ dtype: object
+ """
+ if regex is None:
+ if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"):
+ # warn only in cases where regex behavior would differ from literal
+ msg = (
+ "The default value of regex will change from True to False "
+ "in a future version."
+ )
+ if len(pat) == 1:
+ msg += (
+ " In addition, single character regular expressions will "
+ "*not* be treated as literal strings when regex=True."
+ )
+ warnings.warn(msg, FutureWarning, stacklevel=3)
+ regex = True
+ result = self._array._str_replace(
+ pat, repl, n=n, case=case, flags=flags, regex=regex
+ )
+ return self._wrap_result(result)
+
+ @forbid_nonstring_types(["bytes"])
+ def repeat(self, repeats):
+ """
+ Duplicate each string in the Series or Index.
+ + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or Index of object + Series or Index of repeated string objects specified by + input parameter repeats. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + result = self._array._str_repeat(repeats) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def pad(self, width, side="left", fillchar=" "): + """ + Pad strings in the Series/Index up to width. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with character defined in `fillchar`. + side : {'left', 'right', 'both'}, default 'left' + Side from which to fill resulting string. + fillchar : str, default ' ' + Additional character for filling, default is whitespace. + + Returns + ------- + Series or Index of object + Returns Series or Index with minimum number of char in object. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='left')``. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='right')``. + Series.str.center : Fills both sides of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='both')``. + Series.str.zfill : Pad strings in the Series/Index by prepending '0' + character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. + + Examples + -------- + >>> s = pd.Series(["caribou", "tiger"]) + >>> s + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10) + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10, side='right', fillchar='-') + 0 caribou--- + 1 tiger----- + dtype: object + + >>> s.str.pad(width=10, side='both', fillchar='-') + 0 -caribou-- + 1 --tiger--- + dtype: object + """ + if not isinstance(fillchar, str): + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) + + if len(fillchar) != 1: + raise TypeError("fillchar must be a character, not str") + + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + + result = self._array._str_pad(width, side=side, fillchar=fillchar) + return self._wrap_result(result) + + _shared_docs[ + "str_pad" + ] = """ + Pad %(side)s side of strings in the Series/Index. + + Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with ``fillchar``. + fillchar : str + Additional character for filling, default is whitespace. + + Returns + ------- + filled : Series/Index of objects. 
+ """ + + @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"}) + @forbid_nonstring_types(["bytes"]) + def center(self, width, fillchar=" "): + return self.pad(width, side="both", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"}) + @forbid_nonstring_types(["bytes"]) + def ljust(self, width, fillchar=" "): + return self.pad(width, side="right", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"}) + @forbid_nonstring_types(["bytes"]) + def rjust(self, width, fillchar=" "): + return self.pad(width, side="left", fillchar=fillchar) + + @forbid_nonstring_types(["bytes"]) + def zfill(self, width): + """ + Pad strings in the Series/Index by prepending '0' characters. + + Strings in the Series/Index are padded with '0' characters on the + left of the string to reach a total string length `width`. Strings + in the Series/Index with length greater or equal to `width` are + unchanged. + + Parameters + ---------- + width : int + Minimum length of resulting string; strings with length less + than `width` be prepended with '0' characters. + + Returns + ------- + Series/Index of objects. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. + Series.str.pad : Fills the specified sides of strings with an arbitrary + character. + Series.str.center : Fills both sides of strings with an arbitrary + character. + + Notes + ----- + Differs from :meth:`str.zfill` which has special handling + for '+'/'-' in the string. + + Examples + -------- + >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) + >>> s + 0 -1 + 1 1 + 2 1000 + 3 10 + 4 NaN + dtype: object + + Note that ``10`` and ``NaN`` are not strings, therefore they are + converted to ``NaN``. The minus sign in ``'-1'`` is treated as a + regular character and the zero is added to the left of it + (:meth:`str.zfill` would have moved it to the left). ``1000`` + remains unchanged as it is longer than `width`. + + >>> s.str.zfill(3) + 0 0-1 + 1 001 + 2 1000 + 3 NaN + 4 NaN + dtype: object + """ + result = self.pad(width, side="left", fillchar="0") + return self._wrap_result(result) + + def slice(self, start=None, stop=None, step=None): + """ + Slice substrings from each element in the Series or Index. + + Parameters + ---------- + start : int, optional + Start position for slice operation. + stop : int, optional + Stop position for slice operation. + step : int, optional + Step size for slice operation. + + Returns + ------- + Series or Index of object + Series or Index from sliced substring from original string object. + + See Also + -------- + Series.str.slice_replace : Replace a slice with a string. + Series.str.get : Return element at position. + Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` + being the position. 
+ + Examples + -------- + >>> s = pd.Series(["koala", "fox", "chameleon"]) + >>> s + 0 koala + 1 fox + 2 chameleon + dtype: object + + >>> s.str.slice(start=1) + 0 oala + 1 ox + 2 hameleon + dtype: object + + >>> s.str.slice(start=-1) + 0 a + 1 x + 2 n + dtype: object + + >>> s.str.slice(stop=2) + 0 ko + 1 fo + 2 ch + dtype: object + + >>> s.str.slice(step=2) + 0 kaa + 1 fx + 2 caeen + dtype: object + + >>> s.str.slice(start=0, stop=5, step=3) + 0 kl + 1 f + 2 cm + dtype: object + + Equivalent behaviour to: + + >>> s.str[0:5:3] + 0 kl + 1 f + 2 cm + dtype: object + """ + result = self._array._str_slice(start, stop, step) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def slice_replace(self, start=None, stop=None, repl=None): + """ + Replace a positional slice of a string with another value. + + Parameters + ---------- + start : int, optional + Left index position to use for the slice. If not specified (None), + the slice is unbounded on the left, i.e. slice from the start + of the string. + stop : int, optional + Right index position to use for the slice. If not specified (None), + the slice is unbounded on the right, i.e. slice until the + end of the string. + repl : str, optional + String for replacement. If not specified (None), the sliced region + is replaced with an empty string. + + Returns + ------- + Series or Index + Same type as the original object. + + See Also + -------- + Series.str.slice : Just slicing without replacement. + + Examples + -------- + >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s + 0 a + 1 ab + 2 abc + 3 abdc + 4 abcde + dtype: object + + Specify just `start`, meaning replace `start` until the end of the + string with `repl`. + + >>> s.str.slice_replace(1, repl='X') + 0 aX + 1 aX + 2 aX + 3 aX + 4 aX + dtype: object + + Specify just `stop`, meaning the start of the string to `stop` is replaced + with `repl`, and the rest of the string is included. + + >>> s.str.slice_replace(stop=2, repl='X') + 0 X + 1 X + 2 Xc + 3 Xdc + 4 Xcde + dtype: object + + Specify `start` and `stop`, meaning the slice from `start` to `stop` is + replaced with `repl`. Everything before or after `start` and `stop` is + included as is. + + >>> s.str.slice_replace(start=1, stop=3, repl='X') + 0 aX + 1 aX + 2 aX + 3 aXc + 4 aXde + dtype: object + """ + result = self._array._str_slice_replace(start, stop, repl) + return self._wrap_result(result) + + def decode(self, encoding, errors="strict"): + """ + Decode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in + python3. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + Series or Index + """ + # TODO: Add a similar _bytes interface. + if encoding in _cpython_optimized_decoders: + # CPython optimized implementation + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] + arr = self._array + # assert isinstance(arr, (StringArray,)) + result = arr._str_map(f) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def encode(self, encoding, errors="strict"): + """ + Encode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.encode`. 
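+
+ A round-trip sketch (an editorial addition, assuming ASCII-safe input;
+ ``encode`` produces bytes that ``decode`` turns back into strings):
+
+ >>> s = pd.Series(['koala'])
+ >>> s.str.encode('ascii').str.decode('ascii')
+ 0 koala
+ dtype: object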
+ + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + encoded : Series/Index of objects + """ + result = self._array._str_encode(encoding, errors) + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "str_strip" + ] = r""" + Remove %(position)s characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the Series/Index from %(side)s. + Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. + All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. + + Examples + -------- + >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) + >>> s + 0 1. Ant. + 1 2. Bee!\n + 2 3. Cat?\t + 3 NaN + dtype: object + + >>> s.str.strip() + 0 1. Ant. + 1 2. Bee! + 2 3. Cat? + 3 NaN + dtype: object + + >>> s.str.lstrip('123.') + 0 Ant. + 1 Bee!\n + 2 Cat?\t + 3 NaN + dtype: object + + >>> s.str.rstrip('.!? \n\t') + 0 1. Ant + 1 2. Bee + 2 3. Cat + 3 NaN + dtype: object + + >>> s.str.strip('123.!? \n\t') + 0 Ant + 1 Bee + 2 Cat + 3 NaN + dtype: object + """ + + @Appender( + _shared_docs["str_strip"] + % { + "side": "left and right sides", + "method": "strip", + "position": "leading and trailing", + } + ) + @forbid_nonstring_types(["bytes"]) + def strip(self, to_strip=None): + result = self._array._str_strip(to_strip) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_strip"] + % {"side": "left side", "method": "lstrip", "position": "leading"} + ) + @forbid_nonstring_types(["bytes"]) + def lstrip(self, to_strip=None): + result = self._array._str_lstrip(to_strip) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_strip"] + % {"side": "right side", "method": "rstrip", "position": "trailing"} + ) + @forbid_nonstring_types(["bytes"]) + def rstrip(self, to_strip=None): + result = self._array._str_rstrip(to_strip) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def wrap(self, width, **kwargs): + r""" + Wrap strings in Series/Index at specified line width. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. + + Parameters + ---------- + width : int + Maximum line width. + expand_tabs : bool, optional + If True, tab characters will be expanded to spaces (default: True). + replace_whitespace : bool, optional + If True, each whitespace character (as defined by string.whitespace) + remaining after tab expansion will be replaced by a single space + (default: True). + drop_whitespace : bool, optional + If True, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True). + break_long_words : bool, optional + If True, then words longer than width will be broken in order to ensure + that no lines are longer than width. If it is false, long words will + not be broken, and some lines may be longer than width (default: True). + break_on_hyphens : bool, optional + If True, wrapping will occur preferably on whitespace and right after + hyphens in compound words, as it is customary in English. 
If false,
+ only whitespaces will be considered as potentially good places for line
+ breaks, but you need to set break_long_words to false if you want truly
+ insecable words (default: True).
+
+ Returns
+ -------
+ Series or Index
+
+ Notes
+ -----
+ Internally, this method uses a :class:`textwrap.TextWrapper` instance with
+ default settings. To achieve behavior matching R's stringr library str_wrap
+ function, use the arguments:
+
+ - expand_tabs = False
+ - replace_whitespace = True
+ - drop_whitespace = True
+ - break_long_words = False
+ - break_on_hyphens = False
+
+ Examples
+ --------
+ >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
+ >>> s.str.wrap(12)
+ 0 line to be\nwrapped
+ 1 another line\nto be\nwrapped
+ dtype: object
+ """
+ result = self._array._str_wrap(width, **kwargs)
+ return self._wrap_result(result)
+
+ @forbid_nonstring_types(["bytes"])
+ def get_dummies(self, sep="|"):
+ """
+ Return DataFrame of dummy/indicator variables for Series.
+
+ Each string in Series is split by sep and returned as a DataFrame
+ of dummy/indicator variables.
+
+ Parameters
+ ----------
+ sep : str, default "|"
+ String to split on.
+
+ Returns
+ -------
+ DataFrame
+ Dummy variables corresponding to values of the Series.
+
+ See Also
+ --------
+ get_dummies : Convert categorical variable into dummy/indicator
+ variables.
+
+ Examples
+ --------
+ >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
+ a b c
+ 0 1 1 0
+ 1 1 0 0
+ 2 1 0 1
+
+ >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
+ a b c
+ 0 1 1 0
+ 1 0 0 0
+ 2 1 0 1
+ """
+ # we need to cast to Series of strings as only that has all
+ # methods available for making the dummies...
+ result, name = self._array._str_get_dummies(sep)
+ return self._wrap_result(
+ result,
+ name=name,
+ expand=True,
+ returns_string=False,
+ )
+
+ @forbid_nonstring_types(["bytes"])
+ def translate(self, table):
+ """
+ Map all characters in the string through the given mapping table.
+
+ Equivalent to standard :meth:`str.translate`.
+
+ Parameters
+ ----------
+ table : dict
+ Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
+ None. Unmapped characters are left untouched.
+ Characters mapped to None are deleted. :meth:`str.maketrans` is a
+ helper function for making translation tables.
+
+ Returns
+ -------
+ Series or Index
+ """
+ result = self._array._str_translate(table)
+ return self._wrap_result(result)
+
+ @forbid_nonstring_types(["bytes"])
+ def count(self, pat, flags=0):
+ r"""
+ Count occurrences of pattern in each string of the Series/Index.
+
+ This function is used to count the number of times a particular regex
+ pattern is repeated in each of the string elements of the
+ :class:`~pandas.Series`.
+
+ Parameters
+ ----------
+ pat : str
+ Valid regular expression.
+ flags : int, default 0, meaning no flags
+ Flags for the `re` module. For a complete list, `see here
+ <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+
+ Returns
+ -------
+ Series or Index
+ Same type as the calling object containing the integer counts.
+
+ See Also
+ --------
+ re : Standard library module for regular expressions.
+ str.count : Standard library version, without regular expression support.
+
+ Notes
+ -----
+ Some characters need to be escaped when passing in `pat`.
+ e.g. ``'$'`` has a special meaning in regex and must be escaped when
+ finding this literal character.
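+
+ As an editorial aside, :func:`re.escape` can build such escaped patterns
+ programmatically (a sketch, assuming object dtype):
+
+ >>> import re
+ >>> pd.Series(['$5', '5']).str.count(re.escape('$'))
+ 0 1
+ 1 0
+ dtype: int64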
+ + Examples + -------- + >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) + >>> s.str.count('a') + 0 0.0 + 1 0.0 + 2 2.0 + 3 2.0 + 4 NaN + 5 0.0 + 6 1.0 + dtype: float64 + + Escape ``'$'`` to find the literal dollar sign. + + >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) + >>> s.str.count('\\$') + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 0 + dtype: int64 + + This is also available on Index + + >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') + Int64Index([0, 0, 2, 1], dtype='int64') + """ + result = self._array._str_count(pat, flags) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def startswith(self, pat, na=None): + """ + Test if the start of each string element matches a pattern. + + Equivalent to :meth:`str.startswith`. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + na : object, default NaN + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the start of each string element. + + See Also + -------- + str.startswith : Python standard library string method. + Series.str.endswith : Same as startswith, but tests the end of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) + >>> s + 0 bat + 1 Bear + 2 cat + 3 NaN + dtype: object + + >>> s.str.startswith('b') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.startswith('b', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + result = self._array._str_startswith(pat, na=na) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def endswith(self, pat, na=None): + """ + Test if the end of each string element matches a pattern. + + Equivalent to :meth:`str.endswith`. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + na : object, default NaN + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the end of each string element. + + See Also + -------- + str.endswith : Python standard library string method. + Series.str.startswith : Same as endswith, but tests the start of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) + >>> s + 0 bat + 1 bear + 2 caT + 3 NaN + dtype: object + + >>> s.str.endswith('t') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.endswith('t', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + result = self._array._str_endswith(pat, na=na) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def findall(self, pat, flags=0): + """ + Find all occurrences of pattern or regular expression in the Series/Index. + + Equivalent to applying :func:`re.findall` to all the elements in the + Series/Index. 
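+
+ (Editor's sketch, mirroring :func:`re.findall`: when `pat` contains
+ capture groups, each match is returned as a tuple of groups.)
+
+ >>> pd.Series(['a1b2']).str.findall(r'([ab])(\d)')
+ 0 [(a, 1), (b, 2)]
+ dtype: object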
+ + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 + Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which + means no flags). + + Returns + ------- + Series/Index of lists of strings + All non-overlapping matches of pattern or regular expression in each + string of this Series/Index. + + See Also + -------- + count : Count occurrences of pattern or regular expression in each string + of the Series/Index. + extractall : For each string in the Series, extract groups from all matches + of regular expression and return a DataFrame with one row for each + match and one column for each group. + re.findall : The equivalent ``re`` function to all non-overlapping matches + of pattern or regular expression in string, as a list of strings. + + Examples + -------- + >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + + The search for the pattern 'Monkey' returns one match: + + >>> s.str.findall('Monkey') + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + On the other hand, the search for the pattern 'MONKEY' doesn't return any + match: + + >>> s.str.findall('MONKEY') + 0 [] + 1 [] + 2 [] + dtype: object + + Flags can be added to the pattern or regular expression. For instance, + to find the pattern 'MONKEY' ignoring the case: + + >>> import re + >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + When the pattern matches more than one string in the Series, all matches + are returned: + + >>> s.str.findall('on') + 0 [on] + 1 [on] + 2 [] + dtype: object + + Regular expressions are supported too. For instance, the search for all the + strings ending with the word 'on' is shown next: + + >>> s.str.findall('on$') + 0 [on] + 1 [] + 2 [] + dtype: object + + If the pattern is found more than once in the same string, then a list of + multiple strings is returned: + + >>> s.str.findall('b') + 0 [] + 1 [] + 2 [b, b] + dtype: object + """ + result = self._array._str_findall(pat, flags) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def extract(self, pat, flags=0, expand=True): + r""" + Extract capture groups in the regex `pat` as columns in a DataFrame. + + For each subject string in the Series, extract groups from the + first match of regular expression `pat`. + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that + modify regular expression matching for things like case, + spaces, etc. For more details, see :mod:`re`. + expand : bool, default True + If True, return DataFrame with one column per capture group. + If False, return a Series/Index if there is one capture group + or DataFrame if there are multiple capture groups. + + Returns + ------- + DataFrame or Series or Index + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. The dtype of each result + column is always object, even when no match is found. If + ``expand=False`` and pat has only one capture group, then + return a Series (if subject is a Series) or Index (if subject + is an Index). + + See Also + -------- + extractall : Returns all matches (not just the first match). + + Examples + -------- + A pattern with two groups will return a DataFrame with two columns. + Non-matches will be NaN. 
+
+ >>> s = pd.Series(['a1', 'b2', 'c3'])
+ >>> s.str.extract(r'([ab])(\d)')
+ 0 1
+ 0 a 1
+ 1 b 2
+ 2 NaN NaN
+
+ A pattern may contain optional groups.
+
+ >>> s.str.extract(r'([ab])?(\d)')
+ 0 1
+ 0 a 1
+ 1 b 2
+ 2 NaN 3
+
+ Named groups will become column names in the result.
+
+ >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
+ letter digit
+ 0 a 1
+ 1 b 2
+ 2 NaN NaN
+
+ A pattern with one group will return a DataFrame with one column
+ if expand=True.
+
+ >>> s.str.extract(r'[ab](\d)', expand=True)
+ 0
+ 0 1
+ 1 2
+ 2 NaN
+
+ A pattern with one group will return a Series if expand=False.
+
+ >>> s.str.extract(r'[ab](\d)', expand=False)
+ 0 1
+ 1 2
+ 2 NaN
+ dtype: object
+ """
+ # TODO: dispatch
+ return str_extract(self, pat, flags, expand=expand)
+
+ @forbid_nonstring_types(["bytes"])
+ def extractall(self, pat, flags=0):
+ r"""
+ Extract capture groups in the regex `pat` as columns in DataFrame.
+
+ For each subject string in the Series, extract groups from all
+ matches of regular expression pat. When each subject string in the
+ Series has exactly one match, extractall(pat).xs(0, level='match')
+ is the same as extract(pat).
+
+ Parameters
+ ----------
+ pat : str
+ Regular expression pattern with capturing groups.
+ flags : int, default 0 (no flags)
+ A ``re`` module flag, for example ``re.IGNORECASE``. These allow
+ you to modify regular expression matching for things like case, spaces,
+ etc. Multiple flags can be combined with the bitwise OR operator,
+ for example ``re.IGNORECASE | re.MULTILINE``.
+
+ Returns
+ -------
+ DataFrame
+ A ``DataFrame`` with one row for each match, and one column for each
+ group. Its rows have a ``MultiIndex`` with first levels that come from
+ the subject ``Series``. The last level is named 'match' and indexes the
+ matches in each item of the ``Series``. Any capture group names in
+ regular expression pat will be used for column names; otherwise capture
+ group numbers will be used.
+
+ See Also
+ --------
+ extract : Returns first match only (not all matches).
+
+ Examples
+ --------
+ A pattern with one group will return a DataFrame with one column.
+ Indices with no matches will not appear in the result.
+
+ >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
+ >>> s.str.extractall(r"[ab](\d)")
+ 0
+ match
+ A 0 1
+ 1 2
+ B 0 1
+
+ Capture group names are used for column names of the result.
+
+ >>> s.str.extractall(r"[ab](?P<digit>\d)")
+ digit
+ match
+ A 0 1
+ 1 2
+ B 0 1
+
+ A pattern with two groups will return a DataFrame with two columns.
+
+ >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
+ letter digit
+ match
+ A 0 a 1
+ 1 a 2
+ B 0 b 1
+
+ Optional groups that do not match are NaN in the result.
+
+ >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
+ letter digit
+ match
+ A 0 a 1
+ 1 a 2
+ B 0 b 1
+ C 0 NaN 1
+ """
+ # TODO: dispatch
+ return str_extractall(self._orig, pat, flags)
+
+ _shared_docs[
+ "find"
+ ] = """
+ Return %(side)s indexes in each string in the Series/Index.
+
+ Each of the returned indexes corresponds to the position where the
+ substring is fully contained between [start:end]. Return -1 on
+ failure. Equivalent to standard :meth:`str.%(method)s`.
+
+ Parameters
+ ----------
+ sub : str
+ Substring being searched.
+ start : int
+ Left edge index.
+ end : int
+ Right edge index.
+
+ Returns
+ -------
+ Series or Index of int.
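+
+ (An editorial sketch of both variants, assuming object dtype:)
+
+ >>> s = pd.Series(['abcba'])
+ >>> s.str.find('b')
+ 0 1
+ dtype: int64
+ >>> s.str.rfind('b')
+ 0 3
+ dtype: int64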
+
+ See Also
+ --------
+ %(also)s
+ """
+
+ @Appender(
+ _shared_docs["find"]
+ % {
+ "side": "lowest",
+ "method": "find",
+ "also": "rfind : Return highest indexes in each string.",
+ }
+ )
+ @forbid_nonstring_types(["bytes"])
+ def find(self, sub, start=0, end=None):
+ if not isinstance(sub, str):
+ msg = f"expected a string object, not {type(sub).__name__}"
+ raise TypeError(msg)
+
+ result = self._array._str_find(sub, start, end)
+ return self._wrap_result(result, returns_string=False)
+
+ @Appender(
+ _shared_docs["find"]
+ % {
+ "side": "highest",
+ "method": "rfind",
+ "also": "find : Return lowest indexes in each string.",
+ }
+ )
+ @forbid_nonstring_types(["bytes"])
+ def rfind(self, sub, start=0, end=None):
+ if not isinstance(sub, str):
+ msg = f"expected a string object, not {type(sub).__name__}"
+ raise TypeError(msg)
+
+ result = self._array._str_rfind(sub, start=start, end=end)
+ return self._wrap_result(result, returns_string=False)
+
+ @forbid_nonstring_types(["bytes"])
+ def normalize(self, form):
+ """
+ Return the Unicode normal form for the strings in the Series/Index.
+
+ For more information on the forms, see
+ :func:`unicodedata.normalize`.
+
+ Parameters
+ ----------
+ form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
+ Unicode form.
+
+ Returns
+ -------
+ normalized : Series/Index of objects
+ """
+ result = self._array._str_normalize(form)
+ return self._wrap_result(result)
+
+ _shared_docs[
+ "index"
+ ] = """
+ Return %(side)s indexes in each string in the Series/Index.
+
+ Each of the returned indexes corresponds to the position where the
+ substring is fully contained between [start:end]. This is the same
+ as ``str.%(similar)s`` except instead of returning -1, it raises a
+ ValueError when the substring is not found. Equivalent to standard
+ ``str.%(method)s``.
+
+ Parameters
+ ----------
+ sub : str
+ Substring being searched.
+ start : int
+ Left edge index.
+ end : int
+ Right edge index.
+
+ Returns
+ -------
+ Series or Index of object
+
+ See Also
+ --------
+ %(also)s
+ """
+
+ @Appender(
+ _shared_docs["index"]
+ % {
+ "side": "lowest",
+ "similar": "find",
+ "method": "index",
+ "also": "rindex : Return highest indexes in each string.",
+ }
+ )
+ @forbid_nonstring_types(["bytes"])
+ def index(self, sub, start=0, end=None):
+ if not isinstance(sub, str):
+ msg = f"expected a string object, not {type(sub).__name__}"
+ raise TypeError(msg)
+
+ result = self._array._str_index(sub, start=start, end=end)
+ return self._wrap_result(result, returns_string=False)
+
+ @Appender(
+ _shared_docs["index"]
+ % {
+ "side": "highest",
+ "similar": "rfind",
+ "method": "rindex",
+ "also": "index : Return lowest indexes in each string.",
+ }
+ )
+ @forbid_nonstring_types(["bytes"])
+ def rindex(self, sub, start=0, end=None):
+ if not isinstance(sub, str):
+ msg = f"expected a string object, not {type(sub).__name__}"
+ raise TypeError(msg)
+
+ result = self._array._str_rindex(sub, start=start, end=end)
+ return self._wrap_result(result, returns_string=False)
+
+ def len(self):
+ """
+ Compute the length of each element in the Series/Index.
+
+ The element may be a sequence (such as a string, tuple or list) or a collection
+ (such as a dictionary).
+
+ Returns
+ -------
+ Series or Index of int
+ A Series or Index of integer values indicating the length of each
+ element in the Series or Index.
+
+ See Also
+ --------
+ str.len : Python built-in function returning the length of an object.
+ Series.size : Returns the length of the Series.
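+
+ (Editor's note, a hedged sketch: with the nullable ``StringDtype``, the
+ result uses the nullable ``Int64`` dtype rather than ``float64``.)
+
+ >>> pd.Series(['dog', None], dtype='string').str.len()
+ 0 1
+ 1 <NA>
+ dtype: Int64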
+ + Examples + -------- + Returns the length (number of characters) in a string. Returns the + number of entries for dictionaries, lists or tuples. + + >>> s = pd.Series(['dog', + ... '', + ... 5, + ... {'foo' : 'bar'}, + ... [2, 3, 5, 7], + ... ('one', 'two', 'three')]) + >>> s + 0 dog + 1 + 2 5 + 3 {'foo': 'bar'} + 4 [2, 3, 5, 7] + 5 (one, two, three) + dtype: object + >>> s.str.len() + 0 3.0 + 1 0.0 + 2 NaN + 3 1.0 + 4 4.0 + 5 3.0 + dtype: float64 + """ + result = self._array._str_len() + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "casemethods" + ] = """ + Convert strings in the Series/Index to %(type)s. + %(version)s + Equivalent to :meth:`str.%(method)s`. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.lower : Converts all characters to lowercase. + Series.str.upper : Converts all characters to uppercase. + Series.str.title : Converts first character of each word to uppercase and + remaining to lowercase. + Series.str.capitalize : Converts first character to uppercase and + remaining to lowercase. + Series.str.swapcase : Converts uppercase to lowercase and lowercase to + uppercase. + Series.str.casefold: Removes all case distinctions in the string. + + Examples + -------- + >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) + >>> s + 0 lower + 1 CAPITALS + 2 this is a sentence + 3 SwApCaSe + dtype: object + + >>> s.str.lower() + 0 lower + 1 capitals + 2 this is a sentence + 3 swapcase + dtype: object + + >>> s.str.upper() + 0 LOWER + 1 CAPITALS + 2 THIS IS A SENTENCE + 3 SWAPCASE + dtype: object + + >>> s.str.title() + 0 Lower + 1 Capitals + 2 This Is A Sentence + 3 Swapcase + dtype: object + + >>> s.str.capitalize() + 0 Lower + 1 Capitals + 2 This is a sentence + 3 Swapcase + dtype: object + + >>> s.str.swapcase() + 0 LOWER + 1 capitals + 2 THIS IS A SENTENCE + 3 sWaPcAsE + dtype: object + """ + # Types: + # cases: + # upper, lower, title, capitalize, swapcase, casefold + # boolean: + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # _doc_args holds dict of strings to use in substituting casemethod docs + _doc_args: Dict[str, Dict[str, str]] = {} + _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} + _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} + _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} + _doc_args["capitalize"] = { + "type": "be capitalized", + "method": "capitalize", + "version": "", + } + _doc_args["swapcase"] = { + "type": "be swapcased", + "method": "swapcase", + "version": "", + } + _doc_args["casefold"] = { + "type": "be casefolded", + "method": "casefold", + "version": "\n .. 
versionadded:: 0.25.0\n", + } + + @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) + @forbid_nonstring_types(["bytes"]) + def lower(self): + result = self._array._str_lower() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) + @forbid_nonstring_types(["bytes"]) + def upper(self): + result = self._array._str_upper() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["title"]) + @forbid_nonstring_types(["bytes"]) + def title(self): + result = self._array._str_title() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) + @forbid_nonstring_types(["bytes"]) + def capitalize(self): + result = self._array._str_capitalize() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) + @forbid_nonstring_types(["bytes"]) + def swapcase(self): + result = self._array._str_swapcase() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) + @forbid_nonstring_types(["bytes"]) + def casefold(self): + result = self._array._str_casefold() + return self._wrap_result(result) + + _shared_docs[ + "ismethods" + ] = """ + Check whether all characters in each string are %(type)s. + + This is equivalent to running the Python string method + :meth:`str.%(method)s` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns + ------- + Series or Index of bool + Series or Index of boolean values with the same length as the original + Series/Index. + + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + **Checks for Alphabetic and Numeric Characters** + + >>> s1 = pd.Series(['one', 'one1', '1', '']) + + >>> s1.str.isalpha() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + >>> s1.str.isnumeric() + 0 False + 1 False + 2 True + 3 False + dtype: bool + + >>> s1.str.isalnum() + 0 True + 1 True + 2 True + 3 False + dtype: bool + + Note that checks against characters mixed with any additional punctuation + or whitespace will evaluate to false for an alphanumeric check. + + >>> s2 = pd.Series(['A B', '1.5', '3,000']) + >>> s2.str.isalnum() + 0 False + 1 False + 2 False + dtype: bool + + **More Detailed Checks for Numeric Characters** + + There are several different but overlapping sets of numeric characters that + can be checked for. + + >>> s3 = pd.Series(['23', '³', '⅕', '']) + + The ``s3.str.isdecimal`` method checks for characters used to form numbers + in base 10. + + >>> s3.str.isdecimal() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also + includes special digits, like superscripted and subscripted digits in + unicode. 
+
+ >>> s3.str.isdigit()
+ 0 True
+ 1 True
+ 2 False
+ 3 False
+ dtype: bool
+
+ The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
+ includes other characters that can represent quantities such as unicode
+ fractions.
+
+ >>> s3.str.isnumeric()
+ 0 True
+ 1 True
+ 2 True
+ 3 False
+ dtype: bool
+
+ **Checks for Whitespace**
+
+ >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
+ >>> s4.str.isspace()
+ 0 True
+ 1 True
+ 2 False
+ dtype: bool
+
+ **Checks for Character Case**
+
+ >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
+
+ >>> s5.str.islower()
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ dtype: bool
+
+ >>> s5.str.isupper()
+ 0 False
+ 1 False
+ 2 True
+ 3 False
+ dtype: bool
+
+ The ``s5.str.istitle`` method checks for whether all words are in title
+ case (whether only the first letter of each word is capitalized). Words are
+ assumed to be any sequence of non-numeric characters separated by
+ whitespace characters.
+
+ >>> s5.str.istitle()
+ 0 False
+ 1 True
+ 2 False
+ 3 False
+ dtype: bool
+ """
+ _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
+ _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
+ _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
+ _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
+ _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
+ _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
+ _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
+ _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
+ _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
+ # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)
+
+ isalnum = _map_and_wrap(
+ "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
+ )
+ isalpha = _map_and_wrap(
+ "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
+ )
+ isdigit = _map_and_wrap(
+ "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
+ )
+ isspace = _map_and_wrap(
+ "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
+ )
+ islower = _map_and_wrap(
+ "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
+ )
+ isupper = _map_and_wrap(
+ "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
+ )
+ istitle = _map_and_wrap(
+ "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
+ )
+ isnumeric = _map_and_wrap(
+ "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
+ )
+ isdecimal = _map_and_wrap(
+ "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
+ )
+
+
+def cat_safe(list_of_columns: List, sep: str):
+ """
+ Auxiliary function for :meth:`str.cat`.
+
+ Same signature as cat_core, but handles TypeErrors in concatenation, which
+ happen if the arrays in list_of_columns have the wrong dtypes or content.
+
+ Parameters
+ ----------
+ list_of_columns : list of numpy arrays
+ List of arrays to be concatenated with sep;
+ these arrays may not contain NaNs!
+ sep : string
+ The separator string for concatenating the columns.
+
+ Returns
+ -------
+ np.ndarray
+ The concatenation of list_of_columns with sep.
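+
+ Notes
+ -----
+ (Editorial sketch of the underlying ``cat_core`` interleaving, assuming
+ all-string inputs:)
+
+ >>> import numpy as np
+ >>> cat_core([np.array(['a', 'b']), np.array(['c', 'd'])], '-')
+ array(['a-c', 'b-d'], dtype=object)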
+ """ + try: + result = cat_core(list_of_columns, sep) + except TypeError: + # if there are any non-string values (wrong dtype or hidden behind + # object dtype), np.sum will fail; catch and return with better message + for column in list_of_columns: + dtype = lib.infer_dtype(column, skipna=True) + if dtype not in ["string", "empty"]: + raise TypeError( + "Concatenation requires list-likes containing only " + "strings (or missing values). Offending values found in " + f"column {dtype}" + ) from None + return result + + +def cat_core(list_of_columns: List, sep: str): + """ + Auxiliary function for :meth:`str.cat` + + Parameters + ---------- + list_of_columns : list of numpy arrays + List of arrays to be concatenated with sep; + these arrays may not contain NaNs! + sep : string + The separator string for concatenating the columns. + + Returns + ------- + nd.array + The concatenation of list_of_columns with sep. + """ + if sep == "": + # no need to interleave sep if it is empty + arr_of_cols = np.asarray(list_of_columns, dtype=object) + return np.sum(arr_of_cols, axis=0) + list_with_sep = [sep] * (2 * len(list_of_columns) - 1) + list_with_sep[::2] = list_of_columns + arr_with_sep = np.asarray(list_with_sep, dtype=object) + return np.sum(arr_with_sep, axis=0) + + +def _groups_or_na_fun(regex): + """Used in both extract_noexpand and extract_frame""" + if regex.groups == 0: + raise ValueError("pattern contains no capture groups") + empty_row = [np.nan] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [np.nan if item is None else item for item in m.groups()] + else: + return empty_row + + return f + + +def _result_dtype(arr): + # workaround #27953 + # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails + # when the list of values is empty. + from pandas.core.arrays.string_ import StringDtype + + if isinstance(arr.dtype, StringDtype): + return arr.dtype.name + else: + return object + + +def _get_single_group_name(rx): + try: + return list(rx.groupindex.keys()).pop() + except IndexError: + return None + + +def _str_extract_noexpand(arr, pat, flags=0): + """ + Find groups in each string in the Series using passed regular + expression. This function is called from + str_extract(expand=False), and can return Series, DataFrame, or + Index. + + """ + from pandas import DataFrame, array + + regex = re.compile(pat, flags=flags) + groups_or_na = _groups_or_na_fun(regex) + result_dtype = _result_dtype(arr) + + if regex.groups == 1: + result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) + name = _get_single_group_name(regex) + # not dispatching, so we have to reconstruct here. + result = array(result, dtype=result_dtype) + else: + if isinstance(arr, ABCIndexClass): + raise ValueError("only one regex group is supported with Index") + name = None + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) + columns = [names.get(1 + i, i) for i in range(regex.groups)] + if arr.size == 0: + result = DataFrame(columns=columns, dtype=object) + else: + dtype = _result_dtype(arr) + result = DataFrame( + [groups_or_na(val) for val in arr], + columns=columns, + index=arr.index, + dtype=dtype, + ) + return result, name + + +def _str_extract_frame(arr, pat, flags=0): + """ + For each subject string in the Series, extract groups from the + first match of regular expression pat. This function is called from + str_extract(expand=True), and always returns a DataFrame. 
+
+ """
+ from pandas import DataFrame
+
+ regex = re.compile(pat, flags=flags)
+ groups_or_na = _groups_or_na_fun(regex)
+ names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
+ columns = [names.get(1 + i, i) for i in range(regex.groups)]
+
+ if len(arr) == 0:
+ return DataFrame(columns=columns, dtype=object)
+ try:
+ result_index = arr.index
+ except AttributeError:
+ result_index = None
+ dtype = _result_dtype(arr)
+ return DataFrame(
+ [groups_or_na(val) for val in arr],
+ columns=columns,
+ index=result_index,
+ dtype=dtype,
+ )
+
+
+def str_extract(arr, pat, flags=0, expand=True):
+ if not isinstance(expand, bool):
+ raise ValueError("expand must be True or False")
+ if expand:
+ result = _str_extract_frame(arr._orig, pat, flags=flags)
+ return result.__finalize__(arr._orig, method="str_extract")
+ else:
+ result, name = _str_extract_noexpand(arr._orig, pat, flags=flags)
+ return arr._wrap_result(result, name=name, expand=expand)
+
+
+def str_extractall(arr, pat, flags=0):
+ regex = re.compile(pat, flags=flags)
+ # the regex must contain capture groups.
+ if regex.groups == 0:
+ raise ValueError("pattern contains no capture groups")
+
+ if isinstance(arr, ABCIndexClass):
+ arr = arr.to_series().reset_index(drop=True)
+
+ names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
+ columns = [names.get(1 + i, i) for i in range(regex.groups)]
+ match_list = []
+ index_list = []
+ is_mi = arr.index.nlevels > 1
+
+ for subject_key, subject in arr.items():
+ if isinstance(subject, str):
+
+ if not is_mi:
+ subject_key = (subject_key,)
+
+ for match_i, match_tuple in enumerate(regex.findall(subject)):
+ if isinstance(match_tuple, str):
+ match_tuple = (match_tuple,)
+ na_tuple = [np.NaN if group == "" else group for group in match_tuple]
+ match_list.append(na_tuple)
+ result_key = tuple(subject_key + (match_i,))
+ index_list.append(result_key)
+
+ from pandas import MultiIndex
+
+ index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
+ dtype = _result_dtype(arr)
+
+ result = arr._constructor_expanddim(
+ match_list, index=index, columns=columns, dtype=dtype
+ )
+ return result
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
new file mode 100644
index 0000000000000..08064244a2ff9
--- /dev/null
+++ b/pandas/core/strings/base.py
@@ -0,0 +1,225 @@
+import abc
+from typing import Pattern, Union
+
+import numpy as np
+
+from pandas._typing import Scalar
+
+
+class BaseStringArrayMethods(abc.ABC):
+ """
+ Base class for extension arrays implementing string methods.
+
+ This is where our ExtensionArrays can override the implementation of
+ Series.str.<method>. We don't expect this to work with
+ 3rd-party extension arrays.
+
+ * User calls Series.str.<method>
+ * pandas extracts the extension array from the Series
+ * pandas calls ``extension_array._str_<method>(*args, **kwargs)``
+ * pandas wraps the result, to return to the user.
+
+ See :ref:`Series.str` for the docstring of each method.
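+
+ As an editorial illustration (not part of the original patch; the
+ ``ToyStringArray`` below is hypothetical and relies on the object-dtype
+ fallback added in ``pandas/core/strings/object_array.py``)::
+
+ import numpy as np
+ from pandas.core.strings.object_array import ObjectStringArrayMixin
+
+ class ToyStringArray(ObjectStringArrayMixin):
+ # minimal container: wrap an object ndarray so _str_map can use it
+ def __init__(self, values):
+ self._ndarray = np.asarray(values, dtype=object)
+
+ def __len__(self):
+ return len(self._ndarray)
+
+ def __array__(self, dtype=None):
+ return self._ndarray
+
+ ToyStringArray(["cat", "dog"])._str_upper()
+ # expected: array(['CAT', 'DOG'], dtype=object)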
+ """ + + def _str_getitem(self, key): + if isinstance(key, slice): + return self._str_slice(start=key.start, stop=key.stop, step=key.step) + else: + return self._str_get(key) + + @abc.abstractmethod + def _str_count(self, pat, flags=0): + pass + + @abc.abstractmethod + def _str_pad(self, width, side="left", fillchar=" "): + pass + + @abc.abstractmethod + def _str_contains(self, pat, case=True, flags=0, na=None, regex=True): + pass + + @abc.abstractmethod + def _str_startswith(self, pat, na=None): + pass + + @abc.abstractmethod + def _str_endswith(self, pat, na=None): + pass + + @abc.abstractmethod + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + pass + + @abc.abstractmethod + def _str_repeat(self, repeats): + pass + + @abc.abstractmethod + def _str_match( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + pass + + @abc.abstractmethod + def _str_fullmatch( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = np.nan, + ): + pass + + @abc.abstractmethod + def _str_encode(self, encoding, errors="strict"): + pass + + @abc.abstractmethod + def _str_find(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def _str_rfind(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def _str_findall(self, pat, flags=0): + pass + + @abc.abstractmethod + def _str_get(self, i): + pass + + @abc.abstractmethod + def _str_index(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def _str_rindex(self, sub, start=0, end=None): + pass + + @abc.abstractmethod + def _str_join(self, sep): + pass + + @abc.abstractmethod + def _str_partition(self, sep, expand): + pass + + @abc.abstractmethod + def _str_rpartition(self, sep, expand): + pass + + @abc.abstractmethod + def _str_len(self): + pass + + @abc.abstractmethod + def _str_slice(self, start=None, stop=None, step=None): + pass + + @abc.abstractmethod + def _str_slice_replace(self, start=None, stop=None, repl=None): + pass + + @abc.abstractmethod + def _str_translate(self, table): + pass + + @abc.abstractmethod + def _str_wrap(self, width, **kwargs): + pass + + @abc.abstractmethod + def _str_get_dummies(self, sep="|"): + pass + + @abc.abstractmethod + def _str_isalnum(self): + pass + + @abc.abstractmethod + def _str_isalpha(self): + pass + + @abc.abstractmethod + def _str_isdecimal(self): + pass + + @abc.abstractmethod + def _str_isdigit(self): + pass + + @abc.abstractmethod + def _str_islower(self): + pass + + @abc.abstractmethod + def _str_isnumeric(self): + pass + + @abc.abstractmethod + def _str_isspace(self): + pass + + @abc.abstractmethod + def _str_istitle(self): + pass + + @abc.abstractmethod + def _str_isupper(self): + pass + + @abc.abstractmethod + def _str_capitalize(self): + pass + + @abc.abstractmethod + def _str_casefold(self): + pass + + @abc.abstractmethod + def _str_title(self): + pass + + @abc.abstractmethod + def _str_swapcase(self): + pass + + @abc.abstractmethod + def _str_lower(self): + pass + + @abc.abstractmethod + def _str_upper(self): + pass + + @abc.abstractmethod + def _str_normalize(self, form): + pass + + @abc.abstractmethod + def _str_strip(self, to_strip=None): + pass + + @abc.abstractmethod + def _str_lstrip(self, to_strip=None): + pass + + @abc.abstractmethod + def _str_rstrip(self, to_strip=None): + pass + + @abc.abstractmethod + def _str_split(self, pat=None, n=-1, expand=False): + pass + + @abc.abstractmethod + def _str_rsplit(self, pat=None, n=-1): + pass diff --git 
a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
new file mode 100644
index 0000000000000..a29d84edd3a77
--- /dev/null
+++ b/pandas/core/strings/object_array.py
@@ -0,0 +1,432 @@
+import re
+import textwrap
+from typing import Pattern, Set, Union, cast
+import unicodedata
+import warnings
+
+import numpy as np
+
+import pandas._libs.lib as lib
+import pandas._libs.missing as libmissing
+import pandas._libs.ops as libops
+from pandas._typing import Scalar
+
+from pandas.core.dtypes.common import is_re, is_scalar
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.strings.base import BaseStringArrayMethods
+
+
+class ObjectStringArrayMixin(BaseStringArrayMethods):
+ """
+ String Methods operating on object-dtype ndarrays.
+ """
+
+ _str_na_value = np.nan
+
+ def __len__(self):
+ # For typing, _str_map relies on the object being sized.
+ raise NotImplementedError
+
+ def _str_map(self, f, na_value=None, dtype=None):
+ """
+ Map a callable over valid elements of the array.
+
+ Parameters
+ ----------
+ f : Callable
+ A function to call on each non-NA element.
+ na_value : Scalar, optional
+ The value to set for NA values. Might also be used for the
+ fill value if the callable `f` raises an exception.
+ This defaults to ``self._str_na_value`` which is ``np.nan``
+ for object-dtype and Categorical and ``pd.NA`` for StringArray.
+ dtype : Dtype, optional
+ The dtype of the result array.
+ """
+ arr = self
+ if dtype is None:
+ dtype = np.dtype("object")
+ if na_value is None:
+ na_value = self._str_na_value
+
+ if not len(arr):
+ return np.ndarray(0, dtype=dtype)
+
+ if not isinstance(arr, np.ndarray):
+ arr = np.asarray(arr, dtype=object)
+ mask = isna(arr)
+ convert = not np.all(mask)
+ try:
+ result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
+ except (TypeError, AttributeError) as e:
+ # Reraise the exception if callable `f` got wrong number of args.
+ # The user may want to be warned by this, instead of getting NaN
+ p_err = (
+ r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
+ r"(?(3)required )positional arguments?"
+ )
+
+ if len(e.args) >= 1 and re.search(p_err, e.args[0]):
+ # FIXME: this should be totally avoidable
+ raise e
+
+ def g(x):
+ # This type of fallback behavior can be removed once
+ # we remove object-dtype .str accessor.
+ try:
+ return f(x)
+ except (TypeError, AttributeError):
+ return na_value
+
+ return self._str_map(g, na_value=na_value, dtype=dtype)
+ if na_value is not np.nan:
+ np.putmask(result, mask, na_value)
+ if result.dtype == object:
+ result = lib.maybe_convert_objects(result)
+ return result
+
+ def _str_count(self, pat, flags=0):
+ regex = re.compile(pat, flags=flags)
+ f = lambda x: len(regex.findall(x))
+ return self._str_map(f, dtype="int64")
+
+ def _str_pad(self, width, side="left", fillchar=" "):
+ if side == "left":
+ f = lambda x: x.rjust(width, fillchar)
+ elif side == "right":
+ f = lambda x: x.ljust(width, fillchar)
+ elif side == "both":
+ f = lambda x: x.center(width, fillchar)
+ else: # pragma: no cover
+ raise ValueError("Invalid side")
+ return self._str_map(f)
+
+ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
+ if regex:
+ if not case:
+ flags |= re.IGNORECASE
+
+ regex = re.compile(pat, flags=flags)
+
+ if regex.groups > 0:
+ warnings.warn(
+ "This pattern has match groups. 
To actually get the " + "groups, use str.extract.", + UserWarning, + stacklevel=3, + ) + + f = lambda x: regex.search(x) is not None + else: + if case: + f = lambda x: pat in x + else: + upper_pat = pat.upper() + f = lambda x: upper_pat in x.upper() + return self._str_map(f, na, dtype=np.dtype("bool")) + + def _str_startswith(self, pat, na=None): + f = lambda x: x.startswith(pat) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_endswith(self, pat, na=None): + f = lambda x: x.endswith(pat) + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + # Check whether repl is valid (GH 13438, GH 15055) + if not (isinstance(repl, str) or callable(repl)): + raise TypeError("repl must be a string or callable") + + is_compiled_re = is_re(pat) + if regex: + if is_compiled_re: + if (case is not None) or (flags != 0): + raise ValueError( + "case and flags cannot be set when pat is a compiled regex" + ) + else: + # not a compiled regex + # set default case + if case is None: + case = True + + # add case flag, if provided + if case is False: + flags |= re.IGNORECASE + if is_compiled_re or len(pat) > 1 or flags or callable(repl): + n = n if n >= 0 else 0 + compiled = re.compile(pat, flags=flags) + f = lambda x: compiled.sub(repl=repl, string=x, count=n) + else: + f = lambda x: x.replace(pat, repl, n) + else: + if is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with " + "regex=False" + ) + if callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") + f = lambda x: x.replace(pat, repl, n) + + return self._str_map(f, dtype=str) + + def _str_repeat(self, repeats): + if is_scalar(repeats): + + def scalar_rep(x): + try: + return bytes.__mul__(x, repeats) + except TypeError: + return str.__mul__(x, repeats) + + return self._str_map(scalar_rep, dtype=str) + else: + from pandas.core.arrays.string_ import StringArray + + def rep(x, r): + if x is libmissing.NA: + return x + try: + return bytes.__mul__(x, r) + except TypeError: + return str.__mul__(x, r) + + repeats = np.asarray(repeats, dtype=object) + result = libops.vec_binop(np.asarray(self), repeats, rep) + if isinstance(self, StringArray): + # Not going through map, so we have to do this here. 
+ result = StringArray._from_sequence(result) + return result + + def _str_match( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = None, + ): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.match(x) is not None + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_fullmatch( + self, + pat: Union[str, Pattern], + case: bool = True, + flags: int = 0, + na: Scalar = None, + ): + if not case: + flags |= re.IGNORECASE + + regex = re.compile(pat, flags=flags) + + f = lambda x: regex.fullmatch(x) is not None + return self._str_map(f, na_value=na, dtype=np.dtype(bool)) + + def _str_encode(self, encoding, errors="strict"): + f = lambda x: x.encode(encoding, errors=errors) + return self._str_map(f, dtype=object) + + def _str_find(self, sub, start=0, end=None): + return self._str_find_(sub, start, end, side="left") + + def _str_rfind(self, sub, start=0, end=None): + return self._str_find_(sub, start, end, side="right") + + def _str_find_(self, sub, start, end, side): + if side == "left": + method = "find" + elif side == "right": + method = "rfind" + else: # pragma: no cover + raise ValueError("Invalid side") + + if end is None: + f = lambda x: getattr(x, method)(sub, start) + else: + f = lambda x: getattr(x, method)(sub, start, end) + return self._str_map(f, dtype="int64") + + def _str_findall(self, pat, flags=0): + regex = re.compile(pat, flags=flags) + return self._str_map(regex.findall, dtype="object") + + def _str_get(self, i): + def f(x): + if isinstance(x, dict): + return x.get(i) + elif len(x) > i >= -len(x): + return x[i] + return self._str_na_value + + return self._str_map(f) + + def _str_index(self, sub, start=0, end=None): + if end: + f = lambda x: x.index(sub, start, end) + else: + f = lambda x: x.index(sub, start, end) + return self._str_map(f, dtype="int64") + + def _str_rindex(self, sub, start=0, end=None): + if end: + f = lambda x: x.rindex(sub, start, end) + else: + f = lambda x: x.rindex(sub, start, end) + return self._str_map(f, dtype="int64") + + def _str_join(self, sep): + return self._str_map(sep.join) + + def _str_partition(self, sep, expand): + result = self._str_map(lambda x: x.partition(sep), dtype="object") + return result + + def _str_rpartition(self, sep, expand): + return self._str_map(lambda x: x.rpartition(sep), dtype="object") + + def _str_len(self): + return self._str_map(len, dtype="int64") + + def _str_slice(self, start=None, stop=None, step=None): + obj = slice(start, stop, step) + return self._str_map(lambda x: x[obj]) + + def _str_slice_replace(self, start=None, stop=None, repl=None): + if repl is None: + repl = "" + + def f(x): + if x[start:stop] == "": + local_stop = start + else: + local_stop = stop + y = "" + if start is not None: + y += x[:start] + y += repl + if stop is not None: + y += x[local_stop:] + return y + + return self._str_map(f) + + def _str_split(self, pat=None, n=-1, expand=False): + if pat is None: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if len(pat) == 1: + if n is None or n == 0: + n = -1 + f = lambda x: x.split(pat, n) + else: + if n is None or n == -1: + n = 0 + regex = re.compile(pat) + f = lambda x: regex.split(x, maxsplit=n) + return self._str_map(f, dtype=object) + + def _str_rsplit(self, pat=None, n=-1): + if n is None or n == 0: + n = -1 + f = lambda x: x.rsplit(pat, n) + return self._str_map(f, dtype="object") + + def _str_translate(self, table): + return self._str_map(lambda x: 
x.translate(table)) + + def _str_wrap(self, width, **kwargs): + kwargs["width"] = width + tw = textwrap.TextWrapper(**kwargs) + return self._str_map(lambda s: "\n".join(tw.wrap(s))) + + def _str_get_dummies(self, sep="|"): + from pandas import Series + + arr = Series(self).fillna("") + try: + arr = sep + arr + sep + except TypeError: + arr = cast(Series, arr) + arr = sep + arr.astype(str) + sep + arr = cast(Series, arr) + + tags: Set[str] = set() + for ts in Series(arr).str.split(sep): + tags.update(ts) + tags2 = sorted(tags - {""}) + + dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + + for i, t in enumerate(tags2): + pat = sep + t + sep + dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) + return dummies, tags2 + + def _str_upper(self): + return self._str_map(lambda x: x.upper()) + + def _str_isalnum(self): + return self._str_map(str.isalnum, dtype="bool") + + def _str_isalpha(self): + return self._str_map(str.isalpha, dtype="bool") + + def _str_isdecimal(self): + return self._str_map(str.isdecimal, dtype="bool") + + def _str_isdigit(self): + return self._str_map(str.isdigit, dtype="bool") + + def _str_islower(self): + return self._str_map(str.islower, dtype="bool") + + def _str_isnumeric(self): + return self._str_map(str.isnumeric, dtype="bool") + + def _str_isspace(self): + return self._str_map(str.isspace, dtype="bool") + + def _str_istitle(self): + return self._str_map(str.istitle, dtype="bool") + + def _str_isupper(self): + return self._str_map(str.isupper, dtype="bool") + + def _str_capitalize(self): + return self._str_map(str.capitalize) + + def _str_casefold(self): + return self._str_map(str.casefold) + + def _str_title(self): + return self._str_map(str.title) + + def _str_swapcase(self): + return self._str_map(str.swapcase) + + def _str_lower(self): + return self._str_map(str.lower) + + def _str_normalize(self, form): + f = lambda x: unicodedata.normalize(form, x) + return self._str_map(f) + + def _str_strip(self, to_strip=None): + return self._str_map(lambda x: x.strip(to_strip)) + + def _str_lstrip(self, to_strip=None): + return self._str_map(lambda x: x.lstrip(to_strip)) + + def _str_rstrip(self, to_strip=None): + return self._str_map(lambda x: x.rstrip(to_strip)) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0adab143f6052..1553deeef4059 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -16,12 +16,20 @@ import numpy as np -from pandas._libs import tslib, tslibs -from pandas._libs.tslibs import Timestamp, conversion, parsing +from pandas._libs import tslib +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + Timedelta, + Timestamp, + conversion, + iNaT, + nat_strings, + parsing, +) from pandas._libs.tslibs.parsing import ( # noqa DateParseError, - _format_is_iso, - _guess_datetime_format, + format_is_iso, + guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime from pandas._typing import ArrayLike, Label, Timezone @@ -53,8 +61,9 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 - from pandas._libs.tslibs.nattype import NaTType # noqa:F401 + from pandas._libs.tslibs.nattype import NaTType + + from pandas import Series # --------------------------------------------------------------------- # types used in annotations @@ -72,7 +81,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = 
notna(arr).nonzero()[0] if len(non_nan_elements): - return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) + return guess_datetime_format(arr[non_nan_elements[0]], **kwargs) def should_cache( @@ -306,9 +315,7 @@ def _convert_listlike_datetimes( if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == "utc": - # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has - # no attribute "tz_convert" - arg = arg.tz_convert(None).tz_localize(tz) # type: ignore + arg = arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg_dtype): @@ -388,7 +395,7 @@ def _convert_listlike_datetimes( # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case - format_is_iso8601 = _format_is_iso(format) + format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None @@ -405,7 +412,7 @@ def _convert_listlike_datetimes( # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, tslibs.OutOfBoundsDatetime) as err: + except (ValueError, TypeError, OutOfBoundsDatetime) as err: raise ValueError( "cannot convert the input to '%Y%m%d' date format" ) from err @@ -420,13 +427,13 @@ def _convert_listlike_datetimes( return _return_parsed_timezone_results( result, timezones, tz, name ) - except tslibs.OutOfBoundsDatetime: + except OutOfBoundsDatetime: if errors == "raise": raise elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult.fill(tslibs.iNaT) + iresult.fill(iNaT) else: result = arg except ValueError: @@ -439,7 +446,7 @@ def _convert_listlike_datetimes( elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult.fill(tslibs.iNaT) + iresult.fill(iNaT) else: result = arg except ValueError as e: @@ -509,7 +516,7 @@ def _adjust_to_origin(arg, origin, unit): j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): - raise tslibs.OutOfBoundsDatetime( + raise OutOfBoundsDatetime( f"{original} is Out of Bounds for origin='julian'" ) else: @@ -526,10 +533,8 @@ def _adjust_to_origin(arg, origin, unit): # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) - except tslibs.OutOfBoundsDatetime as err: - raise tslibs.OutOfBoundsDatetime( - f"origin {origin} is Out of Bounds" - ) from err + except OutOfBoundsDatetime as err: + raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err except ValueError as err: raise ValueError( f"origin {origin} cannot be converted to a Timestamp" @@ -541,7 +546,7 @@ def _adjust_to_origin(arg, origin, unit): # convert the offset to the unit of the arg # this should be lossless in terms of precision - offset = offset // tslibs.Timedelta(1, unit=unit) + offset = offset // Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): @@ -681,8 +686,6 @@ def to_datetime( used when there are at least 50 values. The presence of out-of-bounds values will render the cache unusable and may slow down parsing. - .. versionadded:: 0.23.0 - .. versionchanged:: 0.25.0 - changed default value from False to True. 
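The underscore-free names above make these parsing helpers part of the semi-public `pandas._libs.tslibs.parsing` namespace. A minimal sketch of what the two renamed helpers do, assuming a build that includes this rename:

    from pandas._libs.tslibs.parsing import format_is_iso, guess_datetime_format

    # Infer a strftime-style format from a sample value; returns None when no
    # consistent format can be guessed.
    fmt = guess_datetime_format("2011-12-30 00:00:00")  # "%Y-%m-%d %H:%M:%S"

    # ISO-8601-like formats let to_datetime take its fast vectorized path.
    assert format_is_iso(fmt)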
@@ -812,7 +815,7 @@ def to_datetime( elif is_list_like(arg): try: cache_array = _maybe_cache(arg, format, cache, convert_listlike) - except tslibs.OutOfBoundsDatetime: + except OutOfBoundsDatetime: # caching attempts to create a DatetimeIndex, which may raise # an OOB. If that's the desired behavior, then just reraise... if errors == "raise": @@ -876,7 +879,7 @@ def _assemble_from_unit_mappings(arg, errors, tz): ------- Series """ - from pandas import to_timedelta, to_numeric, DataFrame + from pandas import DataFrame, to_numeric, to_timedelta arg = DataFrame(arg) if not arg.columns.is_unique: @@ -968,7 +971,7 @@ def calc(carg): def calc_with_mask(carg, mask): result = np.empty(carg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult[~mask] = tslibs.iNaT + iresult[~mask] = iNaT masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) result[mask] = masked_result.astype("M8[ns]") @@ -989,7 +992,7 @@ def calc_with_mask(carg, mask): # string with NaN-like try: - mask = ~algorithms.isin(arg, list(tslibs.nat_strings)) + mask = ~algorithms.isin(arg, list(nat_strings)) return calc_with_mask(arg, mask) except (ValueError, OverflowError, TypeError): pass diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 41548931f17f8..dd7373927ed9b 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -2,7 +2,7 @@ from pandas._libs import lib -from pandas.core.dtypes.cast import maybe_downcast_to_dtype +from pandas.core.dtypes.cast import maybe_downcast_numeric from pandas.core.dtypes.common import ( ensure_object, is_datetime_or_timedelta_dtype, @@ -10,6 +10,7 @@ is_number, is_numeric_dtype, is_scalar, + needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -40,13 +41,13 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. 
- downcast : {'int', 'signed', 'unsigned', 'float'}, default None + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'int' or 'signed': smallest signed int dtype (min.: np.int8) + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) @@ -123,8 +124,9 @@ def to_numeric(arg, errors="raise", downcast=None): values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True - values = arg.asi8 - if values is None: + if needs_i8_conversion(arg.dtype): + values = arg.asi8 + else: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype="O") @@ -178,15 +180,16 @@ def to_numeric(arg, errors="raise", downcast=None): if typecodes is not None: # from smallest to largest for dtype in typecodes: - if np.dtype(dtype).itemsize <= values.dtype.itemsize: - values = maybe_downcast_to_dtype(values, dtype) + dtype = np.dtype(dtype) + if dtype.itemsize <= values.dtype.itemsize: + values = maybe_downcast_numeric(values, dtype) # successful conversion if values.dtype == dtype: break if is_series: - return pd.Series(values, index=arg.index, name=arg.name) + return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index e457a8819f27a..6a9fd7a542a44 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -26,6 +26,11 @@ def to_timedelta(arg, unit=None, errors="raise"): ---------- arg : str, timedelta, list-like or Series The data to be converted to timedelta. + + .. deprecated:: 1.2 + Strings with units 'M', 'Y' and 'y' do not represent + unambiguous timedelta values and will be removed in a future version + unit : str, optional Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``. @@ -61,6 +66,11 @@ def to_timedelta(arg, unit=None, errors="raise"): to_datetime : Convert argument to datetime. convert_dtypes : Convert dtypes. + Notes + ----- + If the precision is higher than nanoseconds, the precision of the duration is + truncated to nanoseconds for string inputs. 
+ Examples -------- Parsing a single string to a Timedelta: @@ -90,7 +100,7 @@ def to_timedelta(arg, unit=None, errors="raise"): unit = parse_timedelta_unit(unit) if errors not in ("ignore", "raise", "coerce"): - raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'}") + raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.") if unit in {"Y", "y", "M"}: raise ValueError( diff --git a/pandas/core/tools/times.py b/pandas/core/tools/times.py index 3bac4cf0edb63..643c1165180b4 100644 --- a/pandas/core/tools/times.py +++ b/pandas/core/tools/times.py @@ -5,11 +5,9 @@ from pandas._libs.lib import is_list_like -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import notna -from pandas.core.indexes.base import Index - def to_time(arg, format=None, infer_time_format=False, errors="raise"): """ @@ -105,7 +103,7 @@ def _convert_listlike(arg, format): elif isinstance(arg, ABCSeries): values = _convert_listlike(arg._values, format) return arg._constructor(values, index=arg.index, name=arg.name) - elif isinstance(arg, Index): + elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) elif is_list_like(arg): return _convert_listlike(arg, format) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1b56b6d5a46fa..df082c7285ae8 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -24,7 +24,7 @@ _default_hash_key = "0123456789123456" -def _combine_hash_arrays(arrays, num_items: int): +def combine_hash_arrays(arrays, num_items: int): """ Parameters ---------- @@ -108,7 +108,7 @@ def hash_pandas_object( for _ in [None] ) arrays = itertools.chain([h], index_iter) - h = _combine_hash_arrays(arrays, 2) + h = combine_hash_arrays(arrays, 2) h = Series(h, index=obj.index, dtype="uint64", copy=False) @@ -131,7 +131,7 @@ def hash_pandas_object( # keep `hashes` specifically a generator to keep mypy happy _hashes = itertools.chain(hashes, index_hash_generator) hashes = (x for x in _hashes) - h = _combine_hash_arrays(hashes, num_items) + h = combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype="uint64", copy=False) else: @@ -175,7 +175,7 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): hashes = ( _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals ) - h = _combine_hash_arrays(hashes, len(vals)) + h = combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] @@ -275,7 +275,7 @@ def hash_array( # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. 
if categorize: - from pandas import factorize, Categorical, Index + from pandas import Categorical, Index, factorize codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index c3f60ea7cc217..ed920c174ea69 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,49 +1,31 @@ """Common utilities for Numba operations""" from distutils.version import LooseVersion -import inspect import types from typing import Callable, Dict, Optional, Tuple import numpy as np -from pandas._typing import FrameOrSeries from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError -NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = dict() +GLOBAL_USE_NUMBA: bool = False +NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = {} -def check_kwargs_and_nopython( - kwargs: Optional[Dict] = None, nopython: Optional[bool] = None -) -> None: - """ - Validate that **kwargs and nopython=True was passed - https://github.com/numba/numba/issues/2916 +def maybe_use_numba(engine: Optional[str]) -> bool: + """Signal whether to use numba routines.""" + return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA) - Parameters - ---------- - kwargs : dict, default None - user passed keyword arguments to pass into the JITed function - nopython : bool, default None - nopython parameter - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - if kwargs and nopython: - raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) +def set_use_numba(enable: bool = False) -> None: + global GLOBAL_USE_NUMBA + if enable: + import_optional_dependency("numba") + GLOBAL_USE_NUMBA = enable def get_jit_arguments( - engine_kwargs: Optional[Dict[str, bool]] = None + engine_kwargs: Optional[Dict[str, bool]] = None, kwargs: Optional[Dict] = None ) -> Tuple[bool, bool, bool]: """ Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. @@ -52,16 +34,27 @@ def get_jit_arguments( ---------- engine_kwargs : dict, default None user passed keyword arguments for numba.JIT + kwargs : dict, default None + user passed keyword arguments to pass into the JITed function Returns ------- (bool, bool, bool) nopython, nogil, parallel + + Raises + ------ + NumbaUtilError """ if engine_kwargs is None: engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) + if kwargs and nopython: + raise NumbaUtilError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) return nopython, nogil, parallel @@ -116,94 +109,3 @@ def impl(data, *_args): return impl return numba_func - - -def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: - """ - Split pandas object into its components as numpy arrays for numba functions. - - Parameters - ---------- - arg : Series or DataFrame - - Returns - ------- - (ndarray, ndarray) - values, index - """ - return arg.to_numpy(), arg.index.to_numpy() - - -def validate_udf(func: Callable) -> None: - """ - Validate user defined function for ops when using Numba. - - The first signature arguments should include: - - def f(values, index, ...): - ... 
- - Parameters - ---------- - func : function, default False - user defined function - - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - udf_signature = list(inspect.signature(func).parameters.keys()) - expected_args = ["values", "index"] - min_number_args = len(expected_args) - if ( - len(udf_signature) < min_number_args - or udf_signature[:min_number_args] != expected_args - ): - raise NumbaUtilError( - f"The first {min_number_args} arguments to {func.__name__} must be " - f"{expected_args}" - ) - - -def generate_numba_func( - func: Callable, - engine_kwargs: Optional[Dict[str, bool]], - kwargs: dict, - cache_key_str: str, -) -> Tuple[Callable, Tuple[Callable, str]]: - """ - Return a JITed function and cache key for the NUMBA_FUNC_CACHE - - This _may_ be specific to groupby (as it's only used there currently). - - Parameters - ---------- - func : function - user defined function - engine_kwargs : dict or None - numba.jit arguments - kwargs : dict - kwargs for func - cache_key_str : str - string representing the second part of the cache key tuple - - Returns - ------- - (JITed function, cache key) - - Raises - ------ - NumbaUtilError - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - check_kwargs_and_nopython(kwargs, nopython) - validate_udf(func) - cache_key = (func, cache_key_str) - numba_func = NUMBA_FUNC_CACHE.get( - cache_key, jit_user_function(func, nopython, nogil, parallel) - ) - return numba_func, cache_key diff --git a/pandas/core/window/__init__.py b/pandas/core/window/__init__.py index 304c61ac0e489..b3d0820fee4da 100644 --- a/pandas/core/window/__init__.py +++ b/pandas/core/window/__init__.py @@ -1,3 +1,6 @@ -from pandas.core.window.ewm import ExponentialMovingWindow # noqa:F401 +from pandas.core.window.ewm import ( # noqa:F401 + ExponentialMovingWindow, + ExponentialMovingWindowGroupby, +) from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 58e7841d4dde5..6ebf610587d30 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,15 +1,14 @@ """Common utility functions for rolling operations""" from collections import defaultdict -from typing import Callable, Optional +from typing import cast import warnings import numpy as np from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.generic import _shared_docs -from pandas.core.groupby.base import GroupByMixin from pandas.core.indexes.api import MultiIndex +from pandas.core.shared_docs import _shared_docs _shared_docs = dict(**_shared_docs) _doc_template = """ @@ -27,72 +26,7 @@ """ -def _dispatch(name: str, *args, **kwargs): - """ - Dispatch to apply. - """ - - def outer(self, *args, **kwargs): - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return getattr(x, name)(*args, **kwargs) - - return self._groupby.apply(f) - - outer.__name__ = name - return outer - - -class WindowGroupByMixin(GroupByMixin): - """ - Provide the groupby facilities. 
- """ - - def __init__(self, obj, *args, **kwargs): - kwargs.pop("parent", None) - groupby = kwargs.pop("groupby", None) - if groupby is None: - groupby, obj = obj, obj.obj - self._groupby = groupby - self._groupby.mutated = True - self._groupby.grouper.mutated = True - super().__init__(obj, *args, **kwargs) - - count = _dispatch("count") - corr = _dispatch("corr", other=None, pairwise=None) - cov = _dispatch("cov", other=None, pairwise=None) - - def _apply( - self, - func: Callable, - center: bool, - require_min_periods: int = 0, - floor: int = 1, - is_weighted: bool = False, - name: Optional[str] = None, - use_numba_cache: bool = False, - **kwargs, - ): - """ - Dispatch to apply; we are stripping all of the _apply kwargs and - performing the original function call on the grouped object. - """ - kwargs.pop("floor", None) - kwargs.pop("original_func", None) - - # TODO: can we de-duplicate with _dispatch? - def f(x, name=name, *args): - x = self._shallow_copy(x) - - if isinstance(name, str): - return getattr(x, name)(*args, **kwargs) - - return x.apply(name, *args, **kwargs) - - return self._groupby.apply(f) - - -def _flex_binary_moment(arg1, arg2, f, pairwise=False): +def flex_binary_moment(arg1, arg2, f, pairwise=False): if not ( isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) @@ -176,6 +110,9 @@ def dataframe_from_int_dict(data, frame_template): # set the index and reorder if arg2.columns.nlevels > 1: + # mypy needs to know columns is a MultiIndex, Index doesn't + # have levels attribute + arg2.columns = cast(MultiIndex, arg2.columns) result.index = MultiIndex.from_product( arg2.columns.levels + [result_index] ) @@ -222,7 +159,7 @@ def dataframe_from_int_dict(data, frame_template): return dataframe_from_int_dict(results, arg1) else: - return _flex_binary_moment(arg2, arg1, f) + return flex_binary_moment(arg2, arg1, f) def zsqrt(x): diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 7a2d8e84bec76..f8237a436f436 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -1,7 +1,7 @@ import datetime from functools import partial from textwrap import dedent -from typing import Optional, Union +from typing import TYPE_CHECKING, Optional, Union import numpy as np @@ -12,12 +12,26 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.common import is_datetime64_ns_dtype -from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.base import DataError import pandas.core.common as common -from pandas.core.window.common import _doc_template, _shared_docs, zsqrt -from pandas.core.window.rolling import _flex_binary_moment, _Rolling +from pandas.core.util.numba_ import maybe_use_numba +from pandas.core.window.common import ( + _doc_template, + _shared_docs, + flex_binary_moment, + zsqrt, +) +from pandas.core.window.indexers import ( + BaseIndexer, + ExponentialMovingWindowIndexer, + GroupbyIndexer, +) +from pandas.core.window.numba_ import generate_numba_groupby_ewma_func +from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch + +if TYPE_CHECKING: + from pandas import Series + _bias_template = """ Parameters @@ -62,7 +76,16 @@ def get_center_of_mass( return float(comass) -class ExponentialMovingWindow(_Rolling): +def wrap_result(obj: "Series", result: np.ndarray) -> "Series": + """ + Wrap a single 1D result. 
+ """ + obj = obj._selected_obj + + return obj._constructor(result, obj.index, name=obj.name) + + +class ExponentialMovingWindow(BaseWindow): r""" Provide exponential weighted (EW) functions. @@ -161,7 +184,7 @@ class ExponentialMovingWindow(_Rolling): ----- More details can be found at: - :ref:`Exponentially weighted windows `. + :ref:`Exponentially weighted windows `. Examples -------- @@ -208,14 +231,16 @@ def __init__( ignore_na: bool = False, axis: int = 0, times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, + **kwargs, ): - self.com: Optional[float] self.obj = obj self.min_periods = max(int(min_periods), 1) self.adjust = adjust self.ignore_na = ignore_na self.axis = axis self.on = None + self.center = False + self.closed = None if times is not None: if isinstance(times, str): times = self._selected_obj[times] @@ -234,7 +259,7 @@ def __init__( if common.count_not_none(com, span, alpha) > 0: self.com = get_center_of_mass(com, span, None, alpha) else: - self.com = None + self.com = 0.0 else: if halflife is not None and isinstance(halflife, (str, datetime.timedelta)): raise ValueError( @@ -249,6 +274,12 @@ def __init__( def _constructor(self): return ExponentialMovingWindow + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds + """ + return ExponentialMovingWindowIndexer() + _agg_see_also_doc = dedent( """ See Also @@ -280,7 +311,6 @@ def _constructor(self): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/Dataframe", axis="", ) @@ -289,44 +319,6 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - def _apply(self, func): - """ - Rolling statistical measure using supplied function. Designed to be - used with passed-in Cython array-based functions. - - Parameters - ---------- - func : str/callable to apply - - Returns - ------- - y : same type as input argument - """ - blocks, obj = self._create_blocks(self._selected_obj) - block_list = list(blocks) - - results = [] - exclude = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError) as err: - if isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") from err - - if values.size == 0: - results.append(values.copy()) - continue - - results.append(np.apply_along_axis(func, self.axis, values)) - - return self._wrap_results(results, block_list, obj, exclude) - @Substitution(name="ewm", func_name="mean") @Appender(_doc_template) def mean(self, *args, **kwargs): @@ -343,7 +335,6 @@ def mean(self, *args, **kwargs): window_func = self._get_roll_func("ewma_time") window_func = partial( window_func, - minp=self.min_periods, times=self.times, halflife=self.halflife, ) @@ -354,7 +345,6 @@ def mean(self, *args, **kwargs): com=self.com, adjust=self.adjust, ignore_na=self.ignore_na, - minp=self.min_periods, ) return self._apply(window_func) @@ -378,13 +368,19 @@ def var(self, bias: bool = False, *args, **kwargs): Exponential weighted moving variance. 
""" nv.validate_window_func("var", args, kwargs) + window_func = self._get_roll_func("ewmcov") + window_func = partial( + window_func, + com=self.com, + adjust=self.adjust, + ignore_na=self.ignore_na, + bias=bias, + ) - def f(arg): - return window_aggregations.ewmcov( - arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias, - ) + def var_func(values, begin, end, min_periods): + return window_func(values, begin, end, min_periods, values) - return self._apply(f) + return self._apply(var_func) @Substitution(name="ewm", func_name="cov") @Appender(_doc_template) @@ -426,16 +422,18 @@ def _get_cov(X, Y): Y = self._shallow_copy(Y) cov = window_aggregations.ewmcov( X._prep_values(), + np.array([0], dtype=np.int64), + np.array([0], dtype=np.int64), + self.min_periods, Y._prep_values(), self.com, self.adjust, self.ignore_na, - self.min_periods, bias, ) - return X._wrap_result(cov) + return wrap_result(X, cov) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -477,7 +475,15 @@ def _get_corr(X, Y): def _cov(x, y): return window_aggregations.ewmcov( - x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1, + x, + np.array([0], dtype=np.int64), + np.array([0], dtype=np.int64), + self.min_periods, + y, + self.com, + self.adjust, + self.ignore_na, + 1, ) x_values = X._prep_values() @@ -487,8 +493,83 @@ def _cov(x, y): x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) corr = cov / zsqrt(x_var * y_var) - return X._wrap_result(corr) + return wrap_result(X, corr) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) + + +class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): + """ + Provide an exponential moving window groupby implementation. + """ + + def _get_window_indexer(self) -> GroupbyIndexer: + """ + Return an indexer class that will compute the window start and end bounds + + Returns + ------- + GroupbyIndexer + """ + window_indexer = GroupbyIndexer( + groupby_indicies=self._groupby.indices, + window_indexer=ExponentialMovingWindowIndexer, + ) + return window_indexer + + var = dispatch("var", bias=False) + std = dispatch("std", bias=False) + cov = dispatch("cov", other=None, pairwise=None, bias=False) + corr = dispatch("corr", other=None, pairwise=None) + + def mean(self, engine=None, engine_kwargs=None): + """ + Parameters + ---------- + engine : str, default None + * ``'cython'`` : Runs mean through C-extensions from cython. + * ``'numba'`` : Runs mean through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.2.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}``. + + .. versionadded:: 1.2.0 + + Returns + ------- + Series or DataFrame + Return type is determined by the caller. 
+ """ + if maybe_use_numba(engine): + groupby_ewma_func = generate_numba_groupby_ewma_func( + engine_kwargs, + self.com, + self.adjust, + self.ignore_na, + ) + return self._apply( + groupby_ewma_func, + numba_cache_key=(lambda x: x, "groupby_ewma"), + ) + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return x.mean() + + return self._groupby.apply(f) + else: + raise ValueError("engine must be either 'numba' or 'cython'") diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index bbc19fad8b799..94875ba86db65 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,14 +1,18 @@ from textwrap import dedent -from typing import Dict, Optional +from typing import Any, Callable, Dict, Optional, Tuple, Union +import numpy as np + +from pandas._typing import FrameOrSeries from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc -from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs -from pandas.core.window.rolling import _Rolling_and_Expanding +from pandas.core.window.common import _doc_template, _shared_docs +from pandas.core.window.indexers import BaseIndexer, ExpandingIndexer, GroupbyIndexer +from pandas.core.window.rolling import BaseWindowGroupby, RollingAndExpandingMixin -class Expanding(_Rolling_and_Expanding): +class Expanding(RollingAndExpandingMixin): """ Provide expanding transformations. @@ -57,16 +61,24 @@ class Expanding(_Rolling_and_Expanding): _attributes = ["min_periods", "center", "axis"] - def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): + def __init__(self, obj, min_periods=1, center=None, axis=0, **kwargs): super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) @property def _constructor(self): return Expanding - def _get_window(self, other=None, **kwargs): + def _get_window_indexer(self) -> BaseIndexer: + """ + Return an indexer class that will compute the window start and end bounds """ - Get the window length over which to perform some operation. + return ExpandingIndexer() + + def _get_cov_corr_window( + self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None, **kwargs + ) -> int: + """ + Get the window length over which to perform cov and corr operations. 
Parameters ---------- @@ -117,7 +129,6 @@ def _get_window(self, other=None, **kwargs): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/Dataframe", axis="", ) @@ -128,19 +139,19 @@ def aggregate(self, func, *args, **kwargs): @Substitution(name="expanding") @Appender(_shared_docs["count"]) - def count(self, **kwargs): - return super().count(**kwargs) + def count(self): + return super().count() @Substitution(name="expanding") @Appender(_shared_docs["apply"]) def apply( self, - func, + func: Callable[..., Any], raw: bool = False, - engine: str = "cython", + engine: Optional[str] = None, engine_kwargs: Optional[Dict[str, bool]] = None, - args=None, - kwargs=None, + args: Optional[Tuple[Any, ...]] = None, + kwargs: Optional[Dict[str, Any]] = None, ): return super().apply( func, @@ -183,16 +194,21 @@ def median(self, **kwargs): @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["std"]) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["var"]) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): nv.validate_expanding_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) + @Substitution(name="expanding") + @Appender(_shared_docs["sem"]) + def sem(self, ddof: int = 1, *args, **kwargs): + return super().sem(ddof=ddof, **kwargs) + @Substitution(name="expanding", func_name="skew") @Appender(_doc_template) @Appender(_shared_docs["skew"]) @@ -240,20 +256,41 @@ def quantile(self, quantile, interpolation="linear", **kwargs): @Substitution(name="expanding", func_name="cov") @Appender(_doc_template) @Appender(_shared_docs["cov"]) - def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + def cov( + self, + other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + pairwise: Optional[bool] = None, + ddof: int = 1, + **kwargs, + ): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) @Substitution(name="expanding") @Appender(_shared_docs["corr"]) - def corr(self, other=None, pairwise=None, **kwargs): + def corr( + self, + other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + pairwise: Optional[bool] = None, + **kwargs, + ): return super().corr(other=other, pairwise=pairwise, **kwargs) -class ExpandingGroupby(WindowGroupByMixin, Expanding): +class ExpandingGroupby(BaseWindowGroupby, Expanding): """ Provide a expanding groupby implementation. 
""" - @property - def _constructor(self): - return Expanding + def _get_window_indexer(self) -> GroupbyIndexer: + """ + Return an indexer class that will compute the window start and end bounds + + Returns + ------- + GroupbyIndexer + """ + window_indexer = GroupbyIndexer( + groupby_indicies=self._groupby.indices, + window_indexer=ExpandingIndexer, + ) + return window_indexer diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0898836ed2e0e..a3b9695d777d9 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,12 +1,14 @@ """Indexer objects for computing start/end window bounds for rolling operations""" from datetime import timedelta -from typing import Dict, Optional, Tuple, Type, Union +from typing import Dict, Optional, Tuple, Type import numpy as np from pandas._libs.window.indexers import calculate_variable_window_bounds from pandas.util._decorators import Appender +from pandas.core.dtypes.common import ensure_platform_int + from pandas.tseries.offsets import Nano get_window_bounds_doc = """ @@ -38,7 +40,7 @@ class BaseIndexer: """Base class for window bounds calculations.""" def __init__( - self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs ): """ Parameters @@ -76,17 +78,21 @@ def get_window_bounds( closed: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - start_s = np.zeros(self.window_size, dtype="int64") - start_e = ( - np.arange(self.window_size, num_values, dtype="int64") - - self.window_size - + 1 - ) - start = np.concatenate([start_s, start_e])[:num_values] + if center: + offset = (self.window_size - 1) // 2 + else: + offset = 0 + + end = np.arange(1 + offset, num_values + 1 + offset, dtype="int64") + start = end - self.window_size + if closed in ["left", "both"]: + start -= 1 + if closed in ["left", "neither"]: + end -= 1 + + end = np.clip(end, 0, num_values) + start = np.clip(start, 0, num_values) - end_s = np.arange(self.window_size, dtype="int64") + 1 - end_e = start_e + self.window_size - end = np.concatenate([end_s, end_e])[:num_values] return start, end @@ -103,7 +109,7 @@ def get_window_bounds( ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array, + num_values, self.window_size, min_periods, center, closed, self.index_array ) @@ -257,26 +263,42 @@ def get_window_bounds( return start, end -class GroupbyRollingIndexer(BaseIndexer): +class GroupbyIndexer(BaseIndexer): """Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()""" def __init__( self, - index_array: Optional[np.ndarray], - window_size: int, - groupby_indicies: Dict, - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]], + index_array: Optional[np.ndarray] = None, + window_size: int = 0, + groupby_indicies: Optional[Dict] = None, + window_indexer: Type[BaseIndexer] = BaseIndexer, + indexer_kwargs: Optional[Dict] = None, **kwargs, ): """ Parameters ---------- + index_array : np.ndarray or None + np.ndarray of the index of the original object that we are performing + a chained groupby operation over. 
This index has been pre-sorted relative to + the groups + window_size : int + window size during the windowing operation + groupby_indicies : dict or None + dict of {group label: [positional index of rows belonging to the group]} + window_indexer : BaseIndexer + BaseIndexer class determining the start and end bounds of each group + indexer_kwargs : dict or None + Custom kwargs to be passed to window_indexer **kwargs : keyword arguments that will be available when get_window_bounds is called """ - self.groupby_indicies = groupby_indicies - self.rolling_indexer = rolling_indexer - super().__init__(index_array, window_size, **kwargs) + self.groupby_indicies = groupby_indicies or {} + self.window_indexer = window_indexer + self.indexer_kwargs = indexer_kwargs or {} + super().__init__( + index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs + ) @Appender(get_window_bounds_doc) def get_window_bounds( @@ -292,31 +314,48 @@ def get_window_bounds( start_arrays = [] end_arrays = [] window_indicies_start = 0 - for key, indicies in self.groupby_indicies.items(): + for key, indices in self.groupby_indicies.items(): if self.index_array is not None: - index_array = self.index_array.take(indicies) + index_array = self.index_array.take(ensure_platform_int(indices)) else: index_array = self.index_array - indexer = self.rolling_indexer( - index_array=index_array, window_size=self.window_size, + indexer = self.window_indexer( + index_array=index_array, + window_size=self.window_size, + **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( - len(indicies), min_periods, center, closed + len(indices), min_periods, center, closed ) start = start.astype(np.int64) end = end.astype(np.int64) # Cannot use groupby_indicies as they might not be monotonic with the object # we're rolling over window_indicies = np.arange( - window_indicies_start, window_indicies_start + len(indicies), + window_indicies_start, window_indicies_start + len(indices) ) - window_indicies_start += len(indicies) + window_indicies_start += len(indices) # Extend as we'll be slicing window like [start, end) window_indicies = np.append( window_indicies, [window_indicies[-1] + 1] ).astype(np.int64) - start_arrays.append(window_indicies.take(start)) - end_arrays.append(window_indicies.take(end)) + start_arrays.append(window_indicies.take(ensure_platform_int(start))) + end_arrays.append(window_indicies.take(ensure_platform_int(end))) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) return start, end + + +class ExponentialMovingWindowIndexer(BaseIndexer): + """Calculate ewm window bounds (the entire window)""" + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index 5d35ec7457ab0..274586e1745b5 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -6,7 +6,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.core.util.numba_ import ( - check_kwargs_and_nopython, + NUMBA_FUNC_CACHE, get_jit_arguments, jit_user_function, ) @@ -42,14 +42,14 @@ def generate_numba_apply_func( ------- Numba function """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + nopython, nogil, parallel = get_jit_arguments(engine_kwargs, 
kwargs) - check_kwargs_and_nopython(kwargs, nopython) + cache_key = (func, "rolling_apply") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] numba_func = jit_user_function(func, nopython, nogil, parallel) - numba = import_optional_dependency("numba") - if parallel: loop_range = numba.prange else: @@ -57,7 +57,7 @@ def generate_numba_apply_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( - values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int ) -> np.ndarray: result = np.empty(len(begin)) for i in loop_range(len(result)): @@ -72,3 +72,92 @@ def roll_apply( return result return roll_apply + + +def generate_numba_groupby_ewma_func( + engine_kwargs: Optional[Dict[str, bool]], + com: float, + adjust: bool, + ignore_na: bool, +): + """ + Generate a numba jitted groupby ewma function specified by values + from engine_kwargs. + + Parameters + ---------- + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + com : float + adjust : bool + ignore_na : bool + + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + cache_key = (lambda x: x, "groupby_ewma") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] + + numba = import_optional_dependency("numba") + if parallel: + loop_range = numba.prange + else: + loop_range = range + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def groupby_ewma( + values: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(values)) + alpha = 1.0 / (1.0 + com) + for i in loop_range(len(begin)): + start = begin[i] + stop = end[i] + window = values[start:stop] + sub_result = np.empty(len(window)) + + old_wt_factor = 1.0 - alpha + new_wt = 1.0 if adjust else alpha + + weighted_avg = window[0] + nobs = int(not np.isnan(weighted_avg)) + sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan + old_wt = 1.0 + + for j in range(1, len(window)): + cur = window[j] + is_observation = not np.isnan(cur) + nobs += is_observation + if not np.isnan(weighted_avg): + + if is_observation or not ignore_na: + + old_wt *= old_wt_factor + if is_observation: + + # avoid numerical errors on constant series + if weighted_avg != cur: + weighted_avg = ( + (old_wt * weighted_avg) + (new_wt * cur) + ) / (old_wt + new_wt) + if adjust: + old_wt += new_wt + else: + old_wt = 1.0 + elif is_observation: + weighted_avg = cur + + sub_result[j] = weighted_avg if nobs >= minimum_periods else np.nan + + result[start:stop] = sub_result + + return result + + return groupby_ewma diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8cb53ebd92214..e6185f8ae0679 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -6,13 +6,25 @@ from functools import partial import inspect from textwrap import dedent -from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Set, + Tuple, + Type, + Union, +) +import warnings import numpy as np from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import Axis, FrameOrSeries, Scalar +from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion from pandas.compat._optional import 
import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -34,104 +46,36 @@ ABCSeries, ABCTimedeltaIndex, ) +from pandas.core.dtypes.missing import notna -from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin -import pandas.core.common as com +from pandas.core.aggregation import aggregate +from pandas.core.base import DataError, SelectionMixin from pandas.core.construction import extract_array -from pandas.core.indexes.api import Index, MultiIndex, ensure_index -from pandas.core.util.numba_ import NUMBA_FUNC_CACHE +from pandas.core.groupby.base import GotItemMixin, ShallowMixin +from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( - WindowGroupByMixin, _doc_template, - _flex_binary_moment, _shared_docs, + flex_binary_moment, zsqrt, ) from pandas.core.window.indexers import ( BaseIndexer, FixedWindowIndexer, - GroupbyRollingIndexer, + GroupbyIndexer, VariableWindowIndexer, ) from pandas.core.window.numba_ import generate_numba_apply_func +if TYPE_CHECKING: + from pandas import DataFrame, Series + from pandas.core.internals import Block # noqa:F401 -def calculate_center_offset(window) -> int: - """ - Calculate an offset necessary to have the window label to be centered. - - Parameters - ---------- - window: ndarray or int - window weights or window - - Returns - ------- - int - """ - if not is_integer(window): - window = len(window) - return int((window - 1) / 2.0) - - -def calculate_min_periods( - window: int, - min_periods: Optional[int], - num_values: int, - required_min_periods: int, - floor: int, -) -> int: - """ - Calculate final minimum periods value for rolling aggregations. - Parameters - ---------- - window : passed window value - min_periods : passed min periods value - num_values : total number of values - required_min_periods : required min periods per aggregation function - floor : required min periods per aggregation function +class BaseWindow(ShallowMixin, SelectionMixin): + """Provides utilities for performing windowing operations.""" - Returns - ------- - min_periods : int - """ - if min_periods is None: - min_periods = window - else: - min_periods = max(required_min_periods, min_periods) - if min_periods > window: - raise ValueError(f"min_periods {min_periods} must be <= window {window}") - elif min_periods > num_values: - min_periods = num_values + 1 - elif min_periods < 0: - raise ValueError("min_periods must be >= 0") - return max(min_periods, floor) - - -def get_weighted_roll_func(cfunc: Callable) -> Callable: - """ - Wrap weighted rolling cython function with min periods argument. 
- - Parameters - ---------- - cfunc : function - Cython weighted rolling function - - Returns - ------- - function - """ - - def func(arg, window, min_periods=None): - if min_periods is None: - min_periods = len(window) - return cfunc(arg, window, min_periods) - - return func - - -class _Window(PandasObject, ShallowMixin, SelectionMixin): _attributes: List[str] = [ "window", "min_periods", @@ -145,7 +89,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj, + obj: FrameOrSeries, window=None, min_periods: Optional[int] = None, center: bool = False, @@ -168,10 +112,6 @@ def __init__( self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() - @property - def _constructor(self): - return Window - @property def is_datetimelike(self) -> Optional[bool]: return None @@ -187,8 +127,15 @@ def is_freq_type(self) -> bool: def validate(self) -> None: if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if self.min_periods is not None and not is_integer(self.min_periods): - raise ValueError("min_periods must be an integer") + if self.min_periods is not None: + if not is_integer(self.min_periods): + raise ValueError("min_periods must be an integer") + elif self.min_periods < 0: + raise ValueError("min_periods must be >= 0") + elif is_integer(self.window) and self.min_periods > self.window: + raise ValueError( + f"min_periods {self.min_periods} must be <= window {self.window}" + ) if self.closed is not None and self.closed not in [ "right", "both", @@ -199,27 +146,21 @@ def validate(self) -> None: if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): raise TypeError(f"invalid type: {type(self)}") if isinstance(self.window, BaseIndexer): - self._validate_get_window_bounds_signature(self.window) - - @staticmethod - def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: - """ - Validate that the passed BaseIndexer subclass has - a get_window_bounds with the correct signature. - """ - get_window_bounds_signature = inspect.signature( - window.get_window_bounds - ).parameters.keys() - expected_signature = inspect.signature( - BaseIndexer().get_window_bounds - ).parameters.keys() - if get_window_bounds_signature != expected_signature: - raise ValueError( - f"{type(window).__name__} does not implement the correct signature for " - f"get_window_bounds" - ) + # Validate that the passed BaseIndexer subclass has + # a get_window_bounds with the correct signature. + get_window_bounds_signature = inspect.signature( + self.window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(self.window).__name__} does not implement " + f"the correct signature for get_window_bounds" + ) - def _create_blocks(self, obj: FrameOrSeries): + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. """ @@ -227,9 +168,14 @@ def _create_blocks(self, obj: FrameOrSeries): if self.on is not None and not isinstance(self.on, Index): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - blocks = obj._to_dict_of_blocks(copy=False).values() - - return blocks, obj + if self.axis == 1: + # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything + # to float to calculate the complete row at once. We exclude all non-numeric + # dtypes. 
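+            # Casting everything to one float64 dtype (and consolidating
+            # below) leaves a single block, so each row is one contiguous
+            # array.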
+ obj = obj.select_dtypes(include=["integer", "float"], exclude=["timedelta"]) + obj = obj.astype("float64", copy=False) + obj._mgr = obj._mgr.consolidate() + return obj def _gotitem(self, key, ndim, subset=None): """ @@ -266,38 +212,21 @@ def __getattr__(self, attr: str): def _dir_additions(self): return self.obj._dir_additions() - def _get_win_type(self, kwargs: Dict): - """ - Exists for compatibility, overridden by subclass Window. - - Parameters - ---------- - kwargs : dict - ignored, exists for compatibility - - Returns - ------- - None - """ - return None - - def _get_window(self, other=None, win_type: Optional[str] = None) -> int: + def _get_cov_corr_window( + self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None + ) -> Optional[Union[int, timedelta, BaseOffset, BaseIndexer]]: """ Return window length. Parameters ---------- other : - ignored, exists for compatibility - win_type : - ignored, exists for compatibility + Used in Expanding Returns ------- window : int """ - if isinstance(self.window, BaseIndexer): - return self.min_periods or 0 return self.window @property @@ -317,11 +246,10 @@ def __repr__(self) -> str: return f"{self._window_type} [{attrs}]" def __iter__(self): - window = self._get_window(win_type=None) - blocks, obj = self._create_blocks(self._selected_obj) - index = self._get_window_indexer(window=window) + obj = self._create_data(self._selected_obj) + indexer = self._get_window_indexer() - start, end = index.get_window_bounds( + start, end = indexer.get_window_bounds( num_values=len(obj), min_periods=self.min_periods, center=self.center, @@ -363,91 +291,32 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: return values - def _wrap_result(self, result, block=None, obj=None): - """ - Wrap a single result. - """ - if obj is None: - obj = self._selected_obj - index = obj.index - - if isinstance(result, np.ndarray): - - if result.ndim == 1: - from pandas import Series - - return Series(result, index, name=obj.name) - - return type(obj)(result, index=index, columns=block.columns) - return result - - def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: - """ - Wrap the results. 
- - Parameters - ---------- - results : list of ndarrays - blocks : list of blocks - obj : conformed data (may be resampled) - exclude: list of columns to exclude, default to None - """ - from pandas import Series, concat - - final = [] - for result, block in zip(results, blocks): - - result = self._wrap_result(result, block=block, obj=obj) - if result.ndim == 1: - return result - final.append(result) + def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): + # if we have an 'on' column we want to put it back into + # the results in the same location + from pandas import Series - # if we have an 'on' column - # we want to put it back into the results - # in the same location - columns = self._selected_obj.columns if self.on is not None and not self._on.equals(obj.index): - name = self._on.name - final.append(Series(self._on, index=obj.index, name=name)) - - if self._selection is not None: - - selection = ensure_index(self._selection) - - # need to reorder to include original location of - # the on column (if its not already there) - if name not in selection: - columns = self.obj.columns - indexer = columns.get_indexer(selection.tolist() + [name]) - columns = columns.take(sorted(indexer)) - - # exclude nuisance columns so that they are not reindexed - if exclude is not None and exclude: - columns = [c for c in columns if c not in exclude] - - if not columns: - raise DataError("No numeric types to aggregate") - - if not len(final): - return obj.astype("float64") - return concat(final, axis=1).reindex(columns=columns, copy=False) - - def _center_window(self, result, window) -> np.ndarray: - """ - Center the result in the window. - """ - if self.axis > result.ndim - 1: - raise ValueError("Requested axis is larger then no. of argument dimensions") - - offset = calculate_center_offset(window) - if offset > 0: - lead_indexer = [slice(None)] * result.ndim - lead_indexer[self.axis] = slice(offset, None) - result = np.copy(result[tuple(lead_indexer)]) - return result + extra_col = Series(self._on, index=self.obj.index, name=name) + if name in result.columns: + # TODO: sure we want to overwrite results? + result[name] = extra_col + elif name in result.index.names: + pass + elif name in self._selected_obj.columns: + # insert in the same location as we had in _selected_obj + old_cols = self._selected_obj.columns + new_cols = result.columns + old_loc = old_cols.get_loc(name) + overlap = new_cols.intersection(old_cols[:old_loc]) + new_loc = len(overlap) + result.insert(new_loc, name, extra_col) + else: + # insert at the end + result[name] = extra_col - def _get_roll_func(self, func_name: str) -> Callable: + def _get_roll_func(self, func_name: str) -> Callable[..., Any]: """ Wrap rolling function to check values passed. @@ -467,35 +336,82 @@ def _get_roll_func(self, func_name: str) -> Callable: ) return window_func - def _get_cython_func_type(self, func: str) -> Callable: - """ - Return a variable or fixed cython function type. - - Variable algorithms do not use window while fixed do. - """ - if self.is_freq_type or isinstance(self.window, BaseIndexer): - return self._get_roll_func(f"{func}_variable") - return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) + @property + def _index_array(self): + # TODO: why do we get here with e.g. MultiIndex? 
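Background for the ``_index_array`` property defined above: datetime-like values reach the window kernels as int64 nanoseconds via ``asi8``, for example:

>>> idx = pd.date_range("2020-01-01", periods=2, freq="s")
>>> idx.asi8
array([1577836800000000000, 1577836801000000000])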
+ if needs_i8_conversion(self._on.dtype): + return self._on.asi8 + return None - def _get_window_indexer(self, window: int) -> BaseIndexer: + def _get_window_indexer(self) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ if isinstance(self.window, BaseIndexer): return self.window if self.is_freq_type: - return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) - return FixedWindowIndexer(window_size=window) + return VariableWindowIndexer( + index_array=self._index_array, window_size=self.window + ) + return FixedWindowIndexer(window_size=self.window) + + def _apply_series( + self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None + ) -> "Series": + """ + Series version of _apply_blockwise + """ + obj = self._create_data(self._selected_obj) + + try: + # GH 12541: Special case for count where we support date-like types + input = obj.values if name != "count" else notna(obj.values).astype(int) + values = self._prep_values(input) + except (TypeError, NotImplementedError) as err: + raise DataError("No numeric types to aggregate") from err + + result = homogeneous_func(values) + return obj._constructor(result, index=obj.index, name=obj.name) + + def _apply_blockwise( + self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None + ) -> FrameOrSeriesUnion: + """ + Apply the given function to the DataFrame broken down into homogeneous + sub-frames. + """ + if self._selected_obj.ndim == 1: + return self._apply_series(homogeneous_func, name) + + obj = self._create_data(self._selected_obj) + if name == "count": + # GH 12541: Special case for count where we support date-like types + obj = notna(obj).astype(int) + obj._mgr = obj._mgr.consolidate() + mgr = obj._mgr + + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(EA2D): getattr unnecessary with 2D EAs + values = self._prep_values(getattr(bvalues, "T", bvalues)) + res_values = homogeneous_func(values) + return getattr(res_values, "T", res_values) + + new_mgr = mgr.apply(hfunc, ignore_failures=True) + out = obj._constructor(new_mgr) + + if out.shape[1] == 0 and obj.shape[1] > 0: + raise DataError("No numeric types to aggregate") + elif out.shape[1] == 0: + return obj.astype("float64") + + self._insert_on_column(out, obj) + return out def _apply( self, - func: Callable, - center: bool, - require_min_periods: int = 0, - floor: int = 1, - is_weighted: bool = False, + func: Callable[..., Any], name: Optional[str] = None, - use_numba_cache: bool = False, + numba_cache_key: Optional[Tuple[Callable, str]] = None, **kwargs, ): """ @@ -506,15 +422,9 @@ def _apply( Parameters ---------- func : callable function to apply - center : bool - require_min_periods : int - floor : int - is_weighted : bool name : str, - compatibility with groupby.rolling - use_numba_cache : bool - whether to cache a numba compiled function. 
Only available for numba - enabled methods (so far only apply) + numba_cache_key : tuple + caching key to be used to store a compiled numba func **kwargs additional arguments for rolling function and window function @@ -522,64 +432,27 @@ def _apply( ------- y : type of input """ - win_type = self._get_win_type(kwargs) - window = self._get_window(win_type=win_type) - - blocks, obj = self._create_blocks(self._selected_obj) - block_list = list(blocks) - window_indexer = self._get_window_indexer(window) - - results = [] - exclude: List[Scalar] = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError) as err: - if isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") from err - - if values.size == 0: - results.append(values.copy()) - continue + window_indexer = self._get_window_indexer() + min_periods = ( + self.min_periods + if self.min_periods is not None + else window_indexer.window_size + ) + def homogeneous_func(values: np.ndarray): # calculation function - offset = calculate_center_offset(window) if center else 0 - additional_nans = np.array([np.nan] * offset) - - if not is_weighted: - - def calc(x): - x = np.concatenate((x, additional_nans)) - if not isinstance(self.window, BaseIndexer): - min_periods = calculate_min_periods( - window, self.min_periods, len(x), require_min_periods, floor - ) - else: - min_periods = calculate_min_periods( - window_indexer.window_size, - self.min_periods, - len(x), - require_min_periods, - floor, - ) - start, end = window_indexer.get_window_bounds( - num_values=len(x), - min_periods=self.min_periods, - center=self.center, - closed=self.closed, - ) - return func(x, start, end, min_periods) - else: - - def calc(x): - x = np.concatenate((x, additional_nans)) - return func(x, window, self.min_periods) + if values.size == 0: + return values.copy() + + def calc(x): + start, end = window_indexer.get_window_bounds( + num_values=len(x), + min_periods=min_periods, + center=self.center, + closed=self.closed, + ) + return func(x, start, end, min_periods) with np.errstate(all="ignore"): if values.ndim > 1: @@ -588,18 +461,15 @@ def calc(x): result = calc(values) result = np.asarray(result) - if use_numba_cache: - NUMBA_FUNC_CACHE[(kwargs["original_func"], "rolling_apply")] = func - - if center: - result = self._center_window(result, window) + if numba_cache_key is not None: + NUMBA_FUNC_CACHE[numba_cache_key] = func - results.append(result) + return result - return self._wrap_results(results, block_list, obj, exclude) + return self._apply_blockwise(homogeneous_func, name) def aggregate(self, func, *args, **kwargs): - result, how = self._aggregate(func, *args, **kwargs) + result, how = aggregate(self, func, *args, **kwargs) if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -850,7 +720,133 @@ def aggregate(self, func, *args, **kwargs): ) -class Window(_Window): +def dispatch(name: str, *args, **kwargs): + """ + Dispatch to groupby apply. + """ + + def outer(self, *args, **kwargs): + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return getattr(x, name)(*args, **kwargs) + + return self._groupby.apply(f) + + outer.__name__ = name + return outer + + +class BaseWindowGroupby(GotItemMixin, BaseWindow): + """ + Provide the groupby windowing facilities. 
+ """
+
+ def __init__(self, obj, *args, **kwargs):
+ kwargs.pop("parent", None)
+ groupby = kwargs.pop("groupby", None)
+ if groupby is None:
+ groupby, obj = obj, obj._selected_obj
+ self._groupby = groupby
+ self._groupby.mutated = True
+ self._groupby.grouper.mutated = True
+ super().__init__(obj, *args, **kwargs)
+
+ corr = dispatch("corr", other=None, pairwise=None)
+ cov = dispatch("cov", other=None, pairwise=None)
+
+ def _apply(
+ self,
+ func: Callable[..., Any],
+ name: Optional[str] = None,
+ numba_cache_key: Optional[Tuple[Callable, str]] = None,
+ **kwargs,
+ ) -> FrameOrSeries:
+ result = super()._apply(
+ func,
+ name,
+ numba_cache_key,
+ **kwargs,
+ )
+ # Reconstruct the resulting MultiIndex from tuples
+ # 1st set of levels = group by labels
+ # 2nd set of levels = original index
+ # Ignore 2nd set of levels if a group by label includes an index level
+ result_index_names = [
+ grouping.name for grouping in self._groupby.grouper._groupings
+ ]
+ grouped_object_index = None
+
+ column_keys = [
+ key
+ for key in result_index_names
+ if key not in self.obj.index.names or key is None
+ ]
+
+ if len(column_keys) == len(result_index_names):
+ grouped_object_index = self.obj.index
+ grouped_index_name = [*grouped_object_index.names]
+ result_index_names += grouped_index_name
+ else:
+ # Our result will have still kept the column in the result
+ result = result.drop(columns=column_keys, errors="ignore")
+
+ codes = self._groupby.grouper.codes
+ levels = self._groupby.grouper.levels
+
+ group_indices = self._groupby.grouper.indices.values()
+ if group_indices:
+ indexer = np.concatenate(list(group_indices))
+ else:
+ indexer = np.array([], dtype=np.intp)
+ codes = [c.take(indexer) for c in codes]
+
+ # if the index of the original dataframe needs to be preserved, append
+ # this index (but reordered) to the codes/levels from the groupby
+ if grouped_object_index is not None:
+ idx = grouped_object_index.take(indexer)
+ if not isinstance(idx, MultiIndex):
+ idx = MultiIndex.from_arrays([idx])
+ codes.extend(list(idx.codes))
+ levels.extend(list(idx.levels))
+
+ result_index = MultiIndex(
+ levels, codes, names=result_index_names, verify_integrity=False
+ )
+
+ result.index = result_index
+ return result
+
+ def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries:
+ """
+ Split data into blocks & return conformed data.
+ """
+ # Ensure the object we're rolling over is monotonically sorted relative
+ # to the groups
+ # GH 36197
+ if not obj.empty:
+ groupby_order = np.concatenate(
+ list(self._groupby.grouper.indices.values())
+ ).astype(np.int64)
+ obj = obj.take(groupby_order)
+ return super()._create_data(obj)
+
+ def _gotitem(self, key, ndim, subset=None):
+ # we are setting the index on the actual object
+ # here so our index is carried through to the selected obj
+ # when we do the splitting for the groupby
+ if self.on is not None:
+ self.obj = self.obj.set_index(self._on)
+ self.on = None
+ return super()._gotitem(key, ndim, subset=subset)
+
+ def _validate_monotonic(self):
+ """
+ Validate that "on" is monotonic; already validated at a higher level.
+ """
+ pass
+
+
+class Window(BaseWindow):
 """
 Provide rolling window calculations.
@@ -886,10 +882,11 @@ class Window(_Window):
 axis : int or str, default 0
 closed : str, default None
 Make the interval closed on the 'right', 'left', 'both' or
- 'neither' endpoints.
- For offset-based windows, it defaults to 'right'.
- For fixed windows, defaults to 'both'. Remaining cases not implemented
- for fixed windows.
+ 'neither' endpoints. Defaults to 'right'.
+
+ .. versionchanged:: 1.2.0
+
+ The closed parameter with fixed windows is now supported.
 Returns
 -------
@@ -908,30 +905,14 @@ class Window(_Window):
 To learn more about the offsets & frequency strings, please see `this link
 `__.
- The recognized win_types are:
-
- * ``boxcar``
- * ``triang``
- * ``blackman``
- * ``hamming``
- * ``bartlett``
- * ``parzen``
- * ``bohman``
- * ``blackmanharris``
- * ``nuttall``
- * ``barthann``
- * ``kaiser`` (needs parameter: beta)
- * ``gaussian`` (needs parameter: std)
- * ``general_gaussian`` (needs parameters: power, width)
- * ``slepian`` (needs parameter: width)
- * ``exponential`` (needs parameter: tau), center is set to None.
-
- If ``win_type=None`` all points are evenly weighted. To learn more about
- different window types see `scipy.signal window functions
- `__.
-
- Certain window types require additional parameters to be passed. Please see
- the third example below on how to add the additional parameters.
+ If ``win_type=None``, all points are evenly weighted; otherwise, ``win_type``
+ can accept a string of any `scipy.signal window function
+ `__.
+
+ Certain Scipy window types require additional parameters to be passed
+ in the aggregation function. The additional parameters must match
+ the keywords specified in the Scipy window type method signature.
+ Please see the third example below on how to add the additional parameters.
 Examples
 --------
@@ -1028,101 +1009,99 @@ class Window(_Window):
 2013-01-01 09:00:06 4.0
 """
+ @property
+ def _constructor(self):
+ return Window
+
 def validate(self):
 super().validate()
- window = self.window
- if isinstance(window, BaseIndexer):
+ if isinstance(self.window, BaseIndexer):
 raise NotImplementedError(
 "BaseIndexer subclasses not implemented with win_types."
 )
- elif isinstance(window, (list, tuple, np.ndarray)):
- pass
- elif is_integer(window):
- if window <= 0:
+ elif is_integer(self.window):
+ if self.window <= 0:
 raise ValueError("window must be > 0 ")
- import_optional_dependency(
- "scipy", extra="Scipy is required to generate window weight."
+ sig = import_optional_dependency(
+ "scipy.signal", extra="Scipy is required to generate window weight."
 )
- import scipy.signal as sig
-
 if not isinstance(self.win_type, str):
 raise ValueError(f"Invalid win_type {self.win_type}")
 if getattr(sig, self.win_type, None) is None:
 raise ValueError(f"Invalid win_type {self.win_type}")
 else:
- raise ValueError(f"Invalid window {window}")
- def _get_win_type(self, kwargs: Dict) -> Union[str, Tuple]:
+ raise ValueError(f"Invalid window {self.window}")
+ def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray:
+ """
+ Center the result in the window for weighted rolling aggregations.
 """
- Extract arguments for the window type, provide validation for it
- and return the validated window type.
+ if self.axis > result.ndim - 1:
+ raise ValueError("Requested axis is larger than no.
of argument dimensions")
- Parameters
- ----------
- kwargs : dict
+ if offset > 0:
+ lead_indexer = [slice(None)] * result.ndim
+ lead_indexer[self.axis] = slice(offset, None)
+ result = np.copy(result[tuple(lead_indexer)])
+ return result
- Returns
- -------
- win_type : str, or tuple
- """
- # the below may pop from kwargs
- def _validate_win_type(win_type, kwargs):
- arg_map = {
- "kaiser": ["beta"],
- "gaussian": ["std"],
- "general_gaussian": ["power", "width"],
- "slepian": ["width"],
- "exponential": ["tau"],
- }
-
- if win_type in arg_map:
- win_args = _pop_args(win_type, arg_map[win_type], kwargs)
- if win_type == "exponential":
- # exponential window requires the first arg (center)
- # to be set to None (necessary for symmetric window)
- win_args.insert(0, None)
-
- return tuple([win_type] + win_args)
-
- return win_type
-
- def _pop_args(win_type, arg_names, kwargs):
- all_args = []
- for n in arg_names:
- if n not in kwargs:
- raise ValueError(f"{win_type} window requires {n}")
- all_args.append(kwargs.pop(n))
- return all_args
-
- return _validate_win_type(self.win_type, kwargs)
-
- def _get_window(
- self, other=None, win_type: Optional[Union[str, Tuple]] = None
- ) -> np.ndarray:
+ def _apply(
+ self,
+ func: Callable[[np.ndarray, int, int], np.ndarray],
+ name: Optional[str] = None,
+ numba_cache_key: Optional[Tuple[Callable, str]] = None,
+ **kwargs,
+ ):
 """
- Get the window, weights.
+ Rolling with weights statistical measure using supplied function.
+
+ Designed to be used with passed-in Cython array-based functions.
 Parameters
 ----------
- other :
- ignored, exists for compatibility
- win_type : str, or tuple
- type of window to create
+ func : callable function to apply
+ name : str,
+ numba_cache_key : tuple
+ unused
+ **kwargs
+ additional arguments for scipy windows if necessary
 Returns
 -------
- window : ndarray
- the window, weights
+ y : type of input
 """
- window = self.window
- if isinstance(window, (list, tuple, np.ndarray)):
- return com.asarray_tuplesafe(window).astype(float)
- elif is_integer(window):
- import scipy.signal as sig
+ signal = import_optional_dependency(
+ "scipy.signal", extra="Scipy is required to generate window weight."
+ )
+ assert self.win_type is not None # for mypy
+ window = getattr(signal, self.win_type)(self.window, **kwargs)
+ offset = (len(window) - 1) // 2 if self.center else 0
- # GH #15662. `False` makes symmetric window, rather than periodic.
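Since the weighted ``_apply`` above looks ``win_type`` up with ``getattr`` on ``scipy.signal``, any window constructor exposed there can supply the weights; a small illustration, assuming scipy is installed:

>>> from scipy import signal
>>> signal.triang(3)
array([0.5, 1. , 0.5])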
- return sig.get_window(win_type, window, False).astype(float) + def homogeneous_func(values: np.ndarray): + # calculation function + + if values.size == 0: + return values.copy() + + def calc(x): + additional_nans = np.array([np.nan] * offset) + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods or len(window)) + + with np.errstate(all="ignore"): + if values.ndim > 1: + result = np.apply_along_axis(calc, self.axis, values) + else: + result = calc(values) + result = np.asarray(result) + + if self.center: + result = self._center_window(result, offset) + + return result + + return self._apply_blockwise(homogeneous_func, name) _agg_see_also_doc = dedent( """ @@ -1156,12 +1135,11 @@ def _get_window( _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/DataFrame", axis="", ) def aggregate(self, func, *args, **kwargs): - result, how = self._aggregate(func, *args, **kwargs) + result, how = aggregate(self, func, *args, **kwargs) if result is None: # these must apply directly @@ -1176,46 +1154,31 @@ def aggregate(self, func, *args, **kwargs): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_weighted_sum") - window_func = get_weighted_roll_func(window_func) - return self._apply( - window_func, center=self.center, is_weighted=True, name="sum", **kwargs - ) + return self._apply(window_func, name="sum", **kwargs) @Substitution(name="window") @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = self._get_roll_func("roll_weighted_mean") - window_func = get_weighted_roll_func(window_func) - return self._apply( - window_func, center=self.center, is_weighted=True, name="mean", **kwargs - ) + return self._apply(window_func, name="mean", **kwargs) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["var"]) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) - window_func = get_weighted_roll_func(window_func) kwargs.pop("name", None) - return self._apply( - window_func, center=self.center, is_weighted=True, name="var", **kwargs - ) + return self._apply(window_func, name="var", **kwargs) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["std"]) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) -class _Rolling(_Window): - @property - def _constructor(self): - return Rolling - - -class _Rolling_and_Expanding(_Rolling): +class RollingAndExpandingMixin(BaseWindow): _shared_docs["count"] = dedent( r""" @@ -1258,25 +1221,8 @@ class _Rolling_and_Expanding(_Rolling): ) def count(self): - # GH 32865. 
Using count with custom BaseIndexer subclass - # implementations shouldn't end up here - assert not isinstance(self.window, BaseIndexer) - - blocks, obj = self._create_blocks(self._selected_obj) - results = [] - for b in blocks: - result = b.notna().astype(int) - result = self._constructor( - result, - window=self._get_window(), - min_periods=self.min_periods or 0, - center=self.center, - axis=self.axis, - closed=self.closed, - ).sum() - results.append(result) - - return self._wrap_results(results, blocks, obj) + window_func = self._get_roll_func("roll_sum") + return self._apply(window_func, name="count") _shared_docs["apply"] = dedent( r""" @@ -1298,10 +1244,11 @@ def count(self): objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - engine : str, default 'cython' + engine : str, default None * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` .. versionadded:: 1.0.0 @@ -1334,78 +1281,61 @@ def count(self): Notes ----- - See :ref:`stats.rolling_apply` for extended documentation and performance + See :ref:`window.numba_engine` for extended documentation and performance considerations for the Numba engine. """ ) def apply( self, - func, + func: Callable[..., Any], raw: bool = False, - engine: str = "cython", - engine_kwargs: Optional[Dict] = None, - args: Optional[Tuple] = None, - kwargs: Optional[Dict] = None, + engine: Optional[str] = None, + engine_kwargs: Optional[Dict[str, bool]] = None, + args: Optional[Tuple[Any, ...]] = None, + kwargs: Optional[Dict[str, Any]] = None, ): if args is None: args = () if kwargs is None: kwargs = {} - kwargs.pop("_level", None) - kwargs.pop("floor", None) + if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") - if engine == "cython": - if engine_kwargs is not None: - raise ValueError("cython engine does not accept engine_kwargs") - # Cython apply functions handle center, so don't need to use - # _apply's center handling - window = self._get_window() - offset = calculate_center_offset(window) if self.center else 0 - apply_func = self._generate_cython_apply_func( - args, kwargs, raw, offset, func - ) - center = False - elif engine == "numba": + numba_cache_key = None + if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") - cache_key = (func, "rolling_apply") - if cache_key in NUMBA_FUNC_CACHE: - # Return an already compiled version of roll_apply if available - apply_func = NUMBA_FUNC_CACHE[cache_key] - else: - apply_func = generate_numba_apply_func( - args, kwargs, func, engine_kwargs - ) - center = self.center + apply_func = generate_numba_apply_func(args, kwargs, func, engine_kwargs) + numba_cache_key = (func, "rolling_apply") + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func(args, kwargs, raw, func) else: raise ValueError("engine must be either 'numba' or 'cython'") - # name=func & raw=raw for WindowGroupByMixin._apply return self._apply( apply_func, - center=center, - floor=0, - name=func, - use_numba_cache=engine == "numba", - raw=raw, - original_func=func, - args=args, - kwargs=kwargs, + numba_cache_key=numba_cache_key, ) - def _generate_cython_apply_func(self, 
args, kwargs, raw, offset, func): + def _generate_cython_apply_func( + self, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + raw: bool, + function: Callable[..., Any], + ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]: from pandas import Series window_func = partial( - self._get_cython_func_type("roll_generic"), + self._get_roll_func("roll_apply"), args=args, kwargs=kwargs, raw=raw, - offset=offset, - func=func, + function=function, ) def apply_func(values, begin, end, min_periods, raw=raw): @@ -1417,11 +1347,8 @@ def apply_func(values, begin, end, min_periods, raw=raw): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - window_func = self._get_cython_func_type("roll_sum") - kwargs.pop("floor", None) - return self._apply( - window_func, center=self.center, floor=0, name="sum", **kwargs - ) + window_func = self._get_roll_func("roll_sum") + return self._apply(window_func, name="sum", **kwargs) _shared_docs["max"] = dedent( """ @@ -1436,8 +1363,8 @@ def sum(self, *args, **kwargs): def max(self, *args, **kwargs): nv.validate_window_func("max", args, kwargs) - window_func = self._get_cython_func_type("roll_max") - return self._apply(window_func, center=self.center, name="max", **kwargs) + window_func = self._get_roll_func("roll_max") + return self._apply(window_func, name="max", **kwargs) _shared_docs["min"] = dedent( """ @@ -1478,13 +1405,13 @@ def max(self, *args, **kwargs): def min(self, *args, **kwargs): nv.validate_window_func("min", args, kwargs) - window_func = self._get_cython_func_type("roll_min") - return self._apply(window_func, center=self.center, name="min", **kwargs) + window_func = self._get_roll_func("roll_min") + return self._apply(window_func, name="min", **kwargs) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - window_func = self._get_cython_func_type("roll_mean") - return self._apply(window_func, center=self.center, name="mean", **kwargs) + window_func = self._get_roll_func("roll_mean") + return self._apply(window_func, name="mean", **kwargs) _shared_docs["median"] = dedent( """ @@ -1527,37 +1454,27 @@ def median(self, **kwargs): window_func = self._get_roll_func("roll_median_c") # GH 32865. 
Move max window size calculation to # the median function implementation - return self._apply(window_func, center=self.center, name="median", **kwargs) + return self._apply(window_func, name="median", **kwargs) - def std(self, ddof=1, *args, **kwargs): + def std(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - kwargs.pop("require_min_periods", None) - window_func = self._get_cython_func_type("roll_var") + window_func = self._get_roll_func("roll_var") def zsqrt_func(values, begin, end, min_periods): return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) - # ddof passed again for compat with groupby.rolling return self._apply( zsqrt_func, - center=self.center, - require_min_periods=1, name="std", - ddof=ddof, **kwargs, ) - def var(self, ddof=1, *args, **kwargs): + def var(self, ddof: int = 1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) - kwargs.pop("require_min_periods", None) - window_func = partial(self._get_cython_func_type("roll_var"), ddof=ddof) - # ddof passed again for compat with groupby.rolling + window_func = partial(self._get_roll_func("roll_var"), ddof=ddof) return self._apply( window_func, - center=self.center, - require_min_periods=1, name="var", - ddof=ddof, **kwargs, ) @@ -1573,12 +1490,9 @@ def var(self, ddof=1, *args, **kwargs): """ def skew(self, **kwargs): - window_func = self._get_cython_func_type("roll_skew") - kwargs.pop("require_min_periods", None) + window_func = self._get_roll_func("roll_skew") return self._apply( window_func, - center=self.center, - require_min_periods=3, name="skew", **kwargs, ) @@ -1615,13 +1529,63 @@ def skew(self, **kwargs): """ ) + def sem(self, ddof: int = 1, *args, **kwargs): + return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) + + _shared_docs["sem"] = dedent( + """ + Compute %(name)s standard error of mean. + + Parameters + ---------- + + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + *args, **kwargs + For NumPy compatibility. No additional arguments are used. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + pandas.Series.%(name)s : Calling object with Series data. + pandas.DataFrame.%(name)s : Calling object with DataFrames. + pandas.Series.sem : Equivalent method for Series. + pandas.DataFrame.sem : Equivalent method for DataFrame. + + Notes + ----- + A minimum of one period is required for the rolling calculation. + + Examples + -------- + >>> s = pd.Series([0, 1, 2, 3]) + >>> s.rolling(2, min_periods=1).sem() + 0 NaN + 1 0.707107 + 2 0.707107 + 3 0.707107 + dtype: float64 + + >>> s.expanding().sem() + 0 NaN + 1 0.707107 + 2 0.707107 + 3 0.745356 + dtype: float64 + """ + ) + def kurt(self, **kwargs): - window_func = self._get_cython_func_type("roll_kurt") - kwargs.pop("require_min_periods", None) + window_func = self._get_roll_func("roll_kurt") return self._apply( window_func, - center=self.center, - require_min_periods=4, name="kurt", **kwargs, ) @@ -1635,8 +1599,6 @@ def kurt(self, **kwargs): quantile : float Quantile to compute. 0 <= quantile <= 1. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - .. 
versionadded:: 0.23.0
-
 This optional parameter specifies the interpolation method to use,
 when the desired quantile lies between two data points `i` and `j`:
@@ -1682,23 +1644,19 @@ def kurt(self, **kwargs):
 """
 )
- def quantile(self, quantile, interpolation="linear", **kwargs):
+ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs):
 if quantile == 1.0:
- window_func = self._get_cython_func_type("roll_max")
+ window_func = self._get_roll_func("roll_max")
 elif quantile == 0.0:
- window_func = self._get_cython_func_type("roll_min")
+ window_func = self._get_roll_func("roll_min")
 else:
 window_func = partial(
 self._get_roll_func("roll_quantile"),
- win=self._get_window(),
 quantile=quantile,
 interpolation=interpolation,
 )
- # Pass through for groupby.rolling
- kwargs["quantile"] = quantile
- kwargs["interpolation"] = interpolation
- return self._apply(window_func, center=self.center, name="quantile", **kwargs)
+ return self._apply(window_func, name="quantile", **kwargs)
 _shared_docs[
 "cov"
 ] = """
@@ -1734,14 +1692,10 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
 # GH 32865. We leverage rolling.mean, so we pass
 # to the rolling constructors the data used when constructing self:
 # window width, frequency data, or a BaseIndexer subclass
- if isinstance(self.window, BaseIndexer):
- window = self.window
- else:
- # GH 16058: offset window
- if self.is_freq_type:
- window = self.win_freq
- else:
- window = self._get_window(other)
+ # GH 16058: offset window
+ window = (
+ self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq
+ )
 def _get_cov(X, Y):
 # GH #12373 : rolling functions error on float32 data
@@ -1759,7 +1713,7 @@ def _get_cov(X, Y):
 bias_adj = count / (count - ddof)
 return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj
- return _flex_binary_moment(
+ return flex_binary_moment(
 self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)
 )
@@ -1883,10 +1837,10 @@ def corr(self, other=None, pairwise=None, **kwargs):
 # GH 32865. We leverage rolling.cov and rolling.std here, so we pass
 # to the rolling constructors the data used when constructing self:
 # window width, frequency data, or a BaseIndexer subclass
- if isinstance(self.window, BaseIndexer):
- window = self.window
- else:
- window = self._get_window(other) if not self.is_freq_type else self.win_freq
+ # GH 16058: offset window
+ window = (
+ self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq
+ )
 def _get_corr(a, b):
 a = a.rolling(
@@ -1895,15 +1849,17 @@ def _get_corr(a, b):
 b = b.rolling(
 window=window, min_periods=self.min_periods, center=self.center
 )
+ # GH 31286: By using var instead of std we can avoid numerical
+ # issues when the result of var is within floating point precision
+ # while std is not.
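The comment above refers to the identity corr(a, b) = cov(a, b) / sqrt(var(a) * var(b)), which the next line implements; a quick sanity check with plain numpy arrays (illustrative values only):

>>> a, b = np.array([1.0, 2.0, 4.0]), np.array([1.0, 3.0, 5.0])
>>> cov_ab = np.cov(a, b, ddof=1)[0, 1]
>>> np.isclose(cov_ab / np.sqrt(a.var(ddof=1) * b.var(ddof=1)), np.corrcoef(a, b)[0, 1])
True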
+ return a.cov(b, **kwargs) / (a.var(**kwargs) * b.var(**kwargs)) ** 0.5 - return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) - - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) -class Rolling(_Rolling_and_Expanding): +class Rolling(RollingAndExpandingMixin): @cache_readonly def is_datetimelike(self) -> bool: return isinstance( @@ -1928,6 +1884,10 @@ def _on(self) -> Index: "must be a column (of DataFrame), an Index or None" ) + @property + def _constructor(self): + return Rolling + def validate(self): super().validate() @@ -1937,7 +1897,6 @@ def validate(self): ): self._validate_monotonic() - freq = self._validate_freq() # we don't allow center if self.center: @@ -1948,7 +1907,7 @@ def validate(self): # this will raise ValueError on non-fixed freqs self.win_freq = self.window - self.window = freq.nanos + self.window = self._determine_window_length() self.win_type = "freq" # min_periods must be an integer @@ -1963,20 +1922,28 @@ def validate(self): elif self.window < 0: raise ValueError("window must be non-negative") - if not self.is_datetimelike and self.closed is not None: - raise ValueError( - "closed only implemented for datetimelike and offset based windows" - ) + def _determine_window_length(self) -> Union[int, float]: + """ + Calculate freq for PeriodIndexes based on Index freq. Can not use + nanos, because asi8 of PeriodIndex is not in nanos + """ + freq = self._validate_freq() + if isinstance(self._on, ABCPeriodIndex): + return freq.nanos / (self._on.freq.nanos / self._on.freq.n) + return freq.nanos def _validate_monotonic(self): """ Validate monotonic (increasing or decreasing). """ if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): - formatted = self.on - if self.on is None: - formatted = "index" - raise ValueError(f"{formatted} must be monotonic") + self._raise_monotonic_error() + + def _raise_monotonic_error(self): + formatted = self.on + if self.on is None: + formatted = "index" + raise ValueError(f"{formatted} must be monotonic") def _validate_freq(self): """ @@ -2028,7 +1995,6 @@ def _validate_freq(self): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/Dataframe", axis="", ) @@ -2040,26 +2006,22 @@ def aggregate(self, func, *args, **kwargs): @Substitution(name="rolling") @Appender(_shared_docs["count"]) def count(self): - - # different impl for freq counting - # GH 32865. Use a custom count function implementation - # when using a BaseIndexer subclass as a window - if self.is_freq_type or isinstance(self.window, BaseIndexer): - window_func = self._get_roll_func("roll_count") - return self._apply(window_func, center=self.center, name="count") - + if self.min_periods is None: + warnings.warn( + ( + "min_periods=None will default to the size of window " + "consistent with other methods in a future version. " + "Specify min_periods=0 instead." 
+ ), + FutureWarning, + ) + self.min_periods = 0 return super().count() @Substitution(name="rolling") @Appender(_shared_docs["apply"]) def apply( - self, - func, - raw=False, - engine="cython", - engine_kwargs=None, - args=None, - kwargs=None, + self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None ): return super().apply( func, @@ -2118,6 +2080,11 @@ def var(self, ddof=1, *args, **kwargs): def skew(self, **kwargs): return super().skew(**kwargs) + @Substitution(name="rolling") + @Appender(_shared_docs["sem"]) + def sem(self, ddof=1, *args, **kwargs): + return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) + _agg_doc = dedent( """ Examples @@ -2171,124 +2138,49 @@ def corr(self, other=None, pairwise=None, **kwargs): Rolling.__doc__ = Window.__doc__ -class RollingGroupby(WindowGroupByMixin, Rolling): +class RollingGroupby(BaseWindowGroupby, Rolling): """ Provide a rolling groupby implementation. """ - def _apply( - self, - func: Callable, - center: bool, - require_min_periods: int = 0, - floor: int = 1, - is_weighted: bool = False, - name: Optional[str] = None, - use_numba_cache: bool = False, - **kwargs, - ): - result = Rolling._apply( - self, - func, - center, - require_min_periods, - floor, - is_weighted, - name, - use_numba_cache, - **kwargs, - ) - # Cannot use _wrap_outputs because we calculate the result all at once - # Compose MultiIndex result from grouping levels then rolling level - # Aggregate the MultiIndex data as tuples then the level names - grouped_object_index = self._groupby._selected_obj.index - grouped_index_name = [grouped_object_index.name] - groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] - result_index_names = groupby_keys + grouped_index_name - - result_index_data = [] - for key, values in self._groupby.grouper.indices.items(): - for value in values: - if not is_list_like(key): - data = [key, grouped_object_index[value]] - else: - data = [*key, grouped_object_index[value]] - result_index_data.append(tuple(data)) - - result_index = MultiIndex.from_tuples( - result_index_data, names=result_index_names - ) - result.index = result_index - return result - - @property - def _constructor(self): - return Rolling - - def _create_blocks(self, obj: FrameOrSeries): - """ - Split data into blocks & return conformed data. - """ - # Ensure the object we're rolling over is monotonically sorted relative - # to the groups - groupby_order = np.concatenate( - list(self._groupby.grouper.indices.values()) - ).astype(np.int64) - obj = obj.take(groupby_order) - return super()._create_blocks(obj) - - def _get_cython_func_type(self, func: str) -> Callable: - """ - Return the cython function type. 
-
- RollingGroupby needs to always use "variable" algorithms since processing
- the data in group order may not be monotonic with the data which
- "fixed" algorithms assume
- return self._get_roll_func(f"{func}_variable")
-
- def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer:
+ def _get_window_indexer(self) -> GroupbyIndexer:
 """
 Return an indexer class that will compute the window start and end bounds
- Parameters
- ----------
- window : int
- window size for FixedWindowIndexer
-
 Returns
 -------
- GroupbyRollingIndexer
+ GroupbyIndexer
 """
- rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]]
- if self.is_freq_type:
+ rolling_indexer: Type[BaseIndexer]
+ indexer_kwargs: Optional[Dict[str, Any]] = None
+ index_array = self._index_array
+ window = self.window
+ if isinstance(self.window, BaseIndexer):
+ rolling_indexer = type(self.window)
+ indexer_kwargs = self.window.__dict__
+ assert isinstance(indexer_kwargs, dict) # for mypy
+ # We'll be using the index of each group later
+ indexer_kwargs.pop("index_array", None)
+ window = 0
+ elif self.is_freq_type:
 rolling_indexer = VariableWindowIndexer
- index_array = self._groupby._selected_obj.index.asi8
 else:
 rolling_indexer = FixedWindowIndexer
 index_array = None
- window_indexer = GroupbyRollingIndexer(
+ window_indexer = GroupbyIndexer(
 index_array=index_array,
 window_size=window,
 groupby_indicies=self._groupby.indices,
- rolling_indexer=rolling_indexer,
+ window_indexer=rolling_indexer,
+ indexer_kwargs=indexer_kwargs,
 )
 return window_indexer
- def _gotitem(self, key, ndim, subset=None):
- # we are setting the index on the actual object
- # here so our index is carried thru to the selected obj
- # when we do the splitting for the groupby
- if self.on is not None:
- self._groupby.obj = self._groupby.obj.set_index(self._on)
- self.on = None
- return super()._gotitem(key, ndim, subset=subset)
-
 def _validate_monotonic(self):
 """
 Validate that on is monotonic;
- we don't care for groupby.rolling
- because we have already validated at a higher
- level.
+ in this case we have to check only for nans, because
+ monotonicity was already validated at a higher level.
 """
- pass
+ if self._on.hasnans:
+ self._raise_monotonic_error()
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
index 6ac3004d29996..ea60ae5c1d227 100644
--- a/pandas/errors/__init__.py
+++ b/pandas/errors/__init__.py
@@ -202,9 +202,30 @@ class NumbaUtilError(Exception):
 """
+class DuplicateLabelError(ValueError):
+ """
+ Error raised when an operation would introduce duplicate labels.
+
+ .. versionadded:: 1.2.0
+
+ Examples
+ --------
+ >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags(
+ ... allows_duplicate_labels=False
+ ... )
+ >>> s.reindex(['a', 'a', 'b'])
+ Traceback (most recent call last):
+ ...
+ DuplicateLabelError: Index has duplicates.
+ positions
+ label
+ a [0, 1]
+ """
+
+
 class InvalidIndexError(Exception):
 """
- Exception raised when attemping to use an invalid index key.
+ Exception raised when attempting to use an invalid index key.
 ..
versionadded:: 1.1.0
 """
diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py
index 40bff5a75709b..a8020f4bb4e4f 100644
--- a/pandas/io/clipboard/__init__.py
+++ b/pandas/io/clipboard/__init__.py
@@ -274,7 +274,7 @@ def copy_dev_clipboard(text):
 fo.write(text)
 def paste_dev_clipboard() -> str:
- with open("/dev/clipboard", "rt") as fo:
+ with open("/dev/clipboard") as fo:
 content = fo.read()
 return content
@@ -311,17 +311,17 @@ def init_windows_clipboard():
 global HGLOBAL, LPVOID, DWORD, LPCSTR, INT
 global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE
 from ctypes.wintypes import (
- HGLOBAL,
- LPVOID,
+ BOOL,
 DWORD,
- LPCSTR,
- INT,
- HWND,
+ HANDLE,
+ HGLOBAL,
 HINSTANCE,
 HMENU,
- BOOL,
+ HWND,
+ INT,
+ LPCSTR,
+ LPVOID,
 UINT,
- HANDLE,
 )
 windll = ctypes.windll
@@ -521,15 +521,15 @@ def determine_clipboard():
 return init_windows_clipboard()
 if platform.system() == "Linux":
- with open("/proc/version", "r") as f:
+ with open("/proc/version") as f:
 if "Microsoft" in f.read():
 return init_wsl_clipboard()
 # Setup for the MAC OS X platform:
 if os.name == "mac" or platform.system() == "Darwin":
 try:
- import Foundation # check if pyobjc is installed
 import AppKit
+ import Foundation # check if pyobjc is installed
 except ImportError:
 return init_osx_pbcopy_clipboard()
 else:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 51323c5ff3ef5..9fede5180e727 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -2,24 +2,12 @@
 import bz2
 from collections import abc
+import dataclasses
 import gzip
-from io import BufferedIOBase, BytesIO, RawIOBase
+from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper
 import mmap
 import os
-import pathlib
-from typing import (
- IO,
- TYPE_CHECKING,
- Any,
- AnyStr,
- Dict,
- List,
- Mapping,
- Optional,
- Tuple,
- Type,
- Union,
-)
+from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast
 from urllib.parse import (
 urljoin,
 urlparse as parse_url,
@@ -27,23 +15,93 @@
 uses_params,
 uses_relative,
 )
+import warnings
 import zipfile
-from pandas._typing import FilePathOrBuffer
-from pandas.compat import _get_lzma_file, _import_lzma
+from pandas._typing import (
+ Buffer,
+ CompressionDict,
+ CompressionOptions,
+ FileOrBuffer,
+ FilePathOrBuffer,
+ StorageOptions,
+)
+from pandas.compat import get_lzma_file, import_lzma
 from pandas.compat._optional import import_optional_dependency
 from pandas.core.dtypes.common import is_file_like
-lzma = _import_lzma()
+lzma = import_lzma()
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard("")
-if TYPE_CHECKING:
- from io import IOBase # noqa: F401
+@dataclasses.dataclass
+class IOArgs:
+ """
+ Return value of io/common.py:_get_filepath_or_buffer.
+
+ Note (copy&paste from io/parsers):
+ filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
+ though mypy handling of conditional imports is difficult.
+ See https://github.com/python/mypy/issues/1297
+ """
+
+ filepath_or_buffer: FileOrBuffer
+ encoding: str
+ mode: str
+ compression: CompressionDict
+ should_close: bool = False
+
+
+@dataclasses.dataclass
+class IOHandles:
+ """
+ Return value of io/common.py:get_handle
+
+ Can be used as a context manager.
+
+ This is used to easily close created buffers and to handle corner cases when
+ TextIOWrapper is inserted.
+
+ handle: The file handle to be used.
+ created_handles: All file handles that are created by get_handle
+ is_wrapped: Whether a TextIOWrapper needs to be detached.
+ """
+
+ handle: Buffer
+ compression: CompressionDict
+ created_handles: List[Buffer] = dataclasses.field(default_factory=list)
+ is_wrapped: bool = False
+ is_mmap: bool = False
+
+ def close(self) -> None:
+ """
+ Close all created buffers.
+
+ Note: If a TextIOWrapper was inserted, it is flushed and detached to
+ avoid closing the potentially user-created buffer.
+ """
+ if self.is_wrapped:
+ assert isinstance(self.handle, TextIOWrapper)
+ self.handle.flush()
+ self.handle.detach()
+ self.created_handles.remove(self.handle)
+ try:
+ for handle in self.created_handles:
+ handle.close()
+ except (OSError, ValueError):
+ pass
+ self.created_handles = []
+ self.is_wrapped = False
+
+ def __enter__(self) -> "IOHandles":
+ return self
+
+ def __exit__(self, *args: Any) -> None:
+ self.close()
 def is_url(url) -> bool:
@@ -64,9 +122,7 @@ def is_url(url) -> bool:
 return parse_url(url).scheme in _VALID_URLS
-def _expand_user(
- filepath_or_buffer: FilePathOrBuffer[AnyStr],
-) -> FilePathOrBuffer[AnyStr]:
+def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]:
 """
 Return the argument with an initial component of ~ or ~user
 replaced by that user's home directory.
@@ -96,7 +152,7 @@ def validate_header_arg(header) -> None:
 def stringify_path(
 filepath_or_buffer: FilePathOrBuffer[AnyStr],
-) -> FilePathOrBuffer[AnyStr]:
+) -> FileOrBuffer[AnyStr]:
 """
 Attempt to convert a path-like object to a string.
@@ -119,11 +175,8 @@ def stringify_path(
 Any other object is passed through unchanged, which includes bytes, strings,
 buffers, or anything else that's not even path-like.
 """
- if hasattr(filepath_or_buffer, "__fspath__"):
- # https://github.com/python/mypy/issues/1424
- return filepath_or_buffer.__fspath__() # type: ignore
- elif isinstance(filepath_or_buffer, pathlib.Path):
- return str(filepath_or_buffer)
+ if isinstance(filepath_or_buffer, os.PathLike):
+ filepath_or_buffer = filepath_or_buffer.__fspath__()
 return _expand_user(filepath_or_buffer)
@@ -149,13 +202,13 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
 )
-def get_filepath_or_buffer(
+def _get_filepath_or_buffer(
 filepath_or_buffer: FilePathOrBuffer,
- encoding: Optional[str] = None,
- compression: Optional[str] = None,
- mode: Optional[str] = None,
- storage_options: Optional[Dict[str, Any]] = None,
-):
+ encoding: str = "utf-8",
+ compression: CompressionOptions = None,
+ mode: str = "r",
+ storage_options: StorageOptions = None,
+) -> IOArgs:
 """
 If the filepath_or_buffer is a url, translate and return the buffer.
 Otherwise passthrough.
@@ -167,27 +220,81 @@
 compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
 encoding : the encoding to use to decode bytes, default is 'utf-8'
 mode : str, optional
- storage_options: dict, optional
- passed on to fsspec, if using it; this is not yet accessed by the public API
- Returns
- -------
- Tuple[FilePathOrBuffer, str, str, bool]
- Tuple containing the filepath or buffer, the encoding, the compression
- and should_close.
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values
+
+ .. versionadded:: 1.2.0
+
+ .. versionchanged:: 1.2.0
+
+ Returns the dataclass IOArgs.
""" filepath_or_buffer = stringify_path(filepath_or_buffer) + # handle compression dict + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(filepath_or_buffer, compression_method) + + # GH21227 internal compression is not used for non-binary handles. + if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode: + warnings.warn( + "compression has no effect when passing a non-binary object as input.", + RuntimeWarning, + stacklevel=2, + ) + compression_method = None + + compression = dict(compression, method=compression_method) + + # uniform encoding names + if encoding is not None: + encoding = encoding.replace("_", "-").lower() + + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 + # print a warning when writing such files + if ( + "w" in mode + and compression_method in ["bz2", "xz"] + and encoding in ["utf-16", "utf-32"] + ): + warnings.warn( + f"{compression} will not write the byte order mark for {encoding}", + UnicodeWarning, + ) + + # Use binary mode when converting path-like objects to file-like objects (fsspec) + # except when text mode is explicitly requested. The original mode is returned if + # fsspec is not used. + fsspec_mode = mode + if "t" not in fsspec_mode and "b" not in fsspec_mode: + fsspec_mode += "b" + if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header - compression = "gzip" + compression = {"method": "gzip"} reader = BytesIO(req.read()) req.close() - return reader, encoding, compression, True + return IOArgs( + filepath_or_buffer=reader, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) if is_fsspec_url(filepath_or_buffer): assert isinstance( @@ -202,19 +309,69 @@ def get_filepath_or_buffer( filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") - file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) - ).open() - return file_obj, encoding, compression, True + # If botocore is installed we fallback to reading with anon=True + # to allow reads from public buckets + err_types_to_retry_with_anon: List[Any] = [] + try: + import_optional_dependency("botocore") + from botocore.exceptions import ClientError, NoCredentialsError + + err_types_to_retry_with_anon = [ + ClientError, + NoCredentialsError, + PermissionError, + ] + except ImportError: + pass + + try: + file_obj = fsspec.open( + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) + ).open() + # GH 34626 Reads from Public Buckets without Credentials needs anon=True + except tuple(err_types_to_retry_with_anon): + if storage_options is None: + storage_options = {"anon": True} + else: + # don't mutate user input. 
+ storage_options = dict(storage_options) + storage_options["anon"] = True + file_obj = fsspec.open( + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) + ).open() + + return IOArgs( + filepath_or_buffer=file_obj, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return _expand_user(filepath_or_buffer), None, compression, False + return IOArgs( + filepath_or_buffer=_expand_user(filepath_or_buffer), + encoding=encoding, + compression=compression, + should_close=False, + mode=mode, + ) if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return filepath_or_buffer, None, compression, False + return IOArgs( + filepath_or_buffer=filepath_or_buffer, + encoding=encoding, + compression=compression, + should_close=False, + mode=mode, + ) def file_path_to_url(path: str) -> str: @@ -239,8 +396,8 @@ def file_path_to_url(path: str) -> str: def get_compression_method( - compression: Optional[Union[str, Mapping[str, str]]] -) -> Tuple[Optional[str], Dict[str, str]]: + compression: CompressionOptions, +) -> Tuple[Optional[str], CompressionDict]: """ Simplifies a compression argument to a compression method string and a mapping containing additional arguments. @@ -254,21 +411,23 @@ def get_compression_method( Returns ------- tuple of ({compression method}, Optional[str] - {compression arguments}, Dict[str, str]) + {compression arguments}, Dict[str, Any]) Raises ------ ValueError on mapping missing 'method' key """ + compression_method: Optional[str] if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression = compression_args.pop("method") + compression_method = compression_args.pop("method") except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: compression_args = {} - return compression, compression_args + compression_method = compression + return compression_method, compression_args def infer_compression( @@ -297,7 +456,6 @@ def infer_compression( ------ ValueError on invalid compression specified. """ - # No compression has been explicitly specified if compression is None: return None @@ -311,7 +469,7 @@ def infer_compression( # Infer compression from the filename/URL extension for compression, extension in _compression_to_extension.items(): - if filepath_or_buffer.endswith(extension): + if filepath_or_buffer.lower().endswith(extension): return compression return None @@ -326,14 +484,15 @@ def infer_compression( def get_handle( - path_or_buf, + path_or_buf: FilePathOrBuffer, mode: str, - encoding=None, - compression: Optional[Union[str, Mapping[str, Any]]] = None, + encoding: Optional[str] = None, + compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, - errors=None, -): + errors: Optional[str] = None, + storage_options: StorageOptions = None, +) -> IOHandles: """ Get file handle for given path/buffer and mode. @@ -369,77 +528,85 @@ def get_handle( memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True - whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.). + Whether the type of the content passed to the file/buffer is string or + bytes. 
This is not the same as `"b" not in mode`. If a string content is + passed to a binary file/buffer, a wrapper is inserted. errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. + storage_options: StorageOptions = None + Passed to _get_filepath_or_buffer - .. versionadded:: 1.1.0 + .. versionchanged:: 1.2.0 - Returns - ------- - f : file-like - A file-like object. - handles : list of file-like objects - A list of file-like object that were opened in this function. + Returns the dataclass IOHandles """ - need_text_wrapping: Tuple[Type["IOBase"], ...] - try: - from s3fs import S3File - - need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) - except ImportError: - need_text_wrapping = (BufferedIOBase, RawIOBase) + # Windows does not default to utf-8. Set to utf-8 for a consistent behavior + if encoding is None: + encoding = "utf-8" + + # read_csv does not know whether the buffer is opened in binary/text mode + if _is_binary_mode(path_or_buf, mode) and "b" not in mode: + mode += "b" + + # open URLs + ioargs = _get_filepath_or_buffer( + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, + ) - handles: List[IO] = list() - f = path_or_buf + handle = ioargs.filepath_or_buffer + handles: List[Buffer] - # Convert pathlib.Path/py.path.local or string - path_or_buf = stringify_path(path_or_buf) - is_path = isinstance(path_or_buf, str) + # memory mapping needs to be the first step + handle, memory_map, handles = _maybe_memory_map( + handle, memory_map, ioargs.encoding, ioargs.mode, errors + ) - compression, compression_args = get_compression_method(compression) - if is_path: - compression = infer_compression(path_or_buf, compression) + is_path = isinstance(handle, str) + compression_args = dict(ioargs.compression) + compression = compression_args.pop("method") if compression: - - # GH33398 the type ignores here seem related to mypy issue #5382; - # it may be possible to remove them once that is resolved. + # compression libraries do not like an explicit text-mode + ioargs.mode = ioargs.mode.replace("t", "") # GZ Compression if compression == "gzip": if is_path: - f = gzip.open( - path_or_buf, mode, **compression_args # type: ignore + assert isinstance(handle, str) + handle = gzip.GzipFile( + filename=handle, + mode=ioargs.mode, + **compression_args, ) else: - f = gzip.GzipFile( - fileobj=path_or_buf, **compression_args # type: ignore + handle = gzip.GzipFile( + fileobj=handle, # type: ignore[arg-type] + mode=ioargs.mode, + **compression_args, ) # BZ Compression elif compression == "bz2": - if is_path: - f = bz2.BZ2File( - path_or_buf, mode, **compression_args # type: ignore - ) - else: - f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore + handle = bz2.BZ2File( + handle, # type: ignore[arg-type] + mode=ioargs.mode, + **compression_args, + ) # ZIP Compression elif compression == "zip": - zf = _BytesZipFile(path_or_buf, mode, **compression_args) - # Ensure the container is closed as well. 
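A hedged sketch of how the ``IOHandles`` value returned by ``get_handle`` is meant to be consumed as a context manager (the file name is hypothetical):

>>> from pandas.io.common import get_handle
>>> with get_handle("example.csv", "w", compression=None) as handles:
...     _ = handles.handle.write("a,b\n1,2\n")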
- handles.append(zf) - if zf.mode == "w": - f = zf - elif zf.mode == "r": - zip_names = zf.namelist() + handle = _BytesZipFile(handle, ioargs.mode, **compression_args) + if handle.mode == "r": + handles.append(handle) + zip_names = handle.namelist() if len(zip_names) == 1: - f = zf.open(zip_names.pop()) + handle = handle.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: @@ -450,52 +617,76 @@ def get_handle( # XZ Compression elif compression == "xz": - f = _get_lzma_file(lzma)(path_or_buf, mode) + handle = get_lzma_file(lzma)(handle, ioargs.mode) # Unrecognized Compression else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) - handles.append(f) + assert not isinstance(handle, str) + handles.append(handle) - elif is_path: - if encoding: + elif isinstance(handle, str): + # Check whether the filename is to be opened in binary mode. + # Binary mode does not support 'encoding' and 'newline'. + if ioargs.encoding and "b" not in ioargs.mode: # Encoding - f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") - elif is_text: - # No explicit encoding - f = open(path_or_buf, mode, errors="replace", newline="") + handle = open( + handle, + ioargs.mode, + encoding=ioargs.encoding, + errors=errors, + newline="", + ) else: # Binary mode - f = open(path_or_buf, mode) - handles.append(f) + handle = open(handle, ioargs.mode) + handles.append(handle) # Convert BytesIO or file objects passed with an encoding - if is_text and (compression or isinstance(f, need_text_wrapping)): - from io import TextIOWrapper - - g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") - if not isinstance(f, (BufferedIOBase, RawIOBase)): - handles.append(g) - f = g - - if memory_map and hasattr(f, "fileno"): - try: - wrapped = _MMapWrapper(f) - f.close() - f = wrapped - except Exception: - # we catch any errors that may have occurred - # because that is consistent with the lower-level - # functionality of the C engine (pd.read_csv), so - # leave the file handler as is then - pass + is_wrapped = False + if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): + handle = TextIOWrapper( + handle, # type: ignore[arg-type] + encoding=ioargs.encoding, + errors=errors, + newline="", + ) + handles.append(handle) + # only marked as wrapped when the caller provided a handle + is_wrapped = not ( + isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close + ) - return f, handles + handles.reverse() # close the most recently added buffer first + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + handles.append(ioargs.filepath_or_buffer) + + assert not isinstance(handle, str) + return IOHandles( + handle=handle, + created_handles=handles, + is_wrapped=is_wrapped, + is_mmap=memory_map, + compression=ioargs.compression, + ) -class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore +# error: Definition of "__exit__" in base class "ZipFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "ZipFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "ZipFile" is incompatible with +# definition in base class "BinaryIO" [misc] +# error: Definition of "__enter__" in base class "ZipFile" is incompatible with +# definition in base class "IO" [misc] +# error: Definition of "read" in base class "ZipFile" is incompatible with +# definition in 
base class "BytesIO" [misc] +# error: Definition of "read" in base class "ZipFile" is incompatible with +# definition in base class "IO" [misc] +class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc] """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -515,12 +706,13 @@ def __init__( if mode in ["wb", "rb"]: mode = mode.replace("b", "") self.archive_name = archive_name - super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} + kwargs_zip.update(kwargs) + super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): - archive_name = self.filename - if self.archive_name is not None: - archive_name = self.archive_name + # ZipFile needs a non-empty string + archive_name = self.archive_name or self.filename or "zip" super().writestr(archive_name, data) @property @@ -542,9 +734,16 @@ class _MMapWrapper(abc.Iterator): """ def __init__(self, f: IO): + self.attributes = {} + for attribute in ("seekable", "readable", "writeable"): + if not hasattr(f, attribute): + continue + self.attributes[attribute] = getattr(f, attribute)() self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) def __getattr__(self, name: str): + if name in self.attributes: + return lambda: self.attributes[name] return getattr(self.mmap, name) def __iter__(self) -> "_MMapWrapper": @@ -563,3 +762,66 @@ def __next__(self) -> str: if newline == "": raise StopIteration return newline + + +def _maybe_memory_map( + handle: FileOrBuffer, + memory_map: bool, + encoding: str, + mode: str, + errors: Optional[str], +) -> Tuple[FileOrBuffer, bool, List[Buffer]]: + """Try to memory map file/buffer.""" + handles: List[Buffer] = [] + memory_map &= hasattr(handle, "fileno") or isinstance(handle, str) + if not memory_map: + return handle, memory_map, handles + + # need to open the file first + if isinstance(handle, str): + if encoding and "b" not in mode: + # Encoding + handle = open(handle, mode, encoding=encoding, errors=errors, newline="") + else: + # Binary mode + handle = open(handle, mode) + handles.append(handle) + + try: + wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] + handle.close() + handles.remove(handle) + handles.append(wrapped) + handle = wrapped + except Exception: + # we catch any errors that may have occurred + # because that is consistent with the lower-level + # functionality of the C engine (pd.read_csv), so + # leave the file handler as is then + memory_map = False + + return handle, memory_map, handles + + +def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: + """Test whether file exists.""" + exists = False + filepath_or_buffer = stringify_path(filepath_or_buffer) + if not isinstance(filepath_or_buffer, str): + return exists + try: + exists = os.path.exists(filepath_or_buffer) + # gh-5874: if the filepath is too long will raise here + except (TypeError, ValueError): + pass + return exists + + +def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: + """Whether the handle is opened in binary mode""" + # classes that expect bytes + binary_classes = [BufferedIOBase, RawIOBase] + + return isinstance(handle, tuple(binary_classes)) or "b" in getattr( + handle, "mode", mode + ) diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 07919dbda63ae..f079a25f69fec 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,16 +1,46 @@ 
"""This module is designed for community supported date conversion functions""" +import warnings + import numpy as np from pandas._libs.tslibs import parsing def parse_date_time(date_col, time_col): + """ + Parse columns with dates and times into a single datetime column. + + .. deprecated:: 1.2 + """ + warnings.warn( + """ + Use pd.to_datetime(date_col + " " + time_col) instead to get a Pandas Series. + Use pd.to_datetime(date_col + " " + time_col).to_pydatetime() instead to get a Numpy array. +""", # noqa: E501 + FutureWarning, + stacklevel=2, + ) date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) return parsing.try_parse_date_and_time(date_col, time_col) def parse_date_fields(year_col, month_col, day_col): + """ + Parse columns with years, months and days into a single date column. + + .. deprecated:: 1.2 + """ + warnings.warn( + """ + Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) instead to get a Pandas Series. + Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) and + np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. +""", # noqa: E501 + FutureWarning, + stacklevel=2, + ) + year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) @@ -18,6 +48,24 @@ def parse_date_fields(year_col, month_col, day_col): def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col): + """ + Parse columns with datetime information into a single datetime column. + + .. deprecated:: 1.2 + """ + + warnings.warn( + """ + Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col, + "hour": hour_col, "minute": minute_col, second": second_col}) instead to get a Pandas Series. + Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col, + "hour": hour_col, "minute": minute_col, second": second_col}) and + np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. +""", # noqa: E501 + FutureWarning, + stacklevel=2, + ) + year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) @@ -30,6 +78,20 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_ def generic_parser(parse_func, *cols): + """ + Use dateparser to parse columns with data information into a single datetime column. + + .. deprecated:: 1.2 + """ + + warnings.warn( + """ + Use pd.to_datetime instead. 
+""", + FutureWarning, + stacklevel=2, + ) + N = _check_columns(cols) results = np.empty(N, dtype=object) diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index d035223957a76..3bad493dee388 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -1,9 +1,9 @@ from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel -from pandas.io.excel._odswriter import _ODSWriter -from pandas.io.excel._openpyxl import _OpenpyxlWriter +from pandas.io.excel._odswriter import ODSWriter as _ODSWriter +from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter from pandas.io.excel._util import register_writer -from pandas.io.excel._xlsxwriter import _XlsxWriter -from pandas.io.excel._xlwt import _XlwtWriter +from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter +from pandas.io.excel._xlwt import XlwtWriter as _XlwtWriter __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2a12f779230b2..bf1011176693f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,13 +1,17 @@ import abc import datetime +import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Union +from typing import Any, Dict, Mapping, Union, cast +import warnings from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions +from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -15,19 +19,13 @@ from pandas.core.frame import DataFrame -from pandas.io.common import ( - get_filepath_or_buffer, - is_url, - stringify_path, - urlopen, - validate_header_arg, -) +from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg from pandas.io.excel._util import ( - _fill_mi_header, - _get_default_writer, - _maybe_convert_usecols, - _pop_header_name, + fill_mi_header, + get_default_writer, get_writer, + maybe_convert_usecols, + pop_header_name, ) from pandas.io.parsers import TextParser @@ -49,7 +47,7 @@ If you want to pass in a path object, pandas accepts any ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. sheet_name : str, int, list, or None, default 0 Strings are used for sheet names. Integers are used in zero-indexed @@ -104,12 +102,32 @@ of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". Engine compatibility : + - "xlrd" supports most old/new Excel file formats. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + + .. versionchanged:: 1.2.0 + The engine `xlrd `_ + is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. 
+ - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will + be used. + - Otherwise if `openpyxl `_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future. + converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -119,13 +137,14 @@ Values to consider as True. false_values : list, default None Values to consider as False. -skiprows : list-like - Rows to skip at the beginning (0-indexed). +skiprows : list-like, int, or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) at the + start of the file. If callable, the callable function will be evaluated + against the row indices, returning True if the row should be skipped and + False otherwise. An example of a valid callable argument would be ``lambda + x: x in [0, 2]``. nrows : int, default None Number of rows to parse. - - .. versionadded:: 0.23.0 - na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted @@ -199,6 +218,15 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. +storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. 
versionadded:: 1.2.0 Returns ------- @@ -298,61 +326,70 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, + storage_options: StorageOptions = None, ): + should_close = False if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + should_close = True + io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " "an ExcelFile - ExcelFile already has the engine set" ) - return io.parse( - sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - dtype=dtype, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - verbose=verbose, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - ) - - -class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): - # If filepath_or_buffer is a url, load the data into a BytesIO - if is_url(filepath_or_buffer): - filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) - elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) - - if isinstance(filepath_or_buffer, self._workbook_class): - self.book = filepath_or_buffer - elif hasattr(filepath_or_buffer, "read"): + try: + data = io.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + na_filter=na_filter, + verbose=verbose, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + ) + finally: + # make sure to close opened file handles + if should_close: + io.close() + return data + + +class BaseExcelReader(metaclass=abc.ABCMeta): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): + self.handles = IOHandles( + handle=filepath_or_buffer, compression={"method": None} + ) + if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + self.handles = get_handle( + filepath_or_buffer, "rb", storage_options=storage_options, is_text=False + ) + + if isinstance(self.handles.handle, self._workbook_class): + self.book = self.handles.handle + elif hasattr(self.handles.handle, "read"): # N.B. xlrd.Book has a read attribute too - filepath_or_buffer.seek(0) - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, str): - self.book = self.load_workbook(filepath_or_buffer) - elif isinstance(filepath_or_buffer, bytes): - self.book = self.load_workbook(BytesIO(filepath_or_buffer)) + self.handles.handle.seek(0) + self.book = self.load_workbook(self.handles.handle) + elif isinstance(self.handles.handle, bytes): + self.book = self.load_workbook(BytesIO(self.handles.handle)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." 
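The ``try``/``finally`` added to ``read_excel`` above encodes a simple ownership rule: pandas closes only the handles it opened itself. A hedged sketch of the two calling patterns (file and sheet names hypothetical):

```python
import pandas as pd

# pandas builds the ExcelFile internally and closes it in the finally block:
df = pd.read_excel("report.xlsx", sheet_name="data")

# The caller owns the handle, so pandas leaves it open between reads:
with pd.ExcelFile("report.xlsx") as xls:
    a = pd.read_excel(xls, sheet_name="a")
    b = pd.read_excel(xls, sheet_name="b")
```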
@@ -368,7 +405,7 @@ def load_workbook(self, filepath_or_buffer): pass def close(self): - pass + self.handles.close() @property @abc.abstractmethod @@ -441,7 +478,7 @@ def parse( sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) - usecols = _maybe_convert_usecols(usecols) + usecols = maybe_convert_usecols(usecols) if not data: output[asheetname] = DataFrame() @@ -460,10 +497,10 @@ def parse( if is_integer(skiprows): row += skiprows - data[row], control_row = _fill_mi_header(data[row], control_row) + data[row], control_row = fill_mi_header(data[row], control_row) if index_col is not None: - header_name, _ = _pop_header_name(data[row], index_col) + header_name, _ = pop_header_name(data[row], index_col) header_names.append(header_name) if is_list_like(index_col): @@ -539,23 +576,39 @@ class ExcelWriter(metaclass=abc.ABCMeta): Default is to use xlwt for xls, openpyxl for xlsx, odf for ods. See DataFrame.to_excel for typical usage. + The writer should be used as a context manager. Otherwise, call `close()` to save + and close any opened file handles. + Parameters ---------- - path : str + path : str or typing.BinaryIO Path to xls or xlsx or ods file. engine : str (optional) Engine to use for writing. If None, defaults to ``io.excel..writer``. NOTE: can only be passed as a keyword argument. + + .. deprecated:: 1.2.0 + + As the `xlwt `__ package is no longer + maintained, the ``xlwt`` engine will be removed in a future + version of pandas. + date_format : str, default None Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). datetime_format : str, default None Format string for datetime objects written into Excel files. (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' - File mode to use (write or append). + File mode to use (write or append). Append does not work with fsspec URLs. .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". + + .. versionadded:: 1.2.0 Attributes ---------- @@ -588,14 +641,29 @@ class ExcelWriter(metaclass=abc.ABCMeta): You can set the date format or datetime format: >>> with ExcelWriter('path_to_file.xlsx', - date_format='YYYY-MM-DD', - datetime_format='YYYY-MM-DD HH:MM:SS') as writer: + ... date_format='YYYY-MM-DD', + ... datetime_format='YYYY-MM-DD HH:MM:SS') as writer: ... df.to_excel(writer) You can also append to an existing Excel file: >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: ... df.to_excel(writer, sheet_name='Sheet3') + + You can store Excel file in RAM: + + >>> import io + >>> buffer = io.BytesIO() + >>> with pd.ExcelWriter(buffer) as writer: + ... df.to_excel(writer) + + You can pack Excel file into zip archive: + + >>> import zipfile + >>> with zipfile.ZipFile('path_to_file.zip', 'w') as zf: + ... with zf.open('filename.xlsx', 'w') as buffer: + ... with pd.ExcelWriter(buffer) as writer: + ... df.to_excel(writer) """ # Defining an ExcelWriter implementation (see abstract methods for more...) 
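The ``__new__`` hunk that follows resolves the default writer through the option registry, which is also the escape hatch its deprecation message offers to xlwt users; a hedged sketch (output path hypothetical):

```python
import pandas as pd

# Opting in explicitly silences the FutureWarning for .xls output; per the
# message below, the option itself is deprecated but still honored.
pd.set_option("io.excel.xls.writer", "xlwt")
pd.DataFrame({"a": [1, 2]}).to_excel("legacy.xls")
```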
@@ -630,17 +698,36 @@ def __new__(cls, path, engine=None, **kwargs):
                 ext = "xlsx"

             try:
-                engine = config.get_option(f"io.excel.{ext}.writer")
+                engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
                 if engine == "auto":
-                    engine = _get_default_writer(ext)
+                    engine = get_default_writer(ext)
             except KeyError as err:
                 raise ValueError(f"No engine for filetype: '{ext}'") from err
+
+            if engine == "xlwt":
+                xls_config_engine = config.get_option(
+                    "io.excel.xls.writer", silent=True
+                )
+                # Don't warn a 2nd time if user has changed the default engine for xls
+                if xls_config_engine != "xlwt":
+                    warnings.warn(
+                        "As the xlwt package is no longer maintained, the xlwt "
+                        "engine will be removed in a future version of pandas. "
+                        "This is the only engine in pandas that supports writing "
+                        "in the xls format. Install openpyxl and write to an xlsx "
+                        "file instead. You can set the option io.excel.xls.writer "
+                        "to 'xlwt' to silence this warning. While this option is "
+                        "deprecated and will also raise a warning, it can "
+                        "be globally set and the warning suppressed.",
+                        FutureWarning,
+                        stacklevel=4,
+                    )
+
         cls = get_writer(engine)

         return object.__new__(cls)

     # declare external properties you can count on
-    book = None
     curr_sheet = None
     path = None

@@ -685,11 +772,12 @@ def save(self):

     def __init__(
         self,
-        path,
+        path: Union[FilePathOrBuffer, "ExcelWriter"],
         engine=None,
         date_format=None,
         datetime_format=None,
-        mode="w",
+        mode: str = "w",
+        storage_options: StorageOptions = None,
         **engine_kwargs,
     ):
         # validate that this engine can handle the extension
@@ -697,8 +785,20 @@ def __init__(
             ext = os.path.splitext(path)[-1]
             self.check_extension(ext)

-        self.path = path
-        self.sheets = {}
+        # use mode to open the file
+        if "b" not in mode:
+            mode += "b"
+        # use "a" for the user to append data to excel but internally use "r+" to let
+        # the excel backend first read the existing file and then write any data to it
+        mode = mode.replace("a", "r+")
+
+        # cast ExcelWriter to avoid adding 'if self.handles is not None'
+        self.handles = IOHandles(cast(Buffer, path), compression={"method": None})
+        if not isinstance(path, ExcelWriter):
+            self.handles = get_handle(
+                path, mode, storage_options=storage_options, is_text=False
+            )
+        self.sheets: Dict[str, Any] = {}
         self.cur_sheet = None

         if date_format is None:
@@ -713,7 +813,7 @@ def __init__(
         self.mode = mode

     def __fspath__(self):
-        return stringify_path(self.path)
+        return getattr(self.handles.handle, "name", "")

     def _get_sheet_name(self, sheet_name):
         if sheet_name is None:
@@ -757,14 +857,19 @@ def _value_with_fmt(self, val):
         return val, fmt

     @classmethod
-    def check_extension(cls, ext):
+    def check_extension(cls, ext: str):
         """
         checks that path's extension against the Writer's supported
         extensions.  If it isn't supported, raises UnsupportedFiletypeError.
""" if ext.startswith("."): ext = ext[1:] - if not any(ext in extension for extension in cls.supported_extensions): + # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__" + # (not iterable) [attr-defined] + if not any( + ext in extension + for extension in cls.supported_extensions # type: ignore[attr-defined] + ): raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") else: return True @@ -778,7 +883,9 @@ def __exit__(self, exc_type, exc_value, traceback): def close(self): """synonym for save, to make it more file-like""" - return self.save() + content = self.save() + self.handles.close() + return content def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: @@ -823,48 +930,115 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``xlrd``. + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` Engine compatibility : + - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + + .. versionchanged:: 1.2.0 + + The engine `xlrd `_ + is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf `_ will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` + will be used. + - Otherwise if `openpyxl `_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future. 
""" - from pandas.io.excel._odfreader import _ODFReader - from pandas.io.excel._openpyxl import _OpenpyxlReader - from pandas.io.excel._xlrd import _XlrdReader - from pandas.io.excel._pyxlsb import _PyxlsbReader + from pandas.io.excel._odfreader import ODFReader + from pandas.io.excel._openpyxl import OpenpyxlReader + from pandas.io.excel._pyxlsb import PyxlsbReader + from pandas.io.excel._xlrd import XlrdReader - _engines = { - "xlrd": _XlrdReader, - "openpyxl": _OpenpyxlReader, - "odf": _ODFReader, - "pyxlsb": _PyxlsbReader, + _engines: Mapping[str, Any] = { + "xlrd": XlrdReader, + "openpyxl": OpenpyxlReader, + "odf": ODFReader, + "pyxlsb": PyxlsbReader, } - def __init__(self, path_or_buffer, engine=None): + def __init__( + self, path_or_buffer, engine=None, storage_options: StorageOptions = None + ): if engine is None: - engine = "xlrd" + # Determine ext and use odf for ods stream/file if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + ext = None if _is_ods_stream(path_or_buffer): engine = "odf" else: ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" + + if ( + import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + from xlrd import Book + + if isinstance(path_or_buffer, Book): + engine = "xlrd" + + # GH 35029 - Prefer openpyxl except for xls files + if engine is None: + if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": + engine = "xlrd" + elif ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + engine = "openpyxl" + else: + caller = inspect.stack()[1] + if ( + caller.filename.endswith("pandas/io/excel/_base.py") + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + "The xlrd engine is no longer maintained and is not " + "supported when using pandas with python >= 3.9. However, " + "the engine xlrd will continue to be allowed for the " + "indefinite future. Beginning with pandas 1.2.0, the " + "openpyxl engine will be used if it is installed and the " + "engine argument is not specified. Either install openpyxl " + "or specify engine='xlrd' to silence this warning.", + FutureWarning, + stacklevel=stacklevel, + ) + engine = "xlrd" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") self.engine = engine + self.storage_options = storage_options # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io) + self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 85ec9afaaec25..c5c3927216850 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -2,27 +2,33 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency import pandas as pd -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _ODFReader(_BaseExcelReader): +class ODFReader(BaseExcelReader): """ Read tables out of OpenDocument formatted files. Parameters ---------- - filepath_or_buffer: string, path to be parsed or + filepath_or_buffer : string, path to be parsed or an open readable stream. 
+    storage_options : dict, optional
+        passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``)
     """

-    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+    def __init__(
+        self,
+        filepath_or_buffer: FilePathOrBuffer,
+        storage_options: StorageOptions = None,
+    ):
         import_optional_dependency("odf")
-        super().__init__(filepath_or_buffer)
+        super().__init__(filepath_or_buffer, storage_options=storage_options)

     @property
     def _workbook_class(self):
@@ -63,6 +69,7 @@ def get_sheet_by_name(self, name: str):
             if table.getAttribute("name") == name:
                 return table

+        self.close()
         raise ValueError(f"sheet {name} not found")

     def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
@@ -184,6 +191,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
             result = cast(pd.Timestamp, result)
             return result.time()
         else:
+            self.close()
             raise ValueError(f"Unrecognized type {cell_type}")

     def _get_cell_string_value(self, cell) -> str:
@@ -191,22 +199,24 @@ def _get_cell_string_value(self, cell) -> str:
         Find and decode OpenDocument text:s tags that represent
         a run length encoded sequence of space characters.
         """
-        from odf.element import Text, Element
-        from odf.text import S, P
+        from odf.element import Element
         from odf.namespaces import TEXTNS
+        from odf.text import S

-        text_p = P().qname
         text_s = S().qname

-        p = cell.childNodes[0]
-
         value = []
-        if p.qname == text_p:
-            for k, fragment in enumerate(p.childNodes):
-                if isinstance(fragment, Text):
-                    value.append(fragment.data)
-                elif isinstance(fragment, Element):
-                    if fragment.qname == text_s:
-                        spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
-                        value.append(" " * spaces)
+
+        for fragment in cell.childNodes:
+            if isinstance(fragment, Element):
+                if fragment.qname == text_s:
+                    spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
+                    value.append(" " * spaces)
+                else:
+                    # recursive impl needed in case of nested fragments
+                    # with multiple spaces
+                    # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
+                    value.append(self._get_cell_string_value(fragment))
+            else:
+                value.append(str(fragment))
         return "".join(value)
diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py
index 0131240f99cf6..0bea19bec2cdd 100644
--- a/pandas/io/excel/_odswriter.py
+++ b/pandas/io/excel/_odswriter.py
@@ -3,18 +3,24 @@
 from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union

 import pandas._libs.json as json
+from pandas._typing import StorageOptions

 from pandas.io.excel._base import ExcelWriter
-from pandas.io.excel._util import _validate_freeze_panes
+from pandas.io.excel._util import validate_freeze_panes
 from pandas.io.formats.excel import ExcelCell


-class _ODSWriter(ExcelWriter):
+class ODSWriter(ExcelWriter):
     engine = "odf"
     supported_extensions = (".ods",)

     def __init__(
-        self, path: str, engine: Optional[str] = None, mode: str = "w", **engine_kwargs
+        self,
+        path: str,
+        engine: Optional[str] = None,
+        mode: str = "w",
+        storage_options: StorageOptions = None,
+        **engine_kwargs,
     ):
         from odf.opendocument import OpenDocumentSpreadsheet

@@ -23,9 +29,11 @@ def __init__(
         if mode == "a":
             raise ValueError("Append mode is not supported with odf!")

-        super().__init__(path, mode=mode, **engine_kwargs)
+        super().__init__(
+            path, mode=mode, storage_options=storage_options, **engine_kwargs
+        )

-        self.book: OpenDocumentSpreadsheet = OpenDocumentSpreadsheet()
+        self.book = OpenDocumentSpreadsheet()
         self._style_dict: Dict[str, str] = {}

     def save(self) -> None:
@@ -34,7 +42,7 @@ def save(self) -> None:
         """
         for sheet in
self.sheets.values(): self.book.spreadsheet.addElement(sheet) - self.book.save(self.path) + self.book.save(self.handles.handle) def write_cells( self, @@ -42,7 +50,7 @@ def write_cells( sheet_name: Optional[str] = None, startrow: int = 0, startcol: int = 0, - freeze_panes: Optional[List] = None, + freeze_panes: Optional[Tuple[int, int]] = None, ) -> None: """ Write the frame cells using odf @@ -59,7 +67,7 @@ def write_cells( wks = Table(name=sheet_name) self.sheets[sheet_name] = wks - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): assert freeze_panes is not None self._create_freeze_panes(sheet_name, freeze_panes) @@ -174,7 +182,7 @@ def _process_style(self, style: Dict[str, Any]) -> str: Returns ------- style_key : str - Unique style key for for later reference in sheet + Unique style key for later reference in sheet """ from odf.style import ( ParagraphProperties, @@ -215,14 +223,17 @@ def _process_style(self, style: Dict[str, Any]) -> str: self.book.styles.addElement(odf_style) return name - def _create_freeze_panes(self, sheet_name: str, freeze_panes: List[int]) -> None: - """Create freeze panes in the sheet + def _create_freeze_panes( + self, sheet_name: str, freeze_panes: Tuple[int, int] + ) -> None: + """ + Create freeze panes in the sheet. Parameters ---------- sheet_name : str Name of the spreadsheet - freeze_panes : list + freeze_panes : tuple of (int, int) Freeze pane location x and y """ from odf.config import ( diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..7de958df206d5 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,74 +1,57 @@ -from typing import List +from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import ExcelWriter, _BaseExcelReader -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._base import BaseExcelReader, ExcelWriter +from pandas.io.excel._util import validate_freeze_panes +if TYPE_CHECKING: + from openpyxl.descriptors.serialisable import Serialisable -class _OpenpyxlWriter(ExcelWriter): + +class OpenpyxlWriter(ExcelWriter): engine = "openpyxl" supported_extensions = (".xlsx", ".xlsm") - def __init__(self, path, engine=None, mode="w", **engine_kwargs): + def __init__( + self, + path, + engine=None, + mode: str = "w", + storage_options: StorageOptions = None, + **engine_kwargs, + ): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super().__init__(path, mode=mode, **engine_kwargs) + super().__init__( + path, mode=mode, storage_options=storage_options, **engine_kwargs + ) - if self.mode == "a": # Load from existing workbook + # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from + # the file and later write to it + if "r+" in self.mode: # Load from existing workbook from openpyxl import load_workbook - book = load_workbook(self.path) - self.book = book + self.book = load_workbook(self.handles.handle) else: # Create workbook object with default optimized_write=True. 
self.book = Workbook() if self.book.worksheets: - try: - self.book.remove(self.book.worksheets[0]) - except AttributeError: - - # compat - for openpyxl <= 2.4 - self.book.remove_sheet(self.book.worksheets[0]) + self.book.remove(self.book.worksheets[0]) def save(self): """ Save workbook to disk. """ - return self.book.save(self.path) - - @classmethod - def _convert_to_style(cls, style_dict): - """ - Converts a style_dict to an openpyxl style object. - - Parameters - ---------- - style_dict : style dictionary to convert - """ - from openpyxl.style import Style - - xls_style = Style() - for key, value in style_dict.items(): - for nk, nv in value.items(): - if key == "borders": - ( - xls_style.borders.__getattribute__(nk).__setattr__( - "border_style", nv - ) - ) - else: - xls_style.__getattribute__(key).__setattr__(nk, nv) - - return xls_style + self.book.save(self.handles.handle) @classmethod - def _convert_to_style_kwargs(cls, style_dict): + def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, "Serialisable"]: """ Convert a style_dict to a set of kwargs suitable for initializing or updating-on-copy an openpyxl v2 style object. @@ -93,7 +76,7 @@ def _convert_to_style_kwargs(cls, style_dict): """ _style_key_map = {"borders": "border"} - style_kwargs = {} + style_kwargs: Dict[str, Serialisable] = {} for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] @@ -225,7 +208,7 @@ def _convert_to_fill(cls, fill_dict): ------- fill : openpyxl.styles.Fill """ - from openpyxl.styles import PatternFill, GradientFill + from openpyxl.styles import GradientFill, PatternFill _pattern_fill_key_map = { "patternType": "fill_type", @@ -404,7 +387,7 @@ def write_cells( # Write the frame cells using openpyxl. sheet_name = self._get_sheet_name(sheet_name) - _style_cache = {} + _style_cache: Dict[str, Dict[str, Serialisable]] = {} if sheet_name in self.sheets: wks = self.sheets[sheet_name] @@ -413,7 +396,7 @@ def write_cells( wks.title = sheet_name self.sheets[sheet_name] = wks - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): wks.freeze_panes = wks.cell( row=freeze_panes[0] + 1, column=freeze_panes[1] + 1 ) @@ -426,7 +409,7 @@ def write_cells( if fmt: xcell.number_format = fmt - style_kwargs = {} + style_kwargs: Optional[Dict[str, Serialisable]] = {} if cell.style: key = str(cell.style) style_kwargs = _style_cache.get(key) @@ -466,8 +449,12 @@ def write_cells( setattr(xcell, k, v) -class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: +class OpenpyxlReader(BaseExcelReader): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ) -> None: """ Reader using openpyxl engine. @@ -475,9 +462,11 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. 
+ storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): @@ -496,6 +485,7 @@ def close(self): # https://stackoverflow.com/questions/31416842/ # openpyxl-does-not-close-excel-workbook-in-read-only-mode self.book.close() + super().close() @property def sheet_names(self) -> List[str]: @@ -509,16 +499,17 @@ def get_sheet_by_index(self, index: int): def _convert_cell(self, cell, convert_float: bool) -> Scalar: - # TODO: replace with openpyxl constants + from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC + if cell.is_date: return cell.value - elif cell.data_type == "e": + elif cell.data_type == TYPE_ERROR: return np.nan - elif cell.data_type == "b": + elif cell.data_type == TYPE_BOOL: return bool(cell.value) elif cell.value is None: return "" # compat with xlrd - elif cell.data_type == "n": + elif cell.data_type == TYPE_NUMERIC: # GH5394 if convert_float: val = int(cell.value) diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..de4f7bba1a179 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,25 +1,31 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _PyxlsbReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer): +class PyxlsbReader(BaseExcelReader): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer: str, path object, or Workbook + filepath_or_buffer : str, path object, or Workbook Object to be parsed. + storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 285aeaf7d4c6e..47105916a9c78 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,3 +1,5 @@ +from typing import List + from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_integer, is_list_like @@ -21,7 +23,7 @@ def register_writer(klass): _writers[engine_name] = klass -def _get_default_writer(ext): +def get_default_writer(ext): """ Return the default writer for the given extension. @@ -56,7 +58,7 @@ def get_writer(engine_name): raise ValueError(f"No Excel writer '{engine_name}'") from err -def _excel2num(x): +def _excel2num(x: str) -> int: """ Convert Excel column name like 'AB' to 0-based column index. @@ -88,7 +90,7 @@ def _excel2num(x): return index - 1 -def _range2cols(areas): +def _range2cols(areas: str) -> List[int]: """ Convert comma separated list of column names and ranges to indices. 
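The column-name arithmetic behind ``_excel2num`` (whose ``_range2cols`` docstring continues in the next hunk) is plain base-26 with ``A`` as 1; a self-contained re-statement of that logic for reference:

```python
def excel2num(col: str) -> int:
    """Convert an Excel column name like 'AB' to a 0-based index."""
    index = 0
    for ch in col.upper().strip():
        code = ord(ch) - ord("A") + 1
        if not 1 <= code <= 26:
            raise ValueError(f"Invalid column name: {col}")
        index = index * 26 + code
    return index - 1

assert excel2num("A") == 0
assert excel2num("Z") == 25
assert excel2num("AB") == 27  # consistent with _range2cols('A,C,Z:AB') -> [0, 2, 25, 26, 27]
```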
@@ -109,19 +111,19 @@ def _range2cols(areas): >>> _range2cols('A,C,Z:AB') [0, 2, 25, 26, 27] """ - cols = [] + cols: List[int] = [] for rng in areas.split(","): if ":" in rng: - rng = rng.split(":") - cols.extend(range(_excel2num(rng[0]), _excel2num(rng[1]) + 1)) + rngs = rng.split(":") + cols.extend(range(_excel2num(rngs[0]), _excel2num(rngs[1]) + 1)) else: cols.append(_excel2num(rng)) return cols -def _maybe_convert_usecols(usecols): +def maybe_convert_usecols(usecols): """ Convert `usecols` into a compatible format for parsing in `parsers.py`. @@ -150,7 +152,7 @@ def _maybe_convert_usecols(usecols): return usecols -def _validate_freeze_panes(freeze_panes): +def validate_freeze_panes(freeze_panes): if freeze_panes is not None: if len(freeze_panes) == 2 and all( isinstance(item, int) for item in freeze_panes @@ -167,15 +169,7 @@ def _validate_freeze_panes(freeze_panes): return False -def _trim_excel_header(row): - # trim header row so auto-index inference works - # xlrd uses '' , openpyxl None - while len(row) > 0 and (row[0] == "" or row[0] is None): - row = row[1:] - return row - - -def _fill_mi_header(row, control_row): +def fill_mi_header(row, control_row): """ Forward fill blank entries in row but only inside the same parent index. @@ -208,7 +202,7 @@ def _fill_mi_header(row, control_row): return row, control_row -def _pop_header_name(row, index_col): +def pop_header_name(row, index_col): """ Pop the header name for MultiIndex parsing. diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..c655db4bc772b 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -2,13 +2,14 @@ import numpy as np +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): +class XlrdReader(BaseExcelReader): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. @@ -16,10 +17,12 @@ def __init__(self, filepath_or_buffer): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. + storage_options : dict, optional + passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): @@ -48,11 +51,11 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): from xlrd import ( - xldate, + XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_ERROR, - XL_CELL_BOOLEAN, XL_CELL_NUMBER, + xldate, ) epoch1904 = self.book.datemode diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 85a1bb031f457..d7bbec578d89d 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -1,14 +1,17 @@ +from typing import Dict, List, Tuple + import pandas._libs.json as json +from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._util import validate_freeze_panes class _XlsxStyler: # Map from openpyxl-oriented styles to flatter xlsxwriter representation # Ordering necessary for both determinism and because some are keyed by # prefixes of others. 
- STYLE_MAPPING = { + STYLE_MAPPING: Dict[str, List[Tuple[Tuple[str, ...], str]]] = { "font": [ (("name",), "font_name"), (("sz",), "font_size"), @@ -156,7 +159,7 @@ def convert(cls, style_dict, num_format_str=None): return props -class _XlsxWriter(ExcelWriter): +class XlsxWriter(ExcelWriter): engine = "xlsxwriter" supported_extensions = (".xlsx",) @@ -166,11 +169,12 @@ def __init__( engine=None, date_format=None, datetime_format=None, - mode="w", + mode: str = "w", + storage_options: StorageOptions = None, **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. - import xlsxwriter + from xlsxwriter import Workbook if mode == "a": raise ValueError("Append mode is not supported with xlsxwriter!") @@ -181,10 +185,11 @@ def __init__( date_format=date_format, datetime_format=datetime_format, mode=mode, + storage_options=storage_options, **engine_kwargs, ) - self.book = xlsxwriter.Workbook(path, **engine_kwargs) + self.book = Workbook(self.handles.handle, **engine_kwargs) def save(self): """ @@ -206,7 +211,7 @@ def write_cells( style_dict = {"null": None} - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): wks.freeze_panes(*(freeze_panes)) for cell in cells: diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 78efe77e9fe2d..9a725c15de61e 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -1,14 +1,28 @@ +from typing import TYPE_CHECKING, Dict + import pandas._libs.json as json +from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._util import validate_freeze_panes + +if TYPE_CHECKING: + from xlwt import XFStyle -class _XlwtWriter(ExcelWriter): +class XlwtWriter(ExcelWriter): engine = "xlwt" supported_extensions = (".xls",) - def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): + def __init__( + self, + path, + engine=None, + encoding=None, + mode: str = "w", + storage_options: StorageOptions = None, + **engine_kwargs, + ): # Use the xlwt module as the Excel writer. import xlwt @@ -17,7 +31,9 @@ def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): if mode == "a": raise ValueError("Append mode is not supported with xlwt!") - super().__init__(path, mode=mode, **engine_kwargs) + super().__init__( + path, mode=mode, storage_options=storage_options, **engine_kwargs + ) if encoding is None: encoding = "ascii" @@ -29,12 +45,13 @@ def save(self): """ Save workbook to disk. """ - return self.book.save(self.path) + if self.sheets: + # fails when the ExcelWriter is just opened and then closed + self.book.save(self.handles.handle) def write_cells( self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None ): - # Write the frame cells using xlwt. 
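The guard added to ``XlwtWriter.save`` above works around xlwt's refusal to serialize a workbook with zero sheets; a hedged sketch of the case it fixes (path hypothetical):

```python
import pandas as pd

# Previously this raised on close() because xlwt cannot save an empty
# workbook; with the guard, save() is a no-op until a sheet is written.
with pd.ExcelWriter("empty.xls", engine="xlwt") as writer:
    pass
```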
sheet_name = self._get_sheet_name(sheet_name) @@ -44,12 +61,12 @@ def write_cells( wks = self.book.add_sheet(sheet_name) self.sheets[sheet_name] = wks - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): wks.set_panes_frozen(True) wks.set_horz_split_pos(freeze_panes[0]) wks.set_vert_split_pos(freeze_panes[1]) - style_dict = {} + style_dict: Dict[str, XFStyle] = {} for cell in cells: val, fmt = self._value_with_fmt(cell.val) @@ -101,14 +118,14 @@ def _style_to_xlwt( f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = f"{(line_sep).join(it)} " + out = f"{line_sep.join(it)} " return out else: it = [ f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = f"{(field_sep).join(it)} " + out = f"{field_sep.join(it)} " return out else: item = f"{item}" diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dfa43942fc8b3..422677771b4d0 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,13 +1,24 @@ """ feather-format compat """ +from typing import AnyStr + +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc from pandas import DataFrame, Int64Index, RangeIndex +from pandas.core import generic -from pandas.io.common import get_filepath_or_buffer, stringify_path +from pandas.io.common import get_handle -def to_feather(df: DataFrame, path, **kwargs): +@doc(storage_options=generic._shared_docs["storage_options"]) +def to_feather( + df: DataFrame, + path: FilePathOrBuffer[AnyStr], + storage_options: StorageOptions = None, + **kwargs, +): """ Write a DataFrame to the binary Feather format. @@ -15,6 +26,10 @@ def to_feather(df: DataFrame, path, **kwargs): ---------- df : DataFrame path : string file path, or file-like object + {storage_options} + + .. versionadded:: 1.2.0 + **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. @@ -23,8 +38,6 @@ def to_feather(df: DataFrame, path, **kwargs): import_optional_dependency("pyarrow") from pyarrow import feather - path = stringify_path(path) - if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -61,10 +74,16 @@ def to_feather(df: DataFrame, path, **kwargs): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path, **kwargs) + with get_handle( + path, "wb", storage_options=storage_options, is_text=False + ) as handles: + feather.write_feather(df, handles.handle, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True): +@doc(storage_options=generic._shared_docs["storage_options"]) +def read_feather( + path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None +): """ Load a feather-format object from the file path. @@ -80,7 +99,7 @@ def read_feather(path, columns=None, use_threads: bool = True): ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. columns : sequence, default None If not provided, all columns are read. @@ -90,6 +109,9 @@ def read_feather(path, columns=None, use_threads: bool = True): Whether to parallelize reading using multiple threads. .. versionadded:: 0.24.0 + {storage_options} + + .. 
versionadded:: 1.2.0 Returns ------- @@ -98,12 +120,10 @@ def read_feather(path, columns=None, use_threads: bool = True): import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path) - - df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) + with get_handle( + path, "rb", storage_options=storage_options, is_text=False + ) as handles: - # s3fs only validates the credentials when the file is closed. - if should_close: - path.close() - - return df + return feather.read_feather( + handles.handle, columns=columns, use_threads=bool(use_threads) + ) diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index bed29e1fd4792..ea291bcbfa44c 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -69,21 +69,25 @@ def check_main(): return not hasattr(main, "__file__") or get_option("mode.sim_interactive") try: - return __IPYTHON__ or check_main() # noqa + # pandas\io\formats\console.py:72: error: Name '__IPYTHON__' is not + # defined [name-defined] + return __IPYTHON__ or check_main() # type: ignore[name-defined] except NameError: return check_main() def in_ipython_frontend(): """ - Check if we're inside an an IPython zmq frontend. + Check if we're inside an IPython zmq frontend. Returns ------- bool """ try: - ip = get_ipython() # noqa + # pandas\io\formats\console.py:86: error: Name 'get_ipython' is not + # defined [name-defined] + ip = get_ipython() # type: ignore[name-defined] return "zmq" in str(type(ip)).lower() except NameError: pass diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index b40d2a57b8106..8abe13db370ca 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -3,6 +3,7 @@ """ import re +from typing import Dict, Optional import warnings @@ -11,8 +12,6 @@ class CSSWarning(UserWarning): This CSS syntax cannot currently be parsed. """ - pass - def _side_expander(prop_fmt: str): def expand(self, prop, value: str): @@ -20,9 +19,7 @@ def expand(self, prop, value: str): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn( - f'Could not expand "{prop}: {value}"', CSSWarning, - ) + warnings.warn(f'Could not expand "{prop}: {value}"', CSSWarning) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] @@ -35,7 +32,64 @@ class CSSResolver: A callable for parsing and resolving CSS to atomic properties. 
""" - def __call__(self, declarations_str, inherited=None): + UNIT_RATIOS = { + "rem": ("pt", 12), + "ex": ("em", 0.5), + # 'ch': + "px": ("pt", 0.75), + "pc": ("pt", 12), + "in": ("pt", 72), + "cm": ("in", 1 / 2.54), + "mm": ("in", 1 / 25.4), + "q": ("mm", 0.25), + "!!default": ("em", 0), + } + + FONT_SIZE_RATIOS = UNIT_RATIOS.copy() + FONT_SIZE_RATIOS.update( + { + "%": ("em", 0.01), + "xx-small": ("rem", 0.5), + "x-small": ("rem", 0.625), + "small": ("rem", 0.8), + "medium": ("rem", 1), + "large": ("rem", 1.125), + "x-large": ("rem", 1.5), + "xx-large": ("rem", 2), + "smaller": ("em", 1 / 1.2), + "larger": ("em", 1.2), + "!!default": ("em", 1), + } + ) + + MARGIN_RATIOS = UNIT_RATIOS.copy() + MARGIN_RATIOS.update({"none": ("pt", 0)}) + + BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() + BORDER_WIDTH_RATIOS.update( + { + "none": ("pt", 0), + "thick": ("px", 4), + "medium": ("px", 2), + "thin": ("px", 1), + # Default: medium only if solid + } + ) + + SIDE_SHORTHANDS = { + 1: [0, 0, 0, 0], + 2: [0, 1, 0, 1], + 3: [0, 1, 2, 1], + 4: [0, 1, 2, 3], + } + + SIDES = ("top", "right", "bottom", "left") + + def __call__( + self, + declarations_str: str, + inherited: Optional[Dict[str, str]] = None, + ) -> Dict[str, str]: """ The given declarations to atomic properties. @@ -77,111 +131,88 @@ def __call__(self, declarations_str, inherited=None): if inherited is None: inherited = {} + props = self._update_initial(props, inherited) + props = self._update_font_size(props, inherited) + return self._update_other_units(props) + + def _update_initial( + self, + props: Dict[str, str], + inherited: Dict[str, str], + ) -> Dict[str, str]: # 1. resolve inherited, initial for prop, val in inherited.items(): if prop not in props: props[prop] = val - for prop, val in list(props.items()): + new_props = props.copy() + for prop, val in props.items(): if val == "inherit": val = inherited.get(prop, "initial") - if val == "initial": - val = None - if val is None: + if val in ("initial", None): # we do not define a complete initial stylesheet - del props[prop] + del new_props[prop] else: - props[prop] = val - + new_props[prop] = val + return new_props + + def _update_font_size( + self, + props: Dict[str, str], + inherited: Dict[str, str], + ) -> Dict[str, str]: # 2. resolve relative font size if props.get("font-size"): - if "font-size" in inherited: - em_pt = inherited["font-size"] - assert em_pt[-2:] == "pt" - em_pt = float(em_pt[:-2]) - else: - em_pt = None props["font-size"] = self.size_to_pt( - props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS + props["font-size"], + self._get_font_size(inherited), + conversions=self.FONT_SIZE_RATIOS, ) + return props - font_size = float(props["font-size"][:-2]) - else: - font_size = None + def _get_font_size(self, props: Dict[str, str]) -> Optional[float]: + if props.get("font-size"): + font_size_string = props["font-size"] + return self._get_float_font_size_from_pt(font_size_string) + return None + def _get_float_font_size_from_pt(self, font_size_string: str) -> float: + assert font_size_string.endswith("pt") + return float(font_size_string.rstrip("pt")) + + def _update_other_units(self, props: Dict[str, str]) -> Dict[str, str]: + font_size = self._get_font_size(props) # 3. 
TODO: resolve other font-relative units for side in self.SIDES: prop = f"border-{side}-width" if prop in props: props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS + props[prop], + em_pt=font_size, + conversions=self.BORDER_WIDTH_RATIOS, ) - for prop in [ - f"margin-{side}", - f"padding-{side}", - ]: + + for prop in [f"margin-{side}", f"padding-{side}"]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS + props[prop], + em_pt=font_size, + conversions=self.MARGIN_RATIOS, ) - return props - UNIT_RATIOS = { - "rem": ("pt", 12), - "ex": ("em", 0.5), - # 'ch': - "px": ("pt", 0.75), - "pc": ("pt", 12), - "in": ("pt", 72), - "cm": ("in", 1 / 2.54), - "mm": ("in", 1 / 25.4), - "q": ("mm", 0.25), - "!!default": ("em", 0), - } - - FONT_SIZE_RATIOS = UNIT_RATIOS.copy() - FONT_SIZE_RATIOS.update( - { - "%": ("em", 0.01), - "xx-small": ("rem", 0.5), - "x-small": ("rem", 0.625), - "small": ("rem", 0.8), - "medium": ("rem", 1), - "large": ("rem", 1.125), - "x-large": ("rem", 1.5), - "xx-large": ("rem", 2), - "smaller": ("em", 1 / 1.2), - "larger": ("em", 1.2), - "!!default": ("em", 1), - } - ) - - MARGIN_RATIOS = UNIT_RATIOS.copy() - MARGIN_RATIOS.update({"none": ("pt", 0)}) - - BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() - BORDER_WIDTH_RATIOS.update( - { - "none": ("pt", 0), - "thick": ("px", 4), - "medium": ("px", 2), - "thin": ("px", 1), - # Default: medium only if solid - } - ) - def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): warnings.warn(f"Unhandled size: {repr(in_val)}", CSSWarning) return self.size_to_pt("1!!default", conversions=conversions) - try: - val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups() - except AttributeError: + match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val) + if match is None: return _error() + + val, unit = match.groups() if val == "": # hack for 'large' etc. 
val = 1 @@ -224,14 +255,6 @@ def atomize(self, declarations): for prop, value in expand(prop, value): yield prop, value - SIDE_SHORTHANDS = { - 1: [0, 0, 0, 0], - 2: [0, 1, 0, 1], - 3: [0, 1, 2, 1], - 4: [0, 1, 2, 3], - } - SIDES = ("top", "right", "bottom", "left") - expand_border_color = _side_expander("border-{:s}-color") expand_border_style = _side_expander("border-{:s}-style") expand_border_width = _side_expander("border-{:s}-width") diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5bd51dc8351f6..6d14d6172aa6c 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,16 +3,20 @@ """ import csv as csvlib -from io import StringIO import os -from typing import Hashable, List, Mapping, Optional, Sequence, Union -import warnings -from zipfile import ZipFile +from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + FloatFormatType, + IndexLabel, + Label, + StorageOptions, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -22,179 +26,217 @@ ) from pandas.core.dtypes.missing import notna -from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, -) +from pandas.core.indexes.api import Index + +from pandas.io.common import get_handle + +if TYPE_CHECKING: + from pandas.io.formats.format import DataFrameFormatter class CSVFormatter: def __init__( self, - obj, - path_or_buf: Optional[FilePathOrBuffer[str]] = None, + formatter: "DataFrameFormatter", + path_or_buf: FilePathOrBuffer[str] = "", sep: str = ",", - na_rep: str = "", - float_format: Optional[str] = None, - cols=None, - header: Union[bool, Sequence[Hashable]] = True, - index: bool = True, - index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, + cols: Optional[Sequence[Label]] = None, + index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, - quotechar='"', + quotechar: Optional[str] = '"', date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, - decimal=".", + storage_options: StorageOptions = None, ): - self.obj = obj + self.fmt = formatter - if path_or_buf is None: - path_or_buf = StringIO() + self.obj = self.fmt.frame - # Extract compression mode as given, if dict - compression, self.compression_args = get_compression_method(compression) + self.filepath_or_buffer = path_or_buf + self.encoding = encoding + self.compression = compression + self.mode = mode + self.storage_options = storage_options - self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode - ) self.sep = sep - self.na_rep = na_rep - self.float_format = float_format - self.decimal = decimal - - self.header = header - self.index = index - self.index_label = index_label - self.mode = mode - if encoding is None: - encoding = "utf-8" - self.encoding = encoding + self.index_label = self._initialize_index_label(index_label) self.errors = errors - self.compression = infer_compression(self.path_or_buf, compression) - - if quoting is None: - quoting = 
csvlib.QUOTE_MINIMAL - self.quoting = quoting - - if quoting == csvlib.QUOTE_NONE: - # prevents crash in _csv - quotechar = None - self.quotechar = quotechar - + self.quoting = quoting or csvlib.QUOTE_MINIMAL + self.quotechar = self._initialize_quotechar(quotechar) self.doublequote = doublequote self.escapechar = escapechar - self.line_terminator = line_terminator or os.linesep - self.date_format = date_format + self.cols = self._initialize_columns(cols) + self.chunksize = self._initialize_chunksize(chunksize) + + @property + def na_rep(self) -> str: + return self.fmt.na_rep + + @property + def float_format(self) -> Optional["FloatFormatType"]: + return self.fmt.float_format + + @property + def decimal(self) -> str: + return self.fmt.decimal + + @property + def header(self) -> Union[bool, Sequence[str]]: + return self.fmt.header + + @property + def index(self) -> bool: + return self.fmt.index + + def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: + if index_label is not False: + if index_label is None: + return self._get_index_label_from_obj() + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): + # given a string for a DF with Index + return [index_label] + return index_label + + def _get_index_label_from_obj(self) -> List[str]: + if isinstance(self.obj.index, ABCMultiIndex): + return self._get_index_label_multiindex() + else: + return self._get_index_label_flat() + + def _get_index_label_multiindex(self) -> List[str]: + return [name or "" for name in self.obj.index.names] - self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) + def _get_index_label_flat(self) -> List[str]: + index_label = self.obj.index.name + return [""] if index_label is None else [index_label] + def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: + if self.quoting != csvlib.QUOTE_NONE: + # prevents crash in _csv + return quotechar + return None + + @property + def has_mi_columns(self) -> bool: + return bool(isinstance(self.obj.columns, ABCMultiIndex)) + + def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: - raise TypeError("cannot specify cols with a MultiIndex on the columns") + msg = "cannot specify cols with a MultiIndex on the columns" + raise TypeError(msg) if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types( - na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting, - ) + cols = cols._format_native_types(**self._number_format) else: cols = list(cols) self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes - # and make sure sure cols is just a list of labels - cols = self.obj.columns - if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types( - na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting, + # and make sure cols is just a list of labels + new_cols = self.obj.columns + if isinstance(new_cols, ABCIndexClass): + return new_cols._format_native_types(**self._number_format) + else: + return list(new_cols) + + def _initialize_chunksize(self, chunksize: Optional[int]) -> int: + if chunksize is None: + return (100000 // (len(self.cols) or 1)) or 1 + return int(chunksize) + + @property + def _number_format(self) -> Dict[str, Any]: + """Dictionary used for storing number formatting settings.""" + return { + "na_rep": self.na_rep, + "float_format": self.float_format, + 
"date_format": self.date_format, + "quoting": self.quoting, + "decimal": self.decimal, + } + + @property + def data_index(self) -> Index: + data_index = self.obj.index + if ( + isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and self.date_format is not None + ): + data_index = Index( + [x.strftime(self.date_format) if notna(x) else "" for x in data_index] ) + return data_index + + @property + def nlevels(self) -> int: + if self.index: + return getattr(self.data_index, "nlevels", 1) else: - cols = list(cols) + return 0 - # save it - self.cols = cols + @property + def _has_aliases(self) -> bool: + return isinstance(self.header, (tuple, list, np.ndarray, ABCIndexClass)) - # preallocate data 2d list - ncols = self.obj.shape[-1] - self.data = [None] * ncols + @property + def _need_to_save_header(self) -> bool: + return bool(self._has_aliases or self.header) - if chunksize is None: - chunksize = (100000 // (len(self.cols) or 1)) or 1 - self.chunksize = int(chunksize) + @property + def write_cols(self) -> Sequence[Label]: + if self._has_aliases: + assert not isinstance(self.header, bool) + if len(self.header) != len(self.cols): + raise ValueError( + f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" + ) + else: + return self.header + else: + return self.cols - self.data_index = obj.index - if ( - isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and date_format is not None - ): - from pandas import Index + @property + def encoded_labels(self) -> List[Label]: + encoded_labels: List[Label] = [] - self.data_index = Index( - [x.strftime(date_format) if notna(x) else "" for x in self.data_index] - ) + if self.index and self.index_label: + assert isinstance(self.index_label, Sequence) + encoded_labels = list(self.index_label) - self.nlevels = getattr(self.data_index, "nlevels", 1) - if not index: - self.nlevels = 0 + if not self.has_mi_columns or self._has_aliases: + encoded_labels += list(self.write_cols) + + return encoded_labels def save(self) -> None: """ Create the writer & save. """ - # GH21227 internal compression is not used when file-like passed. - if self.compression and hasattr(self.path_or_buf, "write"): - warnings.warn( - "compression has no effect when passing file-like object as input.", - RuntimeWarning, - stacklevel=2, - ) + # apply compression and byte/text conversion + with get_handle( + self.filepath_or_buffer, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=self.compression, + storage_options=self.storage_options, + ) as handles: - # when zip compression is called. - is_zip = isinstance(self.path_or_buf, ZipFile) or ( - not hasattr(self.path_or_buf, "write") and self.compression == "zip" - ) - - if is_zip: - # zipfile doesn't support writing string to archive. uses string - # buffer to receive csv writing and dump into zip compression - # file handle. 
GH21241, GH21118 - f = StringIO() - close = False - elif hasattr(self.path_or_buf, "write"): - f = self.path_or_buf - close = False - else: - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=dict(self.compression_args, method=self.compression), - ) - close = True - - try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( - f, + handles.handle, # type: ignore[arg-type] lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, @@ -205,158 +247,56 @@ def save(self) -> None: self._save() - finally: - if is_zip: - # GH17778 handles zip compression separately. - buf = f.getvalue() - if hasattr(self.path_or_buf, "write"): - self.path_or_buf.write(buf) - else: - compression = dict(self.compression_args, method=self.compression) - - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=compression, - ) - f.write(buf) - close = True - if close: - f.close() - for _fh in handles: - _fh.close() - elif self.should_close: - f.close() - - def _save_header(self): - writer = self.writer - obj = self.obj - index_label = self.index_label - cols = self.cols - has_mi_columns = self.has_mi_columns - header = self.header - encoded_labels: List[str] = [] - - has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) - if not (has_aliases or self.header): - return - if has_aliases: - if len(header) != len(cols): - raise ValueError( - f"Writing {len(cols)} cols but got {len(header)} aliases" - ) - else: - write_cols = header - else: - write_cols = cols - - if self.index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(obj.index, ABCMultiIndex): - index_label = [] - for i, name in enumerate(obj.index.names): - if name is None: - name = "" - index_label.append(name) - else: - index_label = obj.index.name - if index_label is None: - index_label = [""] - else: - index_label = [index_label] - elif not isinstance( - index_label, (list, tuple, np.ndarray, ABCIndexClass) - ): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] - - if not has_mi_columns or has_aliases: - encoded_labels += list(write_cols) - writer.writerow(encoded_labels) - else: - # write out the mi - columns = obj.columns - - # write out the names for each level, then ALL of the values for - # each level - for i in range(columns.nlevels): - - # we need at least 1 index column to write our col names - col_line = [] - if self.index: - - # name is the first column - col_line.append(columns.names[i]) - - if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([""] * (len(index_label) - 1)) - - col_line.extend(columns._get_level_values(i)) - - writer.writerow(col_line) - - # Write out the index line if it's not empty. - # Otherwise, we will print out an extraneous - # blank line between the mi and the data rows. 
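# Illustration (a sketch, not part of the change itself): with output funneled
# through a single get_handle() call, compressed and remote CSV writing follow
# the same code path. File names below are arbitrary assumptions.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.5]})

# zip output no longer needs the ZipFile/StringIO special-casing; the archive
# member name can be chosen via the compression dict:
df.to_csv("out.csv.zip", compression={"method": "zip", "archive_name": "out.csv"})

# storage_options is forwarded to fsspec for remote paths (e.g. credentials):
# df.to_csv("s3://bucket/out.csv", storage_options={"anon": True})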
- if encoded_labels and set(encoded_labels) != {""}: - encoded_labels.extend([""] * len(columns)) - writer.writerow(encoded_labels) - def _save(self) -> None: - self._save_header() + if self._need_to_save_header: + self._save_header() + self._save_body() + def _save_header(self) -> None: + if not self.has_mi_columns or self._has_aliases: + self.writer.writerow(self.encoded_labels) + else: + for row in self._generate_multiindex_header_rows(): + self.writer.writerow(row) + + def _generate_multiindex_header_rows(self) -> Iterator[List[Label]]: + columns = self.obj.columns + for i in range(columns.nlevels): + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + # name is the first column + col_line.append(columns.names[i]) + + if isinstance(self.index_label, list) and len(self.index_label) > 1: + col_line.extend([""] * (len(self.index_label) - 1)) + + col_line.extend(columns._get_level_values(i)) + yield col_line + + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. + if self.encoded_labels and set(self.encoded_labels) != {""}: + yield self.encoded_labels + [""] * len(columns) + + def _save_body(self) -> None: nrows = len(self.data_index) - - # write in chunksize bites - chunksize = self.chunksize - chunks = int(nrows / chunksize) + 1 - + chunks = int(nrows / self.chunksize) + 1 for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, nrows) + start_i = i * self.chunksize + end_i = min(start_i + self.chunksize, nrows) if start_i >= end_i: break - self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: - data_index = self.data_index - # create the data for a chunk slicer = slice(start_i, end_i) - df = self.obj.iloc[slicer] - blocks = df._mgr.blocks - - for i in range(len(blocks)): - b = blocks[i] - d = b.to_native_types( - na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting, - ) - - for col_loc, col in zip(b.mgr_locs, d): - # self.data is a preallocated list - self.data[col_loc] = col - ix = data_index.to_native_types( - slicer=slicer, - na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting, - ) + res = df._mgr.to_native_types(**self._number_format) + data = [res.iget_values(i) for i in range(len(res.items))] - libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) + ix = self.data_index[slicer]._format_native_types(**self._number_format) + libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index bf4586a4b5b96..be8f2de1d53fb 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,21 +5,22 @@ from functools import reduce import itertools import re -from typing import Callable, Dict, Optional, Sequence, Union +from typing import Callable, Dict, Iterable, Mapping, Optional, Sequence, Union, cast import warnings import numpy as np -from pandas._typing import Label +from pandas._libs.lib import is_list_like +from pandas._typing import Label, StorageOptions +from pandas.util._decorators import doc from pandas.core.dtypes import missing from pandas.core.dtypes.common import is_float, is_scalar -from pandas.core.dtypes.generic import ABCIndex from pandas import DataFrame, Index, MultiIndex, PeriodIndex +from pandas.core 
import generic import pandas.core.common as com -from pandas.io.common import stringify_path from pandas.io.formats.css import CSSResolver, CSSWarning from pandas.io.formats.format import get_level_lengths from pandas.io.formats.printing import pprint_thing @@ -30,7 +31,13 @@ class ExcelCell: __slots__ = __fields__ def __init__( - self, row: int, col: int, val, style=None, mergestart=None, mergeend=None + self, + row: int, + col: int, + val, + style=None, + mergestart: Optional[int] = None, + mergeend: Optional[int] = None, ): self.row = row self.col = col @@ -58,16 +65,79 @@ class CSSToExcelConverter: CSS processed by :meth:`__call__`. """ + NAMED_COLORS = { + "maroon": "800000", + "brown": "A52A2A", + "red": "FF0000", + "pink": "FFC0CB", + "orange": "FFA500", + "yellow": "FFFF00", + "olive": "808000", + "green": "008000", + "purple": "800080", + "fuchsia": "FF00FF", + "lime": "00FF00", + "teal": "008080", + "aqua": "00FFFF", + "blue": "0000FF", + "navy": "000080", + "black": "000000", + "gray": "808080", + "grey": "808080", + "silver": "C0C0C0", + "white": "FFFFFF", + } + + VERTICAL_MAP = { + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", + # OpenXML also has 'justify', 'distributed' + } + + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + + ITALIC_MAP = { + "normal": False, + "italic": True, + "oblique": True, + } + + FAMILY_MAP = { + "serif": 1, # roman + "sans-serif": 2, # swiss + "cursive": 4, # script + "fantasy": 5, # decorative + } + # NB: Most of the methods here could be classmethods, as only __init__ # and __call__ make use of instance attributes. We leave them as # instancemethods so that users can easily experiment with extensions # without monkey-patching. 
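# Illustration: a rough end-to-end call of CSSToExcelConverter using the lookup
# tables gathered above. A hedged sketch; the exact output dict can vary by
# version, and only the non-None entries survive remove_none().
from pandas.io.formats.excel import CSSToExcelConverter

converter = CSSToExcelConverter()
style = converter("font-weight: bold; color: #f00; text-align: center")

# roughly: style["font"]      == {"bold": True, "color": "FF0000"}
#          style["alignment"] == {"horizontal": "center"}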
+ inherited: Optional[Dict[str, str]] def __init__(self, inherited: Optional[str] = None): if inherited is not None: - inherited = self.compute_css(inherited) - - self.inherited = inherited + self.inherited = self.compute_css(inherited) + else: + self.inherited = None compute_css = CSSResolver() @@ -91,7 +161,7 @@ def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: + def build_xlstyle(self, props: Mapping[str, str]) -> Dict[str, Dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -115,29 +185,30 @@ def remove_none(d: Dict[str, str]) -> None: remove_none(out) return out - VERTICAL_MAP = { - "top": "top", - "text-top": "top", - "middle": "center", - "baseline": "bottom", - "bottom": "bottom", - "text-bottom": "bottom", - # OpenXML also has 'justify', 'distributed' - } - - def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: + def build_alignment( + self, props: Mapping[str, str] + ) -> Dict[str, Optional[Union[bool, str]]]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), - "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")), - "wrap_text": ( - None - if props.get("white-space") is None - else props["white-space"] not in ("nowrap", "pre", "pre-line") - ), + "vertical": self._get_vertical_alignment(props), + "wrap_text": self._get_is_wrap_text(props), } - def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: + def _get_vertical_alignment(self, props: Mapping[str, str]) -> Optional[str]: + vertical_align = props.get("vertical-align") + if vertical_align: + return self.VERTICAL_MAP.get(vertical_align) + return None + + def _get_is_wrap_text(self, props: Mapping[str, str]) -> Optional[bool]: + if props.get("white-space") is None: + return None + return bool(props["white-space"] not in ("nowrap", "pre", "pre-line")) + + def build_border( + self, props: Mapping[str, str] + ) -> Dict[str, Dict[str, Optional[str]]]: return { side: { "style": self._border_style( @@ -149,7 +220,7 @@ def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style: Optional[str], width): + def _border_style(self, style: Optional[str], width: Optional[str]): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -169,26 +240,16 @@ def _border_style(self, style: Optional[str], width): if style == "none" or style == "hidden": return None - if width is None: - width = "2pt" - width = float(width[:-2]) - if width < 1e-5: + width_name = self._get_width_name(width) + if width_name is None: return None - elif width < 1.3: - width_name = "thin" - elif width < 2.8: - width_name = "medium" - else: - width_name = "thick" - if style in (None, "groove", "ridge", "inset", "outset"): + if style in (None, "groove", "ridge", "inset", "outset", "solid"): # not handled - style = "solid" + return width_name if style == "double": return "double" - if style == "solid": - return width_name if style == "dotted": if width_name in ("hair", "thin"): return "dotted" @@ -198,36 +259,89 @@ def _border_style(self, style: Optional[str], width): return "dashed" return "mediumDashed" - def build_fill(self, props: Dict[str, str]): + def _get_width_name(self, width_input: Optional[str]) -> Optional[str]: + 
width = self._width_to_float(width_input) + if width < 1e-5: + return None + elif width < 1.3: + return "thin" + elif width < 2.8: + return "medium" + return "thick" + + def _width_to_float(self, width: Optional[str]) -> float: + if width is None: + width = "2pt" + return self._pt_to_float(width) + + def _pt_to_float(self, pt_string: str) -> float: + assert pt_string.endswith("pt") + return float(pt_string.rstrip("pt")) + + def build_fill(self, props: Mapping[str, str]): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type fill_color = props.get("background-color") if fill_color not in (None, "transparent", "none"): return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} - BOLD_MAP = { - "bold": True, - "bolder": True, - "600": True, - "700": True, - "800": True, - "900": True, - "normal": False, - "lighter": False, - "100": False, - "200": False, - "300": False, - "400": False, - "500": False, - } - ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} + def build_number_format(self, props: Mapping[str, str]) -> Dict[str, Optional[str]]: + return {"format_code": props.get("number-format")} - def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: - size = props.get("font-size") - if size is not None: - assert size.endswith("pt") - size = float(size[:-2]) + def build_font( + self, props: Mapping[str, str] + ) -> Dict[str, Optional[Union[bool, int, float, str]]]: + font_names = self._get_font_names(props) + decoration = self._get_decoration(props) + return { + "name": font_names[0] if font_names else None, + "family": self._select_font_family(font_names), + "size": self._get_font_size(props), + "bold": self._get_is_bold(props), + "italic": self._get_is_italic(props), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), + # shadow if nonzero digit before shadow color + "shadow": self._get_shadow(props), + # FIXME: dont leave commented-out + # 'vertAlign':, + # 'charset': , + # 'scheme': , + # 'outline': , + # 'condense': , + } + + def _get_is_bold(self, props: Mapping[str, str]) -> Optional[bool]: + weight = props.get("font-weight") + if weight: + return self.BOLD_MAP.get(weight) + return None + + def _get_is_italic(self, props: Mapping[str, str]) -> Optional[bool]: + font_style = props.get("font-style") + if font_style: + return self.ITALIC_MAP.get(font_style) + return None + def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]: + decoration = props.get("text-decoration") + if decoration is not None: + return decoration.split() + else: + return () + + def _get_underline(self, decoration: Sequence[str]) -> Optional[str]: + if "underline" in decoration: + return "single" + return None + + def _get_shadow(self, props: Mapping[str, str]) -> Optional[bool]: + if "text-shadow" in props: + return bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + return None + + def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]: font_names_tmp = re.findall( r"""(?x) ( @@ -240,6 +354,7 @@ def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: """, props.get("font-family", ""), ) + font_names = [] for name in font_names_tmp: if name[:1] == '"': @@ -250,88 +365,58 @@ def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: name = name.strip() if name: font_names.append(name) + return font_names + + def _get_font_size(self, props: 
Mapping[str, str]) -> Optional[float]: + size = props.get("font-size") + if size is None: + return size + return self._pt_to_float(size) + def _select_font_family(self, font_names) -> Optional[int]: family = None for name in font_names: - if name == "serif": - family = 1 # roman - break - elif name == "sans-serif": - family = 2 # swiss - break - elif name == "cursive": - family = 4 # script - break - elif name == "fantasy": - family = 5 # decorative + family = self.FAMILY_MAP.get(name) + if family: break - decoration = props.get("text-decoration") - if decoration is not None: - decoration = decoration.split() - else: - decoration = () + return family - return { - "name": font_names[0] if font_names else None, - "family": family, - "size": size, - "bold": self.BOLD_MAP.get(props.get("font-weight")), - "italic": self.ITALIC_MAP.get(props.get("font-style")), - "underline": ("single" if "underline" in decoration else None), - "strike": ("line-through" in decoration) or None, - "color": self.color_to_excel(props.get("color")), - # shadow if nonzero digit before shadow color - "shadow": ( - bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) - if "text-shadow" in props - else None - ), - # FIXME: dont leave commented-out - # 'vertAlign':, - # 'charset': , - # 'scheme': , - # 'outline': , - # 'condense': , - } - - NAMED_COLORS = { - "maroon": "800000", - "brown": "A52A2A", - "red": "FF0000", - "pink": "FFC0CB", - "orange": "FFA500", - "yellow": "FFFF00", - "olive": "808000", - "green": "008000", - "purple": "800080", - "fuchsia": "FF00FF", - "lime": "00FF00", - "teal": "008080", - "aqua": "00FFFF", - "blue": "0000FF", - "navy": "000080", - "black": "000000", - "gray": "808080", - "grey": "808080", - "silver": "C0C0C0", - "white": "FFFFFF", - } - - def color_to_excel(self, val: Optional[str]): + def color_to_excel(self, val: Optional[str]) -> Optional[str]: if val is None: return None - if val.startswith("#") and len(val) == 7: - return val[1:].upper() - if val.startswith("#") and len(val) == 4: - return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper() + + if self._is_hex_color(val): + return self._convert_hex_to_excel(val) + try: return self.NAMED_COLORS[val] except KeyError: warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) + return None - def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: - return {"format_code": props.get("number-format")} + def _is_hex_color(self, color_string: str) -> bool: + return bool(color_string.startswith("#")) + + def _convert_hex_to_excel(self, color_string: str) -> str: + code = color_string.lstrip("#") + if self._is_shorthand_color(color_string): + return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper() + else: + return code.upper() + + def _is_shorthand_color(self, color_string: str) -> bool: + """Check if color code is shorthand. + + #FFF is a shorthand as opposed to full #FFFFFF. + """ + code = color_string.lstrip("#") + if len(code) == 3: + return True + elif len(code) == 6: + return False + else: + raise ValueError(f"Unexpected color {color_string}") class ExcelFormatter: @@ -346,7 +431,7 @@ class ExcelFormatter: Format string for floating point numbers cols : sequence, optional Columns to write - header : boolean or list of string, default True + header : boolean or sequence of str, default True Write out column names. 
If a list of strings is given, it is assumed to be aliases for the column names index : boolean, default True @@ -397,10 +482,10 @@ def __init__( if cols is not None: # all missing, raise - if not len(Index(cols) & df.columns): + if not len(Index(cols).intersection(df.columns)): raise KeyError("passed columns are not ALL present in dataframe") - if len(Index(cols) & df.columns) != len(cols): + if len(Index(cols).intersection(df.columns)) != len(cols): # Deprecated in GH#17295, enforced in 1.0.0 raise KeyError("Not all names specified in 'columns' are found") @@ -445,7 +530,7 @@ def _format_value(self, val): ) return val - def _format_header_mi(self): + def _format_header_mi(self) -> Iterable[ExcelCell]: if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( @@ -453,8 +538,7 @@ def _format_header_mi(self): "index ('index'=False) is not yet implemented." ) - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if not (has_aliases or self.header): + if not (self._has_aliases or self.header): return columns = self.columns @@ -470,28 +554,30 @@ def _format_header_mi(self): if self.merge_cells: # Format multi-index as merged cells. - for lnum in range(len(level_lengths)): - name = columns.names[lnum] - yield ExcelCell(lnum, coloffset, name, self.header_style) + for lnum, name in enumerate(columns.names): + yield ExcelCell( + row=lnum, + col=coloffset, + val=name, + style=self.header_style, + ) for lnum, (spans, levels, level_codes) in enumerate( zip(level_lengths, columns.levels, columns.codes) ): values = levels.take(level_codes) - for i in spans: - if spans[i] > 1: - yield ExcelCell( - lnum, - coloffset + i + 1, - values[i], - self.header_style, - lnum, - coloffset + i + spans[i], - ) - else: - yield ExcelCell( - lnum, coloffset + i + 1, values[i], self.header_style - ) + for i, span_val in spans.items(): + spans_multiple_cells = span_val > 1 + yield ExcelCell( + row=lnum, + col=coloffset + i + 1, + val=values[i], + style=self.header_style, + mergestart=lnum if spans_multiple_cells else None, + mergeend=( + coloffset + i + span_val if spans_multiple_cells else None + ), + ) else: # Format in legacy format with dots to indicate levels. 
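# Illustration: the switch to Index.intersection in this file avoids the
# deprecated set-operation meaning of "&" on Index objects. A minimal sketch
# with made-up column names:
import pandas as pd

requested = pd.Index(["a", "b", "x"])
present = pd.Index(["a", "b", "c"])

common = requested.intersection(present)  # Index(['a', 'b'], dtype='object')
assert len(common) != len(requested)      # not all requested columns exist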
for i, values in enumerate(zip(*level_strs)): @@ -500,9 +586,8 @@ def _format_header_mi(self): self.rowcounter = lnum - def _format_header_regular(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if has_aliases or self.header: + def _format_header_regular(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: coloffset = 0 if self.index: @@ -511,11 +596,12 @@ def _format_header_regular(self): coloffset = len(self.df.index[0]) colnames = self.columns - if has_aliases: + if self._has_aliases: + self.header = cast(Sequence, self.header) if len(self.header) != len(self.columns): raise ValueError( - f"Writing {len(self.columns)} cols but got {len(self.header)} " - "aliases" + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" ) else: colnames = self.header @@ -525,7 +611,7 @@ def _format_header_regular(self): self.rowcounter, colindex + coloffset, colname, self.header_style ) - def _format_header(self): + def _format_header(self) -> Iterable[ExcelCell]: if isinstance(self.columns, MultiIndex): gen = self._format_header_mi() else: @@ -537,22 +623,24 @@ def _format_header(self): "" ] * len(self.columns) if reduce(lambda x, y: x and y, map(lambda x: x != "", row)): - gen2 = ( + # pandas\io\formats\excel.py:618: error: Incompatible types in + # assignment (expression has type "Generator[ExcelCell, None, + # None]", variable has type "Tuple[]") [assignment] + gen2 = ( # type: ignore[assignment] ExcelCell(self.rowcounter, colindex, val, self.header_style) for colindex, val in enumerate(row) ) self.rowcounter += 1 return itertools.chain(gen, gen2) - def _format_body(self): + def _format_body(self) -> Iterable[ExcelCell]: if isinstance(self.df.index, MultiIndex): return self._format_hierarchical_rows() else: return self._format_regular_rows() - def _format_regular_rows(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if has_aliases or self.header: + def _format_regular_rows(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: self.rowcounter += 1 # output index and index_label? 
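# Illustration: the alias-length validation shared by the CSV and Excel writers
# surfaces through the public ``header`` argument. A sketch; the file name and
# an installed engine (e.g. openpyxl) are assumptions:
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2]})

df.to_excel("out.xlsx", header=["first", "second"])  # ok: one alias per column
# df.to_excel("out.xlsx", header=["only_one"])
# -> ValueError: Writing 2 cols but got 1 aliases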
@@ -587,12 +675,10 @@ def _format_regular_rows(self): else: coloffset = 0 - for cell in self._generate_body(coloffset): - yield cell + yield from self._generate_body(coloffset) - def _format_hierarchical_rows(self): - has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) - if has_aliases or self.header: + def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: + if self._has_aliases or self.header: self.rowcounter += 1 gcolidx = 0 @@ -630,26 +716,25 @@ def _format_hierarchical_rows(self): ): values = levels.take( - level_codes, allow_fill=levels._can_hold_na, fill_value=True + level_codes, + allow_fill=levels._can_hold_na, + fill_value=levels._na_value, ) - for i in spans: - if spans[i] > 1: - yield ExcelCell( - self.rowcounter + i, - gcolidx, - values[i], - self.header_style, - self.rowcounter + i + spans[i] - 1, - gcolidx, - ) - else: - yield ExcelCell( - self.rowcounter + i, - gcolidx, - values[i], - self.header_style, - ) + for i, span_val in spans.items(): + spans_multiple_cells = span_val > 1 + yield ExcelCell( + row=self.rowcounter + i, + col=gcolidx, + val=values[i], + style=self.header_style, + mergestart=( + self.rowcounter + i + span_val - 1 + if spans_multiple_cells + else None + ), + mergeend=gcolidx if spans_multiple_cells else None, + ) gcolidx += 1 else: @@ -657,17 +742,21 @@ def _format_hierarchical_rows(self): for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): yield ExcelCell( - self.rowcounter + idx, - gcolidx, - indexcolval, - self.header_style, + row=self.rowcounter + idx, + col=gcolidx, + val=indexcolval, + style=self.header_style, ) gcolidx += 1 - for cell in self._generate_body(gcolidx): - yield cell + yield from self._generate_body(gcolidx) - def _generate_body(self, coloffset: int): + @property + def _has_aliases(self) -> bool: + """Whether the aliases for column names are present.""" + return is_list_like(self.header) + + def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: if self.styler is None: styles = None else: @@ -684,11 +773,12 @@ def _generate_body(self, coloffset: int): xlstyle = self.style_converter(";".join(styles[i, colidx])) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) - def get_formatted_cells(self): + def get_formatted_cells(self) -> Iterable[ExcelCell]: for cell in itertools.chain(self._format_header(), self._format_body()): cell.val = self._format_value(cell.val) yield cell + @doc(storage_options=generic._shared_docs["storage_options"]) def write( self, writer, @@ -697,9 +787,10 @@ def write( startcol=0, freeze_panes=None, engine=None, + storage_options: StorageOptions = None, ): """ - writer : string or ExcelWriter object + writer : path-like, file-like, or ExcelWriter object File path or existing ExcelWriter sheet_name : string, default 'Sheet1' Name of sheet which will contain DataFrame @@ -714,6 +805,16 @@ def write( write engine to use if writer is a path - you can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``. + + .. deprecated:: 1.2.0 + + As the `xlwt <https://pypi.org/project/xlwt/>`__ package is no longer + maintained, the ``xlwt`` engine will be removed in a future + version of pandas. + + {storage_options} + + .. 
versionadded:: 1.2.0 """ from pandas.io.excel import ExcelWriter @@ -724,19 +825,27 @@ def write( f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) + formatted_cells = self.get_formatted_cells() if isinstance(writer, ExcelWriter): need_save = False else: - writer = ExcelWriter(stringify_path(writer), engine=engine) + # pandas\io\formats\excel.py:808: error: Cannot instantiate + # abstract class 'ExcelWriter' with abstract attributes 'engine', + # 'save', 'supported_extensions' and 'write_cells' [abstract] + writer = ExcelWriter( # type: ignore[abstract] + writer, engine=engine, storage_options=storage_options + ) need_save = True - formatted_cells = self.get_formatted_cells() - writer.write_cells( - formatted_cells, - sheet_name, - startrow=startrow, - startcol=startcol, - freeze_panes=freeze_panes, - ) - if need_save: - writer.save() + try: + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) + finally: + # make sure to close opened file handles + if need_save: + writer.close() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 27df014620f56..db34b882a3c35 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,7 +5,6 @@ from contextlib import contextmanager from csv import QUOTE_NONE, QUOTE_NONNUMERIC -from datetime import tzinfo import decimal from functools import partial from io import StringIO @@ -36,11 +35,17 @@ from pandas._libs import lib from pandas._libs.missing import NA -from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType -from pandas._typing import FilePathOrBuffer, Label -from pandas.errors import AbstractMethodError +from pandas._typing import ( + ArrayLike, + CompressionOptions, + FilePathOrBuffer, + FloatFormatType, + IndexLabel, + Label, + StorageOptions, +) from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -67,20 +72,21 @@ from pandas.core.indexes.api import Index, MultiIndex, PeriodIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.reshape.concat import concat from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: - from pandas import Series, DataFrame, Categorical + from pandas import Categorical, DataFrame, Series + FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] -FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ - str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]], + str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] ] common_docstring = """ @@ -97,7 +103,7 @@ index : bool, optional, default True Whether to print index (row) labels. na_rep : str, optional, default 'NaN' - String representation of NAN to use. + String representation of ``NaN`` to use. formatters : list, tuple or dict of one-param. functions, optional Formatter functions to apply to columns' elements by position or name. @@ -105,7 +111,12 @@ List/tuple must be of length equal to the number of columns. float_format : one-parameter function, optional, default None Formatter function to apply to columns' elements if they are - floats. The result of this function must be a unicode string. + floats. 
This function must return a unicode string and will be + applied only to the non-``NaN`` elements, with ``NaN`` being + handled by ``na_rep``. + + .. versionchanged:: 1.2.0 + sparsify : bool, optional, default True Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. @@ -256,22 +267,20 @@ def __init__( float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = _get_adjustment() + self.adj = get_adjustment() self._chk_truncate() def _chk_truncate(self) -> None: - from pandas.core.reshape.concat import concat - self.tr_row_num: Optional[int] min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows # used below by min_rows - truncate_v = max_rows and (len(self.series) > max_rows) + is_truncated_vertically = max_rows and (len(self.series) > max_rows) series = self.series - if truncate_v: + if is_truncated_vertically: max_rows = cast(int, max_rows) if min_rows: # if min_rows is set (not None or 0), set max_rows to minimum @@ -287,7 +296,7 @@ def _chk_truncate(self) -> None: else: self.tr_row_num = None self.tr_series = series - self.truncate_v = truncate_v + self.is_truncated_vertically = is_truncated_vertically def _get_footer(self) -> str: name = self.series.name @@ -306,7 +315,9 @@ def _get_footer(self) -> str: series_name = pprint_thing(name, escape_chars=("\t", "\r", "\n")) footer += f"Name: {series_name}" - if self.length is True or (self.length == "truncate" and self.truncate_v): + if self.length is True or ( + self.length == "truncate" and self.is_truncated_vertically + ): if footer: footer += ", " footer += f"Length: {len(self.series)}" @@ -330,9 +341,8 @@ def _get_footer(self) -> str: def _get_formatted_index(self) -> Tuple[List[str], bool]: index = self.tr_series.index - is_multi = isinstance(index, MultiIndex) - if is_multi: + if isinstance(index, MultiIndex): have_header = any(name for name in index.names) fmt_index = index.format(names=True) else: @@ -346,6 +356,7 @@ def _get_formatted_values(self) -> List[str]: None, float_format=self.float_format, na_rep=self.na_rep, + leading_space=self.index, ) def to_string(self) -> str: @@ -358,7 +369,7 @@ def to_string(self) -> str: fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() - if self.truncate_v: + if self.is_truncated_vertically: n_header_rows = 0 row_num = self.tr_row_num row_num = cast(int, row_num) @@ -440,7 +451,7 @@ def _get_pad(t): return [x.rjust(_get_pad(x)) for x in texts] -def _get_adjustment() -> TextAdjustment: +def get_adjustment() -> TextAdjustment: use_east_asian_width = get_option("display.unicode.east_asian_width") if use_east_asian_width: return EastAsianTextAdjustment() @@ -448,97 +459,12 @@ def _get_adjustment() -> TextAdjustment: return TextAdjustment() -class TableFormatter: - - show_dimensions: Union[bool, str] - is_truncated: bool - formatters: FormattersType - columns: Index - - @property - def should_show_dimensions(self) -> bool: - return self.show_dimensions is True or ( - self.show_dimensions == "truncate" and self.is_truncated - ) - - def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: - if isinstance(self.formatters, (list, tuple)): - if is_integer(i): - i = cast(int, i) - return self.formatters[i] - else: - return None - else: - if is_integer(i) and i not in self.columns: - i = self.columns[i] - return self.formatters.get(i, None) - - @contextmanager - def get_buffer( - self, buf: 
Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None - ): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. - """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() - - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") - - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. - # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") - - def write_result(self, buf: IO[str]) -> None: - """ - Write the result of serialization to buf. - """ - raise AbstractMethodError(self) - - def get_result( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - """ - Perform serialization. Write to buf or return as string if buf is None. - """ - with self.get_buffer(buf, encoding=encoding) as f: - self.write_result(buf=f) - if buf is None: - return f.getvalue() - return None - - -class DataFrameFormatter(TableFormatter): - """ - Render a DataFrame - - self.to_string() : console-friendly tabular output - self.to_html() : html table - self.to_latex() : LaTeX tabular environment table - - """ +class DataFrameFormatter: + """Class for processing dataframe formatting options and data.""" __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring - col_space: ColspaceType - def __init__( self, frame: "DataFrame", @@ -552,404 +478,322 @@ def __init__( float_format: Optional[FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, - line_width: Optional[int] = None, max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, show_dimensions: Union[bool, str] = False, decimal: str = ".", - table_id: Optional[str] = None, - render_links: bool = False, bold_rows: bool = False, escape: bool = True, ): self.frame = frame + self.columns = self._initialize_columns(columns) + self.col_space = self._initialize_colspace(col_space) + self.header = header + self.index = index + self.na_rep = na_rep + self.formatters = self._initialize_formatters(formatters) + self.justify = self._initialize_justify(justify) + self.float_format = float_format + self.sparsify = self._initialize_sparsify(sparsify) self.show_index_names = index_names + self.decimal = decimal + self.bold_rows = bold_rows + self.escape = escape + self.max_rows = max_rows + self.min_rows = min_rows + self.max_cols = max_cols + self.show_dimensions = show_dimensions - if sparsify is None: - sparsify = get_option("display.multi_sparse") + self.max_cols_fitted = self._calc_max_cols_fitted() + self.max_rows_fitted = self._calc_max_rows_fitted() - self.sparsify = sparsify + self.tr_frame = self.frame + self.truncate() + self.adj = get_adjustment() - self.float_format = float_format + def get_strcols(self) -> List[List[str]]: + """ + Render a DataFrame to a list of columns (as lists of strings). 
+ """ + strcols = self._get_strcols_without_index() + + if self.index: + str_index = self._get_formatted_index(self.tr_frame) + strcols.insert(0, str_index) + + return strcols + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + @property + def is_truncated(self) -> bool: + return bool(self.is_truncated_horizontally or self.is_truncated_vertically) + + @property + def is_truncated_horizontally(self) -> bool: + return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) + + @property + def is_truncated_vertically(self) -> bool: + return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) + + @property + def dimensions_info(self) -> str: + return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + + @property + def has_index_names(self) -> bool: + return _has_names(self.frame.index) + + @property + def has_column_names(self) -> bool: + return _has_names(self.frame.columns) + + @property + def show_row_idx_names(self) -> bool: + return all((self.has_index_names, self.index, self.show_index_names)) + + @property + def show_col_idx_names(self) -> bool: + return all((self.has_column_names, self.show_index_names, self.header)) + + @property + def max_rows_displayed(self) -> int: + return min(self.max_rows or len(self.frame), len(self.frame)) + + def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: + if sparsify is None: + return get_option("display.multi_sparse") + return sparsify + + def _initialize_formatters( + self, formatters: Optional[FormattersType] + ) -> FormattersType: if formatters is None: - self.formatters = {} - elif len(frame.columns) == len(formatters) or isinstance(formatters, dict): - self.formatters = formatters + return {} + elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict): + return formatters else: raise ValueError( f"Formatters length({len(formatters)}) should match " - f"DataFrame number of columns({len(frame.columns)})" + f"DataFrame number of columns({len(self.frame.columns)})" ) - self.na_rep = na_rep - self.decimal = decimal + + def _initialize_justify(self, justify: Optional[str]) -> str: + if justify is None: + return get_option("display.colheader_justify") + else: + return justify + + def _initialize_columns(self, columns: Optional[Sequence[str]]) -> Index: + if columns is not None: + cols = ensure_index(columns) + self.frame = self.frame[cols] + return cols + else: + return self.frame.columns + + def _initialize_colspace( + self, col_space: Optional[ColspaceArgType] + ) -> ColspaceType: + result: ColspaceType + if col_space is None: - self.col_space = {} + result = {} elif isinstance(col_space, (int, str)): - self.col_space = {"": col_space} - self.col_space.update({column: col_space for column in self.frame.columns}) + result = {"": col_space} + result.update({column: col_space for column in self.frame.columns}) elif isinstance(col_space, Mapping): for column in col_space.keys(): if column not in self.frame.columns and column != "": raise ValueError( f"Col_space is defined for an unknown column: {column}" ) - self.col_space = col_space + result = col_space else: - if len(frame.columns) != len(col_space): + if len(self.frame.columns) != len(col_space): raise ValueError( f"Col_space length({len(col_space)}) should match " - f"DataFrame number of columns({len(frame.columns)})" + f"DataFrame number of columns({len(self.frame.columns)})" ) - self.col_space 
= dict(zip(self.frame.columns, col_space)) - - self.header = header - self.index = index - self.line_width = line_width - self.max_rows = max_rows - self.min_rows = min_rows - self.max_cols = max_cols - self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) - self.show_dimensions = show_dimensions - self.table_id = table_id - self.render_links = render_links - - if justify is None: - self.justify = get_option("display.colheader_justify") - else: - self.justify = justify + result = dict(zip(self.frame.columns, col_space)) + return result - self.bold_rows = bold_rows - self.escape = escape + def _calc_max_cols_fitted(self) -> Optional[int]: + """Number of columns fitting the screen.""" + if not self._is_in_terminal(): + return self.max_cols - if columns is not None: - self.columns = ensure_index(columns) - self.frame = self.frame[self.columns] + width, _ = get_terminal_size() + if self._is_screen_narrow(width): + return width else: - self.columns = frame.columns - - self._chk_truncate() - self.adj = _get_adjustment() + return self.max_cols - def _chk_truncate(self) -> None: - """ - Checks whether the frame should be truncated. If so, slices - the frame up. - """ - from pandas.core.reshape.concat import concat + def _calc_max_rows_fitted(self) -> Optional[int]: + """Number of rows with data fitting the screen.""" + max_rows: Optional[int] - # Cut the data to the information actually printed - max_cols = self.max_cols - max_rows = self.max_rows - self.max_rows_adj: Optional[int] - max_rows_adj: Optional[int] - - if max_cols == 0 or max_rows == 0: # assume we are in the terminal - (w, h) = get_terminal_size() - self.w = w - self.h = h + if self._is_in_terminal(): + _, height = get_terminal_size() if self.max_rows == 0: - dot_row = 1 - prompt_row = 1 - if self.show_dimensions: - show_dimension_rows = 3 - # assume we only get here if self.header is boolean. - # i.e. 
not to_latex() where self.header may be List[str] - self.header = cast(bool, self.header) - n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row # rows available to fill with actual data - max_rows_adj = self.h - n_add_rows - self.max_rows_adj = max_rows_adj - - # Format only rows and columns that could potentially fit the - # screen - if max_cols == 0 and len(self.frame.columns) > w: - max_cols = w - if max_rows == 0 and len(self.frame) > h: - max_rows = h - - if not hasattr(self, "max_rows_adj"): - if max_rows: - if (len(self.frame) > max_rows) and self.min_rows: - # if truncated, set max_rows showed to min_rows - max_rows = min(self.min_rows, max_rows) - self.max_rows_adj = max_rows - if not hasattr(self, "max_cols_adj"): - self.max_cols_adj = max_cols - - max_cols_adj = self.max_cols_adj - max_rows_adj = self.max_rows_adj - - truncate_h = max_cols_adj and (len(self.columns) > max_cols_adj) - truncate_v = max_rows_adj and (len(self.frame) > max_rows_adj) - - frame = self.frame - if truncate_h: - # cast here since if truncate_h is True, max_cols_adj is not None - max_cols_adj = cast(int, max_cols_adj) - if max_cols_adj == 0: - col_num = len(frame.columns) - elif max_cols_adj == 1: - max_cols = cast(int, max_cols) - frame = frame.iloc[:, :max_cols] - col_num = max_cols - else: - col_num = max_cols_adj // 2 - frame = concat( - (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1 - ) - # truncate formatter - if isinstance(self.formatters, (list, tuple)): - truncate_fmt = self.formatters - self.formatters = [ - *truncate_fmt[:col_num], - *truncate_fmt[-col_num:], - ] - self.tr_col_num = col_num - if truncate_v: - # cast here since if truncate_v is True, max_rows_adj is not None - max_rows_adj = cast(int, max_rows_adj) - if max_rows_adj == 1: - row_num = max_rows - frame = frame.iloc[:max_rows, :] + return height - self._get_number_of_auxillary_rows() + + if self._is_screen_short(height): + max_rows = height else: - row_num = max_rows_adj // 2 - frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) - self.tr_row_num = row_num + max_rows = self.max_rows else: - self.tr_row_num = None + max_rows = self.max_rows - self.tr_frame = frame - self.truncate_h = truncate_h - self.truncate_v = truncate_v - self.is_truncated = bool(self.truncate_h or self.truncate_v) + return self._adjust_max_rows(max_rows) - def _to_str_columns(self) -> List[List[str]]: - """ - Render a DataFrame to a list of columns (as lists of strings). - """ - # this method is not used by to_html where self.col_space - # could be a string so safe to cast - col_space = {k: cast(int, v) for k, v in self.col_space.items()} + def _adjust_max_rows(self, max_rows: Optional[int]) -> Optional[int]: + """Adjust max_rows using display logic. 
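# Illustration: _adjust_max_rows implements the documented interplay of
# display.max_rows and display.min_rows; an example session (option values
# arbitrary):
import pandas as pd

df = pd.DataFrame({"x": range(100)})

with pd.option_context("display.max_rows", 10, "display.min_rows", 5):
    # len(df) exceeds max_rows, so the repr is truncated; with min_rows set,
    # only min_rows rows are shown around the "..." marker.
    print(df)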
- frame = self.tr_frame - # may include levels names also + See description here: + https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options - str_index = self._get_formatted_index(frame) - - if not is_list_like(self.header) and not self.header: - stringified = [] - for i, c in enumerate(frame): - fmt_values = self._format_col(i) - fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj, - ) - stringified.append(fmt_values) - else: - if is_list_like(self.header): - # cast here since can't be bool if is_list_like - self.header = cast(List[str], self.header) - if len(self.header) != len(self.columns): - raise ValueError( - f"Writing {len(self.columns)} cols " - f"but got {len(self.header)} aliases" - ) - str_columns = [[label] for label in self.header] - else: - str_columns = self._get_formatted_column_labels(frame) + GH #37359 + """ + if max_rows: + if (len(self.frame) > max_rows) and self.min_rows: + # if truncated, set max_rows showed to min_rows + max_rows = min(self.min_rows, max_rows) + return max_rows - if self.show_row_idx_names: - for x in str_columns: - x.append("") + def _is_in_terminal(self) -> bool: + """Check if the output is to be shown in terminal.""" + return bool(self.max_cols == 0 or self.max_rows == 0) - stringified = [] - for i, c in enumerate(frame): - cheader = str_columns[i] - header_colwidth = max( - col_space.get(c, 0), *(self.adj.len(x) for x in cheader) - ) - fmt_values = self._format_col(i) - fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=header_colwidth, adj=self.adj - ) + def _is_screen_narrow(self, max_width) -> bool: + return bool(self.max_cols == 0 and len(self.frame.columns) > max_width) - max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) - cheader = self.adj.justify(cheader, max_len, mode=self.justify) - stringified.append(cheader + fmt_values) + def _is_screen_short(self, max_height) -> bool: + return bool(self.max_rows == 0 and len(self.frame) > max_height) - strcols = stringified - if self.index: - strcols.insert(0, str_index) + def _get_number_of_auxillary_rows(self) -> int: + """Get number of rows occupied by prompt, dots and dimension info.""" + dot_row = 1 + prompt_row = 1 + num_rows = dot_row + prompt_row - # Add ... to signal truncated - truncate_h = self.truncate_h - truncate_v = self.truncate_v + if self.show_dimensions: + num_rows += len(self.dimensions_info.splitlines()) - if truncate_h: - col_num = self.tr_col_num - strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index))) - if truncate_v: - n_header_rows = len(str_index) - len(frame) - row_num = self.tr_row_num - # cast here since if truncate_v is True, self.tr_row_num is not None - row_num = cast(int, row_num) - for ix, col in enumerate(strcols): - # infer from above row - cwidth = self.adj.len(strcols[ix][row_num]) - is_dot_col = False - if truncate_h: - is_dot_col = ix == col_num + 1 - if cwidth > 3 or is_dot_col: - my_str = "..." - else: - my_str = ".." + if self.header: + num_rows += 1 - if ix == 0: - dot_mode = "left" - elif is_dot_col: - cwidth = 4 - dot_mode = "right" - else: - dot_mode = "right" - dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] - strcols[ix].insert(row_num + n_header_rows, dot_str) - return strcols + return num_rows - def write_result(self, buf: IO[str]) -> None: + def truncate(self) -> None: """ - Render a DataFrame to a console-friendly tabular output. + Check whether the frame should be truncated. If so, slice the frame up. 
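# A minimal, self-contained sketch (not pandas internals) of the min_rows
# clamping rule that _adjust_max_rows above implements: once a frame is too
# long to display in full, the repr shrinks to min_rows rather than max_rows.
from typing import Optional

def adjust_max_rows(n_rows: int, max_rows: Optional[int],
                    min_rows: Optional[int]) -> Optional[int]:
    # Only clamp when the frame would actually be truncated.
    if max_rows and n_rows > max_rows and min_rows:
        return min(min_rows, max_rows)
    return max_rows

assert adjust_max_rows(1000, max_rows=60, min_rows=10) == 10  # truncated
assert adjust_max_rows(50, max_rows=60, min_rows=10) == 60    # fits as-is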
""" - from pandas import Series + if self.is_truncated_horizontally: + self._truncate_horizontally() - frame = self.frame + if self.is_truncated_vertically: + self._truncate_vertically() - if len(frame.columns) == 0 or len(frame.index) == 0: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(frame.columns)}\n" - f"Index: {pprint_thing(frame.index)}" - ) - text = info_line + def _truncate_horizontally(self) -> None: + """Remove columns, which are not to be displayed and adjust formatters. + + Attributes affected: + - tr_frame + - formatters + - tr_col_num + """ + assert self.max_cols_fitted is not None + col_num = self.max_cols_fitted // 2 + if col_num >= 1: + left = self.tr_frame.iloc[:, :col_num] + right = self.tr_frame.iloc[:, -col_num:] + self.tr_frame = concat((left, right), axis=1) + + # truncate formatter + if isinstance(self.formatters, (list, tuple)): + self.formatters = [ + *self.formatters[:col_num], + *self.formatters[-col_num:], + ] else: + col_num = cast(int, self.max_cols) + self.tr_frame = self.tr_frame.iloc[:, :col_num] + self.tr_col_num = col_num - strcols = self._to_str_columns() - if self.line_width is None: # no need to wrap around just print - # the whole frame - text = self.adj.adjoin(1, *strcols) - elif ( - not isinstance(self.max_cols, int) or self.max_cols > 0 - ): # need to wrap around - text = self._join_multiline(*strcols) - else: # max_cols == 0. Try to fit frame to terminal - lines = self.adj.adjoin(1, *strcols).split("\n") - max_len = Series(lines).str.len().max() - # plus truncate dot col - dif = max_len - self.w - # '+ 1' to avoid too wide repr (GH PR #17023) - adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) - n_cols = len(col_lens) - counter = 0 - while adj_dif > 0 and n_cols > 1: - counter += 1 - mid = int(round(n_cols / 2.0)) - mid_ix = col_lens.index[mid] - col_len = col_lens[mid_ix] - # adjoin adds one - adj_dif -= col_len + 1 - col_lens = col_lens.drop(mid_ix) - n_cols = len(col_lens) - # subtract index column - max_cols_adj = n_cols - self.index - # GH-21180. Ensure that we print at least two. - max_cols_adj = max(max_cols_adj, 2) - self.max_cols_adj = max_cols_adj - - # Call again _chk_truncate to cut frame appropriately - # and then generate string representation - self._chk_truncate() - strcols = self._to_str_columns() - text = self.adj.adjoin(1, *strcols) - buf.writelines(text) - - if self.should_show_dimensions: - buf.write(f"\n\n[{len(frame)} rows x {len(frame.columns)} columns]") - - def _join_multiline(self, *args) -> str: - lwidth = self.line_width - adjoin_width = 1 - strcols = list(args) - if self.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + def _truncate_vertically(self) -> None: + """Remove rows, which are not to be displayed. 
- col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] + Attributes affected: + - tr_frame + - tr_row_num + """ + assert self.max_rows_fitted is not None + row_num = self.max_rows_fitted // 2 + if row_num >= 1: + head = self.tr_frame.iloc[:row_num, :] + tail = self.tr_frame.iloc[-row_num:, :] + self.tr_frame = concat((head, tail)) + else: + row_num = cast(int, self.max_rows) + self.tr_frame = self.tr_frame.iloc[:row_num, :] + self.tr_row_num = row_num + + def _get_strcols_without_index(self) -> List[List[str]]: + strcols: List[List[str]] = [] - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) + if not is_list_like(self.header) and not self.header: + for i, c in enumerate(self.tr_frame): + fmt_values = self.format_col(i) + fmt_values = _make_fixed_width( + strings=fmt_values, + justify=self.justify, + minimum=int(self.col_space.get(c, 0)), + adj=self.adj, + ) + strcols.append(fmt_values) + return strcols - if self.truncate_v: - # cast here since if truncate_v is True, max_rows_adj is not None - self.max_rows_adj = cast(int, self.max_rows_adj) - nrows = self.max_rows_adj + 1 + if is_list_like(self.header): + # cast here since can't be bool if is_list_like + self.header = cast(List[str], self.header) + if len(self.header) != len(self.columns): + raise ValueError( + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" + ) + str_columns = [[label] for label in self.header] else: - nrows = len(self.frame) - - str_lst = [] - st = 0 - for i, ed in enumerate(col_bins): - row = strcols[st:ed] - if self.index: - row.insert(0, idx) - if nbins > 1: - if ed <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - st = ed - return "\n\n".join(str_lst) + str_columns = self._get_formatted_column_labels(self.tr_frame) - def to_string( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - ) -> Optional[str]: - return self.get_result(buf=buf, encoding=encoding) + if self.show_row_idx_names: + for x in str_columns: + x.append("") - def to_latex( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - column_format: Optional[str] = None, - longtable: bool = False, - encoding: Optional[str] = None, - multicolumn: bool = False, - multicolumn_format: Optional[str] = None, - multirow: bool = False, - caption: Optional[str] = None, - label: Optional[str] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a LaTeX tabular/longtable environment output. 
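# The vertical counterpart, sketched the same way: _truncate_vertically above
# keeps max_rows_fitted // 2 rows from the head and the tail of the frame.
import pandas as pd

df = pd.DataFrame({"a": range(10)})
row_num = 4 // 2  # suppose max_rows_fitted == 4
truncated = pd.concat((df.iloc[:row_num, :], df.iloc[-row_num:, :]))
print(list(truncated["a"]))  # [0, 1, 8, 9]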
- """ - from pandas.io.formats.latex import LatexFormatter + for i, c in enumerate(self.tr_frame): + cheader = str_columns[i] + header_colwidth = max( + int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) + ) + fmt_values = self.format_col(i) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) - return LatexFormatter( - self, - column_format=column_format, - longtable=longtable, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - caption=caption, - label=label, - ).get_result(buf=buf, encoding=encoding) + max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) + cheader = self.adj.justify(cheader, max_len, mode=self.justify) + strcols.append(cheader + fmt_values) - def _format_col(self, i: int) -> List[str]: + return strcols + + def format_col(self, i: int) -> List[str]: frame = self.tr_frame formatter = self._get_formatter(i) return format_array( @@ -959,39 +803,23 @@ def _format_col(self, i: int) -> List[str]: na_rep=self.na_rep, space=self.col_space.get(frame.columns[i]), decimal=self.decimal, + leading_space=self.index, ) - def to_html( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - classes: Optional[Union[str, List, Tuple]] = None, - notebook: bool = False, - border: Optional[int] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a html table. - - Parameters - ---------- - classes : str or list-like - classes to include in the `class` attribute of the opening - ``
<table>`` tag, in addition to the default "dataframe".
-        notebook : {True, False}, optional, default False
-            Whether the generated HTML is for IPython Notebook.
-        border : int
-            A ``border=border`` attribute is included in the opening
-            ``<table>
          `` tag. Default ``pd.options.display.html.border``. - """ - from pandas.io.formats.html import HTMLFormatter, NotebookFormatter - - Klass = NotebookFormatter if notebook else HTMLFormatter - return Klass(self, classes=classes, border=border).get_result( - buf=buf, encoding=encoding - ) + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: - from pandas.core.indexes.multi import _sparsify + from pandas.core.indexes.multi import sparsify_labels columns = frame.columns @@ -1001,7 +829,7 @@ def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: dtypes = self.frame.dtypes._values # if we have a Float level, they don't use leading space at all - restrict_formatting = any(l.is_floating for l in columns.levels) + restrict_formatting = any(level.is_floating for level in columns.levels) need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) def space_format(x, y): @@ -1017,7 +845,7 @@ def space_format(x, y): zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) ) if self.sparsify and len(str_columns): - str_columns = _sparsify(str_columns) + str_columns = sparsify_labels(str_columns) str_columns = [list(x) for x in zip(*str_columns)] else: @@ -1031,22 +859,6 @@ def space_format(x, y): # self.str_columns = str_columns return str_columns - @property - def has_index_names(self) -> bool: - return _has_names(self.frame.index) - - @property - def has_column_names(self) -> bool: - return _has_names(self.frame.columns) - - @property - def show_row_idx_names(self) -> bool: - return all((self.has_index_names, self.index, self.show_index_names)) - - @property - def show_col_idx_names(self) -> bool: - return all((self.has_column_names, self.show_index_names, self.header)) - def _get_formatted_index(self, frame: "DataFrame") -> List[str]: # Note: this is only used by to_string() and to_latex(), not by # to_html(). so safe to cast col_space here. @@ -1068,7 +880,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: fmt_index = [ tuple( _make_fixed_width( - list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj, + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj ) ) for x in fmt_index @@ -1097,6 +909,232 @@ def _get_column_name_list(self) -> List[str]: return names +class DataFrameRenderer: + """Class for creating dataframe output in multiple formats. + + Called in pandas.core.generic.NDFrame: + - to_csv + - to_latex + + Called in pandas.core.frame.DataFrame: + - to_html + - to_string + + Parameters + ---------- + fmt : DataFrameFormatter + Formatter with the formating options. + """ + + def __init__(self, fmt: DataFrameFormatter): + self.fmt = fmt + + def to_latex( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + column_format: Optional[str] = None, + longtable: bool = False, + encoding: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. 
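# How the new DataFrameRenderer is intended to be driven, per the class
# docstring above (a sketch; constructor arguments beyond `frame` are
# omitted and the call pattern is an assumption based on this diff):
import pandas as pd
from pandas.io.formats.format import DataFrameFormatter, DataFrameRenderer

df = pd.DataFrame({"a": [1, 2]})
fmt = DataFrameFormatter(frame=df)   # all formatting options live here
renderer = DataFrameRenderer(fmt)    # output format is chosen per method
text = renderer.to_string()          # or .to_html() / .to_latex() / .to_csv()
print(text)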
+ """ + from pandas.io.formats.latex import LatexFormatter + + latex_formatter = LatexFormatter( + self.fmt, + longtable=longtable, + column_format=column_format, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + caption=caption, + label=label, + position=position, + ) + string = latex_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_html( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + classes: Optional[Union[str, List, Tuple]] = None, + notebook: bool = False, + border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, + ) -> Optional[str]: + """ + Render a DataFrame to a html table. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding : str, default “utf-8” + Set character encoding. + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
<table>`` tag, in addition to the default "dataframe".
+        notebook : {True, False}, optional, default False
+            Whether the generated HTML is for IPython Notebook.
+        border : int
+            A ``border=border`` attribute is included in the opening
+            ``<table>
<table>`` tag. Default ``pd.options.display.html.border``.
+        table_id : str, optional
+            A css id is included in the opening `<table>
          ` tag if specified. + render_links : bool, default False + Convert URLs to HTML links. + """ + from pandas.io.formats.html import HTMLFormatter, NotebookFormatter + + Klass = NotebookFormatter if notebook else HTMLFormatter + + html_formatter = Klass( + self.fmt, + classes=classes, + border=border, + table_id=table_id, + render_links=render_links, + ) + string = html_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + line_width: Optional[int] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a console-friendly tabular output. + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + encoding: str, default “utf-8” + Set character encoding. + line_width : int, optional + Width to wrap a line in characters. + """ + from pandas.io.formats.string import StringFormatter + + string_formatter = StringFormatter(self.fmt, line_width=line_width) + string = string_formatter.to_string() + return save_to_buffer(string, buf=buf, encoding=encoding) + + def to_csv( + self, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + sep: str = ",", + columns: Optional[Sequence[Label]] = None, + index_label: Optional[IndexLabel] = None, + mode: str = "w", + compression: CompressionOptions = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, + errors: str = "strict", + storage_options: StorageOptions = None, + ) -> Optional[str]: + """ + Render dataframe as comma-separated file. + """ + from pandas.io.formats.csvs import CSVFormatter + + if path_or_buf is None: + created_buffer = True + path_or_buf = StringIO() + else: + created_buffer = False + + csv_formatter = CSVFormatter( + path_or_buf=path_or_buf, + line_terminator=line_terminator, + sep=sep, + encoding=encoding, + errors=errors, + compression=compression, + quoting=quoting, + cols=columns, + index_label=index_label, + mode=mode, + chunksize=chunksize, + quotechar=quotechar, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, + storage_options=storage_options, + formatter=self.fmt, + ) + csv_formatter.save() + + if created_buffer: + assert isinstance(path_or_buf, StringIO) + content = path_or_buf.getvalue() + path_or_buf.close() + return content + + return None + + +def save_to_buffer( + string: str, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, +) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with get_buffer(buf, encoding=encoding) as f: + f.write(string) + if buf is None: + return f.getvalue() + return None + + +@contextmanager +def get_buffer(buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. 
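# A pared-down analogue (not the pandas implementation) of the get_buffer
# context manager described above: yield a fresh StringIO when no buffer is
# given, open file paths, and pass through anything that already has .write().
from contextlib import contextmanager
from io import StringIO

@contextmanager
def open_buffer(buf=None, encoding="utf-8"):
    if buf is None:
        yield StringIO()  # caller can read back via .getvalue()
    elif hasattr(buf, "write"):
        yield buf  # already writable: pass through unchanged
    else:
        # newline="" so line terminators round-trip on Windows
        with open(buf, "w", encoding=encoding, newline="") as f:
            yield f

with open_buffer() as f:
    f.write("rendered output")
    print(f.getvalue())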
+ """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") + + # ---------------------------------------------------------------------- # Array formatters @@ -1110,7 +1148,7 @@ def format_array( space: Optional[Union[str, int]] = None, justify: str = "right", decimal: str = ".", - leading_space: Optional[bool] = None, + leading_space: Optional[bool] = True, quoting: Optional[int] = None, ) -> List[str]: """ @@ -1126,7 +1164,7 @@ def format_array( space justify decimal - leading_space : bool, optional + leading_space : bool, optional, default True Whether the array should be formatted with a leading space. When an array as a column of a Series or DataFrame, we do want the leading space to pad between columns. @@ -1193,7 +1231,7 @@ def __init__( decimal: str = ".", quoting: Optional[int] = None, fixed_width: bool = True, - leading_space: Optional[bool] = None, + leading_space: Optional[bool] = True, ): self.values = values self.digits = digits @@ -1216,7 +1254,7 @@ def _format_strings(self) -> List[str]: float_format = get_option("display.float_format") if float_format is None: precision = get_option("display.precision") - float_format = lambda x: f"{x: .{precision:d}g}" + float_format = lambda x: f"{x: .{precision:d}f}" else: float_format = self.float_format @@ -1277,14 +1315,12 @@ def _format(x): tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) + fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") + return fmt_values class FloatArrayFormatter(GenericArrayFormatter): - """ - - """ - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1316,7 +1352,17 @@ def _value_formatter( if float_format: def base_formatter(v): - return float_format(value=v) if notna(v) else self.na_rep + assert float_format is not None # for mypy + # pandas\io\formats\format.py:1411: error: "str" not callable + # [operator] + + # pandas\io\formats\format.py:1411: error: Unexpected keyword + # argument "value" for "__call__" of "EngFormatter" [call-arg] + return ( + float_format(value=v) # type: ignore[operator,call-arg] + if notna(v) + else self.na_rep + ) else: @@ -1350,8 +1396,19 @@ def get_result_as_array(self) -> np.ndarray: Returns the float values converted into strings using the parameters given at initialisation, as a numpy array """ + + def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str): + mask = isna(values) + formatted = np.array( + [ + formatter(val) if not m else na_rep + for val, m in zip(values.ravel(), mask.ravel()) + ] + ).reshape(values.shape) + return formatted + if self.formatter is not None: - return np.array([self.formatter(x) for x in self.values]) + return format_with_na_rep(self.values, self.formatter, self.na_rep) if self.fixed_width: threshold = get_option("display.chop_threshold") @@ -1372,19 +1429,13 @@ def format_values_with(float_format): # separate the wheat from the chaff values = self.values is_complex = is_complex_dtype(values) - mask = 
isna(values) - values = np.array(values, dtype="object") - values[mask] = na_rep - imask = (~mask).ravel() - values.flat[imask] = np.array( - [formatter(val) for val in values.ravel()[imask]] - ) + values = format_with_na_rep(values, formatter, na_rep) if self.fixed_width: if is_complex: - result = _trim_zeros_complex(values, self.decimal, na_rep) + result = _trim_zeros_complex(values, self.decimal) else: - result = _trim_zeros_float(values, self.decimal, na_rep) + result = _trim_zeros_float(values, self.decimal) return np.asarray(result, dtype="object") return values @@ -1394,9 +1445,11 @@ def format_values_with(float_format): float_format: Optional[FloatFormatType] if self.float_format is None: if self.fixed_width: - float_format = partial( - "{value: .{digits:d}f}".format, digits=self.digits - ) + if self.leading_space is True: + fmt_str = "{value: .{digits:d}f}" + else: + fmt_str = "{value:.{digits:d}f}" + float_format = partial(fmt_str.format, digits=self.digits) else: float_format = self.float_format else: @@ -1428,22 +1481,26 @@ def format_values_with(float_format): ).any() if has_small_values or (too_long and has_large_values): - float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) + if self.leading_space is True: + fmt_str = "{value: .{digits:d}e}" + else: + fmt_str = "{value:.{digits:d}e}" + float_format = partial(fmt_str.format, digits=self.digits) formatted_values = format_values_with(float_format) return formatted_values def _format_strings(self) -> List[str]: - # shortcut - if self.formatter is not None: - return [self.formatter(x) for x in self.values] - return list(self.get_result_as_array()) class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: - formatter = self.formatter or (lambda x: f"{x: d}") + if self.leading_space is False: + formatter_str = lambda x: f"{x:d}".format(x=x) + else: + formatter_str = lambda x: f"{x: d}".format(x=x) + formatter = self.formatter or formatter_str fmt_values = [formatter(x) for x in self.values] return fmt_values @@ -1470,11 +1527,9 @@ def _format_strings(self) -> List[str]: if self.formatter is not None and callable(self.formatter): return [self.formatter(x) for x in values] - fmt_values = format_array_from_datetime( - values.asi8.ravel(), - format=_get_format_datetime64_from_values(values, self.date_format), - na_rep=self.nat_rep, - ).reshape(values.shape) + fmt_values = values._data._format_native_types( + na_rep=self.nat_rep, date_format=self.date_format + ) return fmt_values.tolist() @@ -1482,7 +1537,9 @@ class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: values = extract_array(self.values, extract_numpy=True) - formatter = values._formatter(boxed=True) + formatter = self.formatter + if formatter is None: + formatter = values._formatter(boxed=True) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo @@ -1498,7 +1555,9 @@ def _format_strings(self) -> List[str]: digits=self.digits, space=self.space, justify=self.justify, + decimal=self.decimal, leading_space=self.leading_space, + quoting=self.quoting, ) return fmt_values @@ -1572,11 +1631,12 @@ def format_percentiles( return [i + "%" for i in out] -def _is_dates_only( +def is_dates_only( values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] ) -> bool: # return a boolean if we are only dates (and don't have a timezone) - values = values.ravel() + if not isinstance(values, Index): + values = values.ravel() values = 
DatetimeIndex(values) if values.tz is not None: @@ -1593,49 +1653,40 @@ def _is_dates_only( return False -def _format_datetime64( - x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" -) -> str: - if x is None or (is_scalar(x) and isna(x)): +def _format_datetime64(x: Union[NaTType, Timestamp], nat_rep: str = "NaT") -> str: + if x is NaT: return nat_rep - if tz is not None or not isinstance(x, Timestamp): - if getattr(x, "tzinfo", None) is not None: - x = Timestamp(x).tz_convert(tz) - else: - x = Timestamp(x).tz_localize(tz) - return str(x) def _format_datetime64_dateonly( - x: Union[NaTType, Timestamp], nat_rep: str = "NaT", date_format: None = None + x: Union[NaTType, Timestamp], + nat_rep: str = "NaT", + date_format: Optional[str] = None, ) -> str: - if x is None or (is_scalar(x) and isna(x)): + if x is NaT: return nat_rep - if not isinstance(x, Timestamp): - x = Timestamp(x) - if date_format: return x.strftime(date_format) else: return x._date_repr -def _get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None +def get_format_datetime64( + is_dates_only: bool, nat_rep: str = "NaT", date_format: Optional[str] = None ) -> Callable: if is_dates_only: - return lambda x, tz=None: _format_datetime64_dateonly( + return lambda x: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) else: - return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) + return lambda x: _format_datetime64(x, nat_rep=nat_rep) -def _get_format_datetime64_from_values( +def get_format_datetime64_from_values( values: Union[np.ndarray, DatetimeArray, DatetimeIndex], date_format: Optional[str] ) -> Optional[str]: """ given values and a date_format, return a string format """ @@ -1644,8 +1695,8 @@ def _get_format_datetime64_from_values( # only accepts 1D values values = values.ravel() - is_dates_only = _is_dates_only(values) - if is_dates_only: + ido = is_dates_only(values) + if ido: return date_format or "%Y-%m-%d" return date_format @@ -1654,9 +1705,9 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> List[str]: """ we by definition have a TZ """ values = self.values.astype(object) - is_dates_only = _is_dates_only(values) - formatter = self.formatter or _get_format_datetime64( - is_dates_only, date_format=self.date_format + ido = is_dates_only(values) + formatter = self.formatter or get_format_datetime64( + ido, date_format=self.date_format ) fmt_values = [formatter(x) for x in values] @@ -1676,13 +1727,13 @@ def __init__( self.box = box def _format_strings(self) -> List[str]: - formatter = self.formatter or _get_format_timedelta64( + formatter = self.formatter or get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=self.box ) return [formatter(x) for x in self.values] -def _get_format_timedelta64( +def get_format_timedelta64( values: Union[np.ndarray, TimedeltaIndex, TimedeltaArray], nat_rep: str = "NaT", box: bool = False, @@ -1732,9 +1783,11 @@ def _make_fixed_width( return strings if adj is None: - adj = _get_adjustment() + adjustment = get_adjustment() + else: + adjustment = adj - max_len = max(adj.len(x) for x in strings) + max_len = max(adjustment.len(x) for x in strings) if minimum is not None: max_len = max(minimum, max_len) @@ -1743,57 +1796,74 @@ def _make_fixed_width( if conf_max is not None and max_len > conf_max: max_len = conf_max - def just(x): + def just(x: str) -> str: if conf_max is not None: - if (conf_max > 3) & (adj.len(x) > max_len): + if (conf_max > 3) & 
(adjustment.len(x) > max_len): x = x[: max_len - 3] + "..." return x strings = [just(x) for x in strings] - result = adj.justify(strings, max_len, mode=justify) + result = adjustment.justify(strings, max_len, mode=justify) return result -def _trim_zeros_complex( - str_complexes: np.ndarray, decimal: str = ".", na_rep: str = "NaN" -) -> List[str]: +def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. """ - return [ - "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal, na_rep)) + trimmed = [ + "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal)) for x in str_complexes ] + # pad strings to the length of the longest trimmed string for alignment + lengths = [len(s) for s in trimmed] + max_length = max(lengths) + padded = [ + s[: -((k - 1) // 2 + 1)] # real part + + (max_length - k) // 2 * "0" + + s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / - + + s[-((k - 1) // 2) : -1] # imaginary part + + (max_length - k) // 2 * "0" + + s[-1] + for s, k in zip(trimmed, lengths) + ] + return padded + def _trim_zeros_float( - str_floats: Union[np.ndarray, List[str]], decimal: str = ".", na_rep: str = "NaN" + str_floats: Union[np.ndarray, List[str]], decimal: str = "." ) -> List[str]: """ Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats + number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") - def _is_number(x): - return x != na_rep and not x.endswith("inf") + def is_number_with_decimal(x): + return re.match(number_regex, x) is not None - def _cond(values): - finite = [x for x in values if _is_number(x)] - has_decimal = [decimal in x for x in finite] + def should_trim(values: Union[np.ndarray, List[str]]) -> bool: + """ + Determine if an array of strings should be trimmed. - return ( - len(finite) > 0 - and all(has_decimal) - and all(x.endswith("0") for x in finite) - and not (any(("e" in x) or ("E" in x) for x in finite)) - ) + Returns True if all numbers containing decimals (defined by the + above regular expression) within the array end in a zero, otherwise + returns False. + """ + numbers = [x for x in values if is_number_with_decimal(x)] + return len(numbers) > 0 and all(x.endswith("0") for x in numbers) - while _cond(trimmed): - trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] + while should_trim(trimmed): + trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. 
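# Self-contained sketch of the trimming loop above: strip trailing zeros from
# decimal strings in lockstep so a column stays aligned, always keeping at
# least one digit after the decimal point.
import re

def trim_zeros_float(strs, decimal="."):
    pat = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$")

    def is_decimal_number(x):
        return pat.match(x) is not None

    def should_trim(values):
        nums = [x for x in values if is_decimal_number(x)]
        return len(nums) > 0 and all(x.endswith("0") for x in nums)

    while should_trim(strs):
        strs = [x[:-1] if is_decimal_number(x) else x for x in strs]
    # re-append a zero if trimming stopped right at the decimal point
    return [x + "0" if is_decimal_number(x) and x.endswith(decimal) else x
            for x in strs]

print(trim_zeros_float([" 1.000", " 2.500", "NaN"]))  # [' 1.0', ' 2.5', 'NaN']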
- return [x + "0" if x.endswith(decimal) and _is_number(x) else x for x in trimmed] + result = [ + x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x + for x in trimmed + ] + return result def _has_names(index: Index) -> bool: @@ -1912,26 +1982,6 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non set_option("display.column_space", max(12, accuracy + 9)) -def _binify(cols: List[int], line_width: int) -> List[int]: - adjoin_width = 1 - bins = [] - curr_width = 0 - i_last_column = len(cols) - 1 - for i, w in enumerate(cols): - w_adjoined = w + adjoin_width - curr_width += w_adjoined - if i_last_column == i: - wrap = curr_width + 1 > line_width and i > 0 - else: - wrap = curr_width + 2 > line_width and i > 0 - if wrap: - bins.append(i) - curr_width = w_adjoined - - bins.append(len(cols)) - return bins - - def get_level_lengths( levels: Any, sentinel: Union[bool, object, str] = "" ) -> List[Dict[int, int]]: diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 7ea2417ceb24b..b4f7e3922f02f 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -3,7 +3,7 @@ """ from textwrap import dedent -from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option @@ -12,16 +12,11 @@ from pandas import MultiIndex, option_context from pandas.io.common import is_url -from pandas.io.formats.format import ( - DataFrameFormatter, - TableFormatter, - buffer_put_lines, - get_level_lengths, -) +from pandas.io.formats.format import DataFrameFormatter, get_level_lengths from pandas.io.formats.printing import pprint_thing -class HTMLFormatter(TableFormatter): +class HTMLFormatter: """ Internal class for formatting output data in html. This class is intended for shared functionality between @@ -38,6 +33,8 @@ def __init__( formatter: DataFrameFormatter, classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, + table_id: Optional[str] = None, + render_links: bool = False, ) -> None: self.fmt = formatter self.classes = classes @@ -51,14 +48,35 @@ def __init__( if border is None: border = cast(int, get_option("display.html.border")) self.border = border - self.table_id = self.fmt.table_id - self.render_links = self.fmt.render_links + self.table_id = table_id + self.render_links = render_links self.col_space = { column: f"{value}px" if isinstance(value, int) else value for column, value in self.fmt.col_space.items() } + def to_string(self) -> str: + lines = self.render() + if any(isinstance(x, str) for x in lines): + lines = [str(x) for x in lines] + return "\n".join(lines) + + def render(self) -> List[str]: + self._write_table() + + if self.should_show_dimensions: + by = chr(215) # × + self.write( + f"

<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>
          " + ) + + return self.elements + + @property + def should_show_dimensions(self): + return self.fmt.should_show_dimensions + @property def show_row_idx_names(self) -> bool: return self.fmt.show_row_idx_names @@ -85,9 +103,8 @@ def row_levels(self) -> int: def _get_columns_formatted_values(self) -> Iterable: return self.columns - # https://github.com/python/mypy/issues/1237 @property - def is_truncated(self) -> bool: # type: ignore + def is_truncated(self) -> bool: return self.fmt.is_truncated @property @@ -188,20 +205,6 @@ def write_tr( indent -= indent_delta self.write("", indent) - def render(self) -> List[str]: - self._write_table() - - if self.should_show_dimensions: - by = chr(215) # × - self.write( - f"

<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>
          " - ) - - return self.elements - - def write_result(self, buf: IO[str]) -> None: - buffer_put_lines(buf, self.render()) - def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") @@ -235,7 +238,7 @@ def _write_table(self, indent: int = 0) -> None: self.write("
          ", indent) def _write_col_header(self, indent: int) -> None: - truncate_h = self.fmt.truncate_h + is_truncated_horizontally = self.fmt.is_truncated_horizontally if isinstance(self.columns, MultiIndex): template = 'colspan="{span:d}" halign="left"' @@ -248,7 +251,7 @@ def _write_col_header(self, indent: int) -> None: level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 for lnum, (records, values) in enumerate(zip(level_lengths, levels)): - if truncate_h: + if is_truncated_horizontally: # modify the header lines ins_col = self.fmt.tr_col_num if self.fmt.sparsify: @@ -345,16 +348,16 @@ def _write_col_header(self, indent: int) -> None: row.extend(self._get_columns_formatted_values()) align = self.fmt.justify - if truncate_h: + if is_truncated_horizontally: ins_col = self.row_levels + self.fmt.tr_col_num row.insert(ins_col, "...") self.write_tr(row, indent, self.indent_delta, header=True, align=align) def _write_row_header(self, indent: int) -> None: - truncate_h = self.fmt.truncate_h + is_truncated_horizontally = self.fmt.is_truncated_horizontally row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( - self.ncols + (1 if truncate_h else 0) + self.ncols + (1 if is_truncated_horizontally else 0) ) self.write_tr(row, indent, self.indent_delta, header=True) @@ -371,7 +374,7 @@ def _write_header(self, indent: int) -> None: def _get_formatted_values(self) -> Dict[int, List[str]]: with option_context("display.max_colwidth", None): - fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} + fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent: int) -> None: @@ -389,8 +392,8 @@ def _write_body(self, indent: int) -> None: def _write_regular_rows( self, fmt_values: Mapping[int, List[str]], indent: int ) -> None: - truncate_h = self.fmt.truncate_h - truncate_v = self.fmt.truncate_v + is_truncated_horizontally = self.fmt.is_truncated_horizontally + is_truncated_vertically = self.fmt.is_truncated_vertically nrows = len(self.fmt.tr_frame) @@ -404,7 +407,7 @@ def _write_regular_rows( row: List[str] = [] for i in range(nrows): - if truncate_v and i == (self.fmt.tr_row_num): + if is_truncated_vertically and i == (self.fmt.tr_row_num): str_sep_row = ["..."] * len(row) self.write_tr( str_sep_row, @@ -425,7 +428,7 @@ def _write_regular_rows( row.append("") row.extend(fmt_values[j][i] for j in range(self.ncols)) - if truncate_h: + if is_truncated_horizontally: dot_col_ix = self.fmt.tr_col_num + self.row_levels row.insert(dot_col_ix, "...") self.write_tr( @@ -437,11 +440,12 @@ def _write_hierarchical_rows( ) -> None: template = 'rowspan="{span}" valign="top"' - truncate_h = self.fmt.truncate_h - truncate_v = self.fmt.truncate_v + is_truncated_horizontally = self.fmt.is_truncated_horizontally + is_truncated_vertically = self.fmt.is_truncated_vertically frame = self.fmt.tr_frame nrows = len(frame) + assert isinstance(frame.index, MultiIndex) idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = list(zip(*idx_values)) @@ -452,12 +456,10 @@ def _write_hierarchical_rows( level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 - if truncate_v: + if is_truncated_vertically: # Insert ... row and adjust idx_values and # level_lengths to take this into account. 
ins_row = self.fmt.tr_row_num - # cast here since if truncate_v is True, self.fmt.tr_row_num is not None - ins_row = cast(int, ins_row) inserted = False for lnum, records in enumerate(level_lengths): rec_new = {} @@ -518,7 +520,7 @@ def _write_hierarchical_rows( row.append(v) row.extend(fmt_values[j][i] for j in range(self.ncols)) - if truncate_h: + if is_truncated_horizontally: row.insert( self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." ) @@ -532,7 +534,7 @@ def _write_hierarchical_rows( else: row = [] for i in range(len(frame)): - if truncate_v and i == (self.fmt.tr_row_num): + if is_truncated_vertically and i == (self.fmt.tr_row_num): str_sep_row = ["..."] * len(row) self.write_tr( str_sep_row, @@ -548,7 +550,7 @@ def _write_hierarchical_rows( row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(self.ncols)) - if truncate_h: + if is_truncated_horizontally: row.insert(self.row_levels + self.fmt.tr_col_num, "...") self.write_tr( row, @@ -567,7 +569,7 @@ class NotebookFormatter(HTMLFormatter): """ def _get_formatted_values(self) -> Dict[int, List[str]]: - return {i: self.fmt._format_col(i) for i in range(self.ncols)} + return {i: self.fmt.format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> List[str]: return self.columns.format() diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 7a53b46a4ac0f..98bd159c567b1 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,10 +1,20 @@ -from abc import ABCMeta, abstractmethod +from abc import ABC, abstractmethod import sys -from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union +from typing import ( + IO, + TYPE_CHECKING, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Union, +) from pandas._config import get_option -from pandas._typing import Dtype, FrameOrSeries +from pandas._typing import Dtype, FrameOrSeriesUnion from pandas.core.indexes.api import Index @@ -12,7 +22,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 + from pandas.core.frame import DataFrame def _put_str(s: Union[str, Dtype], space: int) -> str: @@ -72,101 +82,101 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" -class BaseInfo(metaclass=ABCMeta): - def __init__( - self, - data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, - memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, - ): - if buf is None: # pragma: no cover - buf = sys.stdout - if memory_usage is None: - memory_usage = get_option("display.memory_usage") +def _initialize_memory_usage( + memory_usage: Optional[Union[bool, str]] = None, +) -> Union[bool, str]: + """Get memory usage based on inputs and display options.""" + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + return memory_usage - self.data = data - self.verbose = verbose - self.buf = buf - self.max_cols = max_cols - self.memory_usage = memory_usage - self.null_counts = null_counts - @abstractmethod - def _get_mem_usage(self, deep: bool) -> int: - """ - Get memory usage in bytes. +class BaseInfo(ABC): + """ + Base class for DataFrameInfo and SeriesInfo. - Parameters - ---------- - deep : bool - If True, introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. 
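# Illustration of the memory_usage="deep" distinction documented above:
import pandas as pd

df = pd.DataFrame({"s": ["abc"] * 1000})
shallow = df.memory_usage(index=True).sum()           # per-pointer estimate
deep = df.memory_usage(index=True, deep=True).sum()   # inspects the objects
print(shallow < deep)  # True: deep counts the Python string payloads too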
+ Parameters + ---------- + data : DataFrame or Series + Either dataframe or series. + memory_usage : bool or str, optional + If "deep", introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + """ - Returns - ------- - mem_usage : int - Object's total memory usage in bytes. - """ - pass + data: FrameOrSeriesUnion + memory_usage: Union[bool, str] + @property @abstractmethod - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + def dtypes(self) -> Iterable[Dtype]: """ - Get column names and dtypes. + Dtypes. Returns ------- - ids : Index - DataFrame's column names. - dtypes : Series - Dtype of each of the DataFrame's columns. + dtypes : sequence + Dtype of each of the DataFrame's columns (or one series column). """ - pass + @property @abstractmethod - def _verbose_repr( - self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool - ) -> None: - """ - Append name, non-null count (optional), and dtype for each column to `lines`. + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. - dtypes : Series - The DataFrame's columns' dtypes. - show_counts : bool - If True, count of non-NA cells for each column will be appended to `lines`. - """ - pass + @property + @abstractmethod + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + @property @abstractmethod - def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: + def memory_usage_bytes(self) -> int: """ - Append short summary of columns' names to `lines`. + Memory usage in bytes. - Parameters - ---------- - lines : List[str] - Lines that will contain `info` representation. - ids : Index - The DataFrame's column names. + Returns + ------- + memory_usage_bytes : int + Object's total memory usage in bytes. """ - pass - def info(self) -> None: + @property + def memory_usage_string(self) -> str: + """Memory usage in a form of human readable string.""" + return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n" + + @property + def size_qualifier(self) -> str: + size_qualifier = "" + if self.memory_usage: + if self.memory_usage != "deep": + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + if ( + "object" in self.dtype_counts + or self.data.index._is_memory_usage_qualified() + ): + size_qualifier = "+" + return size_qualifier + + @abstractmethod + def render( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: """ Print a concise summary of a %(klass)s. This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. + %(version_added_sub)s\ Parameters ---------- @@ -193,12 +203,7 @@ def info(self) -> None: consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. 
A value of True always - shows the counts, and False never shows the counts. + %(show_counts_sub)s Returns ------- @@ -213,139 +218,470 @@ def info(self) -> None: -------- %(examples_sub)s """ - lines = [] - lines.append(str(type(self.data))) - lines.append(self.data.index._summary()) - ids, dtypes = self._get_ids_and_dtypes() - col_count = len(ids) +class DataFrameInfo(BaseInfo): + """ + Class storing dataframe-specific info. + """ - if col_count == 0: - lines.append(f"Empty {type(self.data).__name__}") - fmt.buffer_put_lines(self.buf, lines) - return + def __init__( + self, + data: "DataFrame", + memory_usage: Optional[Union[bool, str]] = None, + ): + self.data: "DataFrame" = data + self.memory_usage = _initialize_memory_usage(memory_usage) - # hack - max_cols = self.max_cols - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) + @property + def dtype_counts(self) -> Mapping[str, int]: + return _get_dataframe_dtype_counts(self.data) + + @property + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes + Dtype of each of the DataFrame's columns. + """ + return self.data.dtypes + + @property + def ids(self) -> Index: + """ + Column names. + + Returns + ------- + ids : Index + DataFrame's column names. + """ + return self.data.columns + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) + + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + return self.data.count() + + @property + def memory_usage_bytes(self) -> int: + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() + + def render( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: + printer = DataFrameInfoPrinter( + info=self, + max_cols=max_cols, + verbose=verbose, + show_counts=show_counts, + ) + printer.to_buffer(buf) + + +class InfoPrinterAbstract: + """ + Class for printing dataframe or series info. + """ + + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) - max_rows = get_option("display.max_info_rows", len(self.data) + 1) + @abstractmethod + def _create_table_builder(self) -> "TableBuilderAbstract": + """Create instance of table builder.""" + + +class DataFrameInfoPrinter(InfoPrinterAbstract): + """ + Class for printing dataframe info. - if self.null_counts is None: - show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + Parameters + ---------- + info : DataFrameInfo + Instance of DataFrameInfo. + max_cols : int, optional + When to switch from the verbose to the truncated output. + verbose : bool, optional + Whether to print the full summary. + show_counts : bool, optional + Whether to show the non-null counts. 
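# Sketch of the show_counts default described above (not pandas code; the
# threshold names mirror the display options referenced in this diff):
def default_show_counts(n_rows, n_cols, max_info_rows, max_info_columns):
    # Counts are shown only when the frame fits both info limits.
    return n_cols <= max_info_columns and n_rows < max_info_rows

assert default_show_counts(100, 10, max_info_rows=1000, max_info_columns=100)
assert not default_show_counts(10_000, 10, max_info_rows=1000, max_info_columns=100)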
+ """ + + def __init__( + self, + info: DataFrameInfo, + max_cols: Optional[int] = None, + verbose: Optional[bool] = None, + show_counts: Optional[bool] = None, + ): + self.info = info + self.data = info.data + self.verbose = verbose + self.max_cols = self._initialize_max_cols(max_cols) + self.show_counts = self._initialize_show_counts(show_counts) + + @property + def max_rows(self) -> int: + """Maximum info rows to be displayed.""" + return get_option("display.max_info_rows", len(self.data) + 1) + + @property + def exceeds_info_cols(self) -> bool: + """Check if number of columns to be summarized does not exceed maximum.""" + return bool(self.col_count > self.max_cols) + + @property + def exceeds_info_rows(self) -> bool: + """Check if number of rows to be summarized does not exceed maximum.""" + return bool(len(self.data) > self.max_rows) + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return self.info.col_count + + def _initialize_max_cols(self, max_cols: Optional[int]) -> int: + if max_cols is None: + return get_option("display.max_info_columns", self.col_count + 1) + return max_cols + + def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: + if show_counts is None: + return bool(not self.exceeds_info_cols and not self.exceeds_info_rows) else: - show_counts = self.null_counts - exceeds_info_cols = col_count > max_cols + return show_counts + def _create_table_builder(self) -> "DataFrameTableBuilder": + """ + Create instance of table builder based on verbosity and display settings. + """ if self.verbose: - self._verbose_repr(lines, ids, dtypes, show_counts) + return DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) elif self.verbose is False: # specifically set to False, not necessarily None - self._non_verbose_repr(lines, ids) + return DataFrameTableBuilderNonVerbose(info=self.info) else: - if exceeds_info_cols: - self._non_verbose_repr(lines, ids) + if self.exceeds_info_cols: + return DataFrameTableBuilderNonVerbose(info=self.info) else: - self._verbose_repr(lines, ids, dtypes, show_counts) + return DataFrameTableBuilderVerbose( + info=self.info, + with_counts=self.show_counts, + ) - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") - if self.memory_usage: - # append memory usage of df to display - size_qualifier = "" - if self.memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or self.data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = self._get_mem_usage(deep=deep) - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(self.buf, lines) +class TableBuilderAbstract(ABC): + """ + Abstract builder for info table. 
+ """ + _lines: List[str] + info: BaseInfo -class DataFrameInfo(BaseInfo): - def _get_mem_usage(self, deep: bool) -> int: - return self.data.memory_usage(index=True, deep=deep).sum() + @abstractmethod + def get_lines(self) -> List[str]: + """Product in a form of list of lines (strings).""" + + @property + def data(self) -> FrameOrSeriesUnion: + return self.info.data + + @property + def dtypes(self) -> Iterable[Dtype]: + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts + + @property + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" + return bool(self.info.memory_usage) + + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" + self._lines.append(str(type(self.data))) + + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" + self._lines.append(self.data.index._summary()) + + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + +class DataFrameTableBuilder(TableBuilderAbstract): + """ + Abstract builder for dataframe info table. - def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: - return self.data.columns, self.data.dtypes + Parameters + ---------- + info : DataFrameInfo. + Instance of DataFrameInfo. 
+ """ - def _verbose_repr( - self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool - ) -> None: - col_count = len(ids) - lines.append(f"Data columns (total {col_count} columns):") - - id_head = " # " - column_head = "Column" - col_space = 2 - - max_col = max(len(pprint_thing(k)) for k in ids) - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space - - header = _put_str(id_head, space_num) + _put_str(column_head, space) - if show_counts: - counts = self.data.count() - if col_count != len(counts): # pragma: no cover - raise AssertionError( - f"Columns must equal counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null + def __init__(self, *, info: DataFrameInfo): + self.info: DataFrameInfo = info + + def get_lines(self) -> List[str]: + self._lines = [] + if self.col_count == 0: + self._fill_empty_info() else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" - - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype + self._fill_non_empty_info() + return self._lines + + def _fill_empty_info(self) -> None: + """Add lines to the info table, pertaining to empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self._lines.append(f"Empty {type(self.data).__name__}") + + @abstractmethod + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + + @property + def data(self) -> "DataFrame": + """DataFrame.""" + return self.info.data + + @property + def ids(self) -> Index: + """Dataframe columns.""" + return self.info.ids + + @property + def col_count(self) -> int: + """Number of dataframe columns to be summarized.""" + return self.info.col_count + + def add_memory_usage_line(self) -> None: + """Add line containing memory usage.""" + self._lines.append(f"memory usage: {self.memory_usage_string}") + + +class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): + """ + Dataframe info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + def add_columns_summary_line(self) -> None: + self._lines.append(self.ids._summary(name="Columns")) + + +class TableBuilderVerboseMixin(TableBuilderAbstract): + """ + Mixin for verbose info output. 
+ """ + + SPACING: str = " " * 2 + strrows: Sequence[Sequence[str]] + gross_column_widths: Sequence[int] + with_counts: bool + + @property + @abstractmethod + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + + @property + def header_column_widths(self) -> Sequence[int]: + """Widths of header columns (only titles).""" + return [len(col) for col in self.headers] + + def _get_gross_column_widths(self) -> Sequence[int]: + """Get widths of columns containing both headers and actual content.""" + body_column_widths = self._get_body_column_widths() + return [ + max(*widths) + for widths in zip(self.header_column_widths, body_column_widths) + ] + + def _get_body_column_widths(self) -> Sequence[int]: + """Get widths of table content columns.""" + strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) + return [max(len(x) for x in col) for col in strcols] + + def _gen_rows(self) -> Iterator[Sequence[str]]: + """ + Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + @abstractmethod + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + + @abstractmethod + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + + def add_header_line(self) -> None: + header_line = self.SPACING.join( + [ + _put_str(header, col_width) + for header, col_width in zip(self.headers, self.gross_column_widths) + ] + ) + self._lines.append(header_line) + + def add_separator_line(self) -> None: + separator_line = self.SPACING.join( + [ + _put_str("-" * header_colwidth, gross_colwidth) + for header_colwidth, gross_colwidth in zip( + self.header_column_widths, self.gross_column_widths + ) + ] + ) + self._lines.append(separator_line) + + def add_body_lines(self) -> None: + for row in self.strrows: + body_line = self.SPACING.join( + [ + _put_str(col, gross_colwidth) + for col, gross_colwidth in zip(row, self.gross_column_widths) + ] + ) + self._lines.append(body_line) + + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" + + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) + + +class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): + """ + Dataframe info table builder for verbose output. 
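+
+    A sketch of the produced lines, abridged to the tabular part; the
+    ``DataFrameInfo`` constructor signature is assumed, so the doctest is
+    skipped:
+
+    >>> import pandas as pd
+    >>> from pandas.io.formats.info import DataFrameInfo
+    >>> info = DataFrameInfo(data=pd.DataFrame({"a": [1.0, None]}), memory_usage=True)
+    >>> builder = DataFrameTableBuilderVerbose(info=info, with_counts=True)
+    >>> for line in builder.get_lines():  # doctest: +SKIP
+    ...     print(line)
+     #   Column  Non-Null Count  Dtype
+    ---  ------  --------------  -----
+     0   a       1 non-null      float64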
+ """ + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ): + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") + + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_dtypes(), ) - lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + yield from zip( + self._gen_line_numbers(), + self._gen_columns(), + self._gen_non_null_counts(), + self._gen_dtypes(), ) - for i, col in enumerate(ids): - dtype = dtypes[i] - col = pprint_thing(col) + def _gen_line_numbers(self) -> Iterator[str]: + """Iterator with string representation of column numbers.""" + for i, _ in enumerate(self.ids): + yield f" {i}" - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] + def _gen_columns(self) -> Iterator[str]: + """Iterator with string representation of column names.""" + for col in self.ids: + yield pprint_thing(col) - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype) - ) - def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: - lines.append(ids._summary(name="Columns")) +def _get_dataframe_dtype_counts(df: "DataFrame") -> Mapping[str, int]: + """ + Create mapping between datatypes and their number of occurences. + """ + # groupby dtype.name to collect e.g. Categorical columns + return df.dtypes.value_counts().groupby(lambda x: x.name).sum() diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 3a3ca84642d51..f6f3571955e6e 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -1,79 +1,142 @@ """ Module for formatting output data in Latex. """ -from typing import IO, List, Optional, Tuple +from abc import ABC, abstractmethod +from typing import Iterator, List, Optional, Sequence, Tuple, Type, Union import numpy as np from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.io.formats.format import DataFrameFormatter, TableFormatter +from pandas.io.formats.format import DataFrameFormatter -class LatexFormatter(TableFormatter): +def _split_into_full_short_caption( + caption: Optional[Union[str, Tuple[str, str]]] +) -> Tuple[str, str]: + """Extract full and short captions from caption string/tuple. 
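+
+    For example (illustrative)::
+
+        "full caption"     -> ("full caption", "")
+        ("full", "short")  -> ("full", "short")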
+ + Parameters + ---------- + caption : str or tuple, optional + Either table caption string or tuple (full_caption, short_caption). + If string is provided, then it is treated as table full caption, + while short_caption is considered an empty string. + + Returns + ------- + full_caption, short_caption : tuple + Tuple of full_caption, short_caption strings. """ - Used to render a DataFrame to a LaTeX tabular/longtable environment output. + if caption: + if isinstance(caption, str): + full_caption = caption + short_caption = "" + else: + try: + full_caption, short_caption = caption + except ValueError as err: + msg = "caption must be either a string or a tuple of two strings" + raise ValueError(msg) from err + else: + full_caption = "" + short_caption = "" + return full_caption, short_caption + + +class RowStringConverter(ABC): + r"""Converter for dataframe rows into LaTeX strings. Parameters ---------- formatter : `DataFrameFormatter` - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 columns - longtable : boolean, default False - Use a longtable environment instead of tabular. + Instance of `DataFrameFormatter`. + multicolumn: bool, optional + Whether to use \multicolumn macro. + multicolumn_format: str, optional + Multicolumn format. + multirow: bool, optional + Whether to use \multirow macro. - See Also - -------- - HTMLFormatter """ def __init__( self, formatter: DataFrameFormatter, - column_format: Optional[str] = None, - longtable: bool = False, multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, - caption: Optional[str] = None, - label: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.bold_rows - self.column_format = column_format - self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow - self.caption = caption - self.label = label - self.escape = self.fmt.escape + self.clinebuf: List[List[int]] = [] + self.strcols = self._get_strcols() + self.strrows = list(zip(*self.strcols)) + + def get_strrow(self, row_num: int) -> str: + """Get string representation of the row.""" + row = self.strrows[row_num] + + is_multicol = ( + row_num < self.column_levels and self.fmt.header and self.multicolumn + ) + + is_multirow = ( + row_num >= self.header_levels + and self.fmt.index + and self.multirow + and self.index_levels > 1 + ) + + is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1 + + crow = self._preprocess_row(row) + + if is_multicol: + crow = self._format_multicolumn(crow) + if is_multirow: + crow = self._format_multirow(crow, row_num) + + lst = [] + lst.append(" & ".join(crow)) + lst.append(" \\\\") + if is_cline_maybe_required: + cline = self._compose_cline(row_num, len(self.strcols)) + lst.append(cline) + return "".join(lst) + + @property + def _header_row_num(self) -> int: + """Number of rows in header.""" + return self.header_levels if self.fmt.header else 0 + + @property + def index_levels(self) -> int: + """Integer number of levels in index.""" + return self.frame.index.nlevels + + @property + def column_levels(self) -> int: + return self.frame.columns.nlevels + + @property + def header_levels(self) -> int: + nlevels = self.column_levels + if self.fmt.has_index_names and self.fmt.show_index_names: + nlevels += 1 + return nlevels - def write_result(self, buf: IO[str]) -> None: - """ - Render a DataFrame to a LaTeX tabular, longtable, 
or table/tabular - environment output. - """ - # string representation of the columns - if len(self.frame.columns) == 0 or len(self.frame.index) == 0: - info_line = ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {self.frame.columns}\n" - f"Index: {self.frame.index}" - ) - strcols = [[info_line]] + def _get_strcols(self) -> List[List[str]]: + """String representation of the columns.""" + if self.fmt.frame.empty: + strcols = [[self._empty_info_line]] else: - strcols = self.fmt._to_str_columns() - - def get_col_type(dtype): - if issubclass(dtype.type, np.number): - return "r" - else: - return "l" + strcols = self.fmt.get_strcols() - # reestablish the MultiIndex that has been joined by _to_str_column + # reestablish the MultiIndex that has been joined by get_strcols() if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( adjoin=False, @@ -90,11 +153,11 @@ def pad_empties(x): break return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] - out = (pad_empties(i) for i in out) + gen = (pad_empties(i) for i in out) # Add empty spaces for each column level clevels = self.frame.columns.nlevels - out = [[" " * len(i[-1])] * clevels + i for i in out] + out = [[" " * len(i[-1])] * clevels + i for i in gen] # Add the column names to the last index column cnames = self.frame.columns.names @@ -104,95 +167,27 @@ def pad_empties(x): # Get rid of old multiindex column and add new ones strcols = out + strcols[1:] - - if self.column_format is None: - dtypes = self.frame.dtypes._values - column_format = "".join(map(get_col_type, dtypes)) - if self.fmt.index: - index_format = "l" * self.frame.index.nlevels - column_format = index_format + column_format - elif not isinstance(self.column_format, str): # pragma: no cover - raise AssertionError( - f"column_format must be str or unicode, not {type(column_format)}" - ) - else: - column_format = self.column_format - - if self.longtable: - self._write_longtable_begin(buf, column_format) - else: - self._write_tabular_begin(buf, column_format) - - buf.write("\\toprule\n") - - ilevels = self.frame.index.nlevels - clevels = self.frame.columns.nlevels - nlevels = clevels - if self.fmt.has_index_names and self.fmt.show_index_names: - nlevels += 1 - strrows = list(zip(*strcols)) - self.clinebuf: List[List[int]] = [] - - for i, row in enumerate(strrows): - if i == nlevels and self.fmt.header: - buf.write("\\midrule\n") # End of header - if self.longtable: - buf.write("\\endhead\n") - buf.write("\\midrule\n") - buf.write( - f"\\multicolumn{{{len(row)}}}{{r}}" - "{{Continued on next page}} \\\\\n" - ) - buf.write("\\midrule\n") - buf.write("\\endfoot\n\n") - buf.write("\\bottomrule\n") - buf.write("\\endlastfoot\n") - if self.escape: - # escape backslashes first - crow = [ - ( - x.replace("\\", "\\textbackslash ") - .replace("_", "\\_") - .replace("%", "\\%") - .replace("$", "\\$") - .replace("#", "\\#") - .replace("{", "\\{") - .replace("}", "\\}") - .replace("~", "\\textasciitilde ") - .replace("^", "\\textasciicircum ") - .replace("&", "\\&") - if (x and x != "{}") - else "{}" - ) - for x in row - ] - else: - crow = [x if x else "{}" for x in row] - if self.bold_rows and self.fmt.index: - # bold row labels - crow = [ - f"\\textbf{{{x}}}" - if j < ilevels and x.strip() not in ["", "{}"] - else x - for j, x in enumerate(crow) - ] - if i < clevels and self.fmt.header and self.multicolumn: - # sum up columns to multicolumns - crow = self._format_multicolumn(crow, ilevels) - if i >= nlevels and self.fmt.index and self.multirow 
and ilevels > 1: - # sum up rows to multirows - crow = self._format_multirow(crow, ilevels, i, strrows) - buf.write(" & ".join(crow)) - buf.write(" \\\\\n") - if self.multirow and i < len(strrows) - 1: - self._print_cline(buf, i, len(strcols)) - - if self.longtable: - self._write_longtable_end(buf) + return strcols + + @property + def _empty_info_line(self): + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {self.frame.columns}\n" + f"Index: {self.frame.index}" + ) + + def _preprocess_row(self, row: Sequence[str]) -> List[str]: + """Preprocess elements of the row.""" + if self.fmt.escape: + crow = _escape_symbols(row) else: - self._write_tabular_end(buf) + crow = [x if x else "{}" for x in row] + if self.fmt.bold_rows and self.fmt.index: + crow = _convert_to_bold(crow, self.index_levels) + return crow - def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: + def _format_multicolumn(self, row: List[str]) -> List[str]: r""" Combine columns belonging to a group to a single multicolumn entry according to self.multicolumn_format @@ -202,7 +197,7 @@ def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: will become \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} """ - row2 = list(row[:ilevels]) + row2 = row[: self.index_levels] ncol = 1 coltext = "" @@ -217,7 +212,7 @@ def append_col(): else: row2.append(coltext) - for c in row[ilevels:]: + for c in row[self.index_levels :]: # if next col has text, write the previous if c.strip(): if coltext: @@ -232,9 +227,7 @@ def append_col(): append_col() return row2 - def _format_multirow( - self, row: List[str], ilevels: int, i: int, rows: List[Tuple[str, ...]] - ) -> List[str]: + def _format_multirow(self, row: List[str], i: int) -> List[str]: r""" Check following rows, whether row should be a multirow @@ -244,10 +237,10 @@ def _format_multirow( b & 0 & \cline{1-2} b & 0 & """ - for j in range(ilevels): + for j in range(self.index_levels): if row[j].strip(): nrow = 1 - for r in rows[i + 1 :]: + for r in self.strrows[i + 1 :]: if not r[j].strip(): nrow += 1 else: @@ -259,114 +252,574 @@ def _format_multirow( self.clinebuf.append([i + nrow - 1, j + 1]) return row - def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: + def _compose_cline(self, i: int, icol: int) -> str: """ - Print clines after multirow-blocks are finished. + Create clines after multirow-blocks are finished. 
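+
+        For example (illustrative): with ``clinebuf == [[3, 1]]``, calling
+        ``_compose_cline(3, 4)`` emits ``\cline{1-4}`` on a new line and
+        removes the consumed buffer entry.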
""" + lst = [] for cl in self.clinebuf: if cl[0] == i: - buf.write(f"\\cline{{{cl[1]:d}-{icol:d}}}\n") - # remove entries that have been written to buffer - self.clinebuf = [x for x in self.clinebuf if x[0] != i] + lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}") + # remove entries that have been written to buffer + self.clinebuf = [x for x in self.clinebuf if x[0] != i] + return "".join(lst) + + +class RowStringIterator(RowStringConverter): + """Iterator over rows of the header or the body of the table.""" + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over LaTeX string representations of rows.""" + + +class RowHeaderIterator(RowStringIterator): + """Iterator for the table header rows.""" + + def __iter__(self) -> Iterator[str]: + for row_num in range(len(self.strrows)): + if row_num < self._header_row_num: + yield self.get_strrow(row_num) + + +class RowBodyIterator(RowStringIterator): + """Iterator for the table body rows.""" + + def __iter__(self) -> Iterator[str]: + for row_num in range(len(self.strrows)): + if row_num >= self._header_row_num: + yield self.get_strrow(row_num) - def _write_tabular_begin(self, buf, column_format: str): + +class TableBuilderAbstract(ABC): + """ + Abstract table builder producing string representation of LaTeX table. + + Parameters + ---------- + formatter : `DataFrameFormatter` + Instance of `DataFrameFormatter`. + column_format: str, optional + Column format, for example, 'rcl' for three columns. + multicolumn: bool, optional + Use multicolumn to enhance MultiIndex columns. + multicolumn_format: str, optional + The alignment for multicolumns, similar to column_format. + multirow: bool, optional + Use multirow to enhance MultiIndex rows. + caption: str, optional + Table caption. + short_caption: str, optional + Table short caption. + label: str, optional + LaTeX label. + position: str, optional + Float placement specifier, for example, 'htb'. 
+ """ + + def __init__( + self, + formatter: DataFrameFormatter, + column_format: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + short_caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ): + self.fmt = formatter + self.column_format = column_format + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption = caption + self.short_caption = short_caption + self.label = label + self.position = position + + def get_result(self) -> str: + """String representation of LaTeX table.""" + elements = [ + self.env_begin, + self.top_separator, + self.header, + self.middle_separator, + self.env_body, + self.bottom_separator, + self.env_end, + ] + result = "\n".join([item for item in elements if item]) + trailing_newline = "\n" + result += trailing_newline + return result + + @property + @abstractmethod + def env_begin(self) -> str: + """Beginning of the environment.""" + + @property + @abstractmethod + def top_separator(self) -> str: + """Top level separator.""" + + @property + @abstractmethod + def header(self) -> str: + """Header lines.""" + + @property + @abstractmethod + def middle_separator(self) -> str: + """Middle level separator.""" + + @property + @abstractmethod + def env_body(self) -> str: + """Environment body.""" + + @property + @abstractmethod + def bottom_separator(self) -> str: + """Bottom level separator.""" + + @property + @abstractmethod + def env_end(self) -> str: + """End of the environment.""" + + +class GenericTableBuilder(TableBuilderAbstract): + """Table builder producing string representation of LaTeX table.""" + + @property + def header(self) -> str: + iterator = self._create_row_iterator(over="header") + return "\n".join(list(iterator)) + + @property + def top_separator(self) -> str: + return "\\toprule" + + @property + def middle_separator(self) -> str: + return "\\midrule" if self._is_separator_required() else "" + + @property + def env_body(self) -> str: + iterator = self._create_row_iterator(over="body") + return "\n".join(list(iterator)) + + def _is_separator_required(self) -> bool: + return bool(self.header and self.env_body) + + @property + def _position_macro(self) -> str: + r"""Position macro, extracted from self.position, like [h].""" + return f"[{self.position}]" if self.position else "" + + @property + def _caption_macro(self) -> str: + r"""Caption macro, extracted from self.caption. + + With short caption: + \caption[short_caption]{caption_string}. + + Without short caption: + \caption{caption_string}. """ - Write the beginning of a tabular environment or - nested table/tabular environments including caption and label. + if self.caption: + return "".join( + [ + r"\caption", + f"[{self.short_caption}]" if self.short_caption else "", + f"{{{self.caption}}}", + ] + ) + return "" + + @property + def _label_macro(self) -> str: + r"""Label macro, extracted from self.label, like \label{ref}.""" + return f"\\label{{{self.label}}}" if self.label else "" + + def _create_row_iterator(self, over: str) -> RowStringIterator: + """Create iterator over header or body of the table. Parameters ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. 
- column_format : str - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' - for 3 columns - """ - if self.caption is not None or self.label is not None: - # then write output in a nested table/tabular environment - if self.caption is None: - caption_ = "" - else: - caption_ = f"\n\\caption{{{self.caption}}}" + over : {'body', 'header'} + Over what to iterate. - if self.label is None: - label_ = "" - else: - label_ = f"\n\\label{{{self.label}}}" + Returns + ------- + RowStringIterator + Iterator over body or header. + """ + iterator_kind = self._select_iterator(over) + return iterator_kind( + formatter=self.fmt, + multicolumn=self.multicolumn, + multicolumn_format=self.multicolumn_format, + multirow=self.multirow, + ) + + def _select_iterator(self, over: str) -> Type[RowStringIterator]: + """Select proper iterator over table rows.""" + if over == "header": + return RowHeaderIterator + elif over == "body": + return RowBodyIterator + else: + msg = f"'over' must be either 'header' or 'body', but {over} was provided" + raise ValueError(msg) + + +class LongTableBuilder(GenericTableBuilder): + """Concrete table builder for longtable. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = LongTableBuilder(formatter, caption='a long table', + ... label='tab:long', column_format='lrl') + >>> table = builder.get_result() + >>> print(table) + \\begin{longtable}{lrl} + \\caption{a long table} + \\label{tab:long}\\\\ + \\toprule + {} & a & b \\\\ + \\midrule + \\endfirsthead + \\caption[]{a long table} \\\\ + \\toprule + {} & a & b \\\\ + \\midrule + \\endhead + \\midrule + \\multicolumn{3}{r}{{Continued on next page}} \\\\ + \\midrule + \\endfoot + + \\bottomrule + \\endlastfoot + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\end{longtable} + + """ - buf.write(f"\\begin{{table}}\n\\centering{caption_}{label_}\n") + @property + def env_begin(self) -> str: + first_row = ( + f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" + ) + elements = [first_row, f"{self._caption_and_label()}"] + return "\n".join([item for item in elements if item]) + + def _caption_and_label(self) -> str: + if self.caption or self.label: + double_backslash = "\\\\" + elements = [f"{self._caption_macro}", f"{self._label_macro}"] + caption_and_label = "\n".join([item for item in elements if item]) + caption_and_label += double_backslash + return caption_and_label else: - # then write output only in a tabular environment - pass + return "" + + @property + def middle_separator(self) -> str: + iterator = self._create_row_iterator(over="header") + + # the content between \endfirsthead and \endhead commands + # mitigates repeated List of Tables entries in the final LaTeX + # document when dealing with longtable environments; GH #34360 + elements = [ + "\\midrule", + "\\endfirsthead", + f"\\caption[]{{{self.caption}}} \\\\" if self.caption else "", + self.top_separator, + self.header, + "\\midrule", + "\\endhead", + "\\midrule", + f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}" + "{{Continued on next page}} \\\\", + "\\midrule", + "\\endfoot\n", + "\\bottomrule", + "\\endlastfoot", + ] + if self._is_separator_required(): + return "\n".join(elements) + return "" + + @property + def bottom_separator(self) -> str: + return "" + + @property + def env_end(self) -> str: + return "\\end{longtable}" + + +class RegularTableBuilder(GenericTableBuilder): + """Concrete 
table builder for regular table. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab', + ... column_format='lrc') + >>> table = builder.get_result() + >>> print(table) + \\begin{table} + \\centering + \\caption{caption} + \\label{lab} + \\begin{tabular}{lrc} + \\toprule + {} & a & b \\\\ + \\midrule + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\bottomrule + \\end{tabular} + \\end{table} + + """ + + @property + def env_begin(self) -> str: + elements = [ + f"\\begin{{table}}{self._position_macro}", + "\\centering", + f"{self._caption_macro}", + f"{self._label_macro}", + f"\\begin{{tabular}}{{{self.column_format}}}", + ] + return "\n".join([item for item in elements if item]) + + @property + def bottom_separator(self) -> str: + return "\\bottomrule" + + @property + def env_end(self) -> str: + return "\n".join(["\\end{tabular}", "\\end{table}"]) + + +class TabularBuilder(GenericTableBuilder): + """Concrete table builder for tabular environment. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = TabularBuilder(formatter, column_format='lrc') + >>> table = builder.get_result() + >>> print(table) + \\begin{tabular}{lrc} + \\toprule + {} & a & b \\\\ + \\midrule + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\bottomrule + \\end{tabular} + + """ + + @property + def env_begin(self) -> str: + return f"\\begin{{tabular}}{{{self.column_format}}}" + + @property + def bottom_separator(self) -> str: + return "\\bottomrule" - buf.write(f"\\begin{{tabular}}{{{column_format}}}\n") + @property + def env_end(self) -> str: + return "\\end{tabular}" - def _write_tabular_end(self, buf): + +class LatexFormatter: + r""" + Used to render a DataFrame to a LaTeX tabular/longtable environment output. + + Parameters + ---------- + formatter : `DataFrameFormatter` + longtable : bool, default False + Use longtable environment. + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 columns + multicolumn : bool, default False + Use \multicolumn to enhance MultiIndex columns. + multicolumn_format : str, default 'l' + The alignment for multicolumns, similar to `column_format` + multirow : bool, default False + Use \multirow to enhance MultiIndex rows. + caption : str or tuple, optional + Tuple (full_caption, short_caption), + which results in \caption[short_caption]{full_caption}; + if a single string is passed, no short caption will be set. + label : str, optional + The LaTeX label to be placed inside ``\label{}`` in the output. + position : str, optional + The LaTeX positional argument for tables, to be placed after + ``\begin{}`` in the output. 
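+
+    A minimal sketch of intended usage (construction of ``DataFrameFormatter``
+    directly from a frame is assumed)::
+
+        formatter = DataFrameFormatter(df)
+        latex = LatexFormatter(formatter, caption="A table", label="tab:a")
+        print(latex.to_string())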
+ + See Also + -------- + HTMLFormatter + """ + + def __init__( + self, + formatter: DataFrameFormatter, + longtable: bool = False, + column_format: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[Union[str, Tuple[str, str]]] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ): + self.fmt = formatter + self.frame = self.fmt.frame + self.longtable = longtable + self.column_format = column_format + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption, self.short_caption = _split_into_full_short_caption(caption) + self.label = label + self.position = position + + def to_string(self) -> str: + """ + Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. """ - Write the end of a tabular environment or nested table/tabular - environment. + return self.builder.get_result() - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. + @property + def builder(self) -> TableBuilderAbstract: + """Concrete table builder. + Returns + ------- + TableBuilder """ - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") - if self.caption is not None or self.label is not None: - buf.write("\\end{table}\n") + builder = self._select_builder() + return builder( + formatter=self.fmt, + column_format=self.column_format, + multicolumn=self.multicolumn, + multicolumn_format=self.multicolumn_format, + multirow=self.multirow, + caption=self.caption, + short_caption=self.short_caption, + label=self.label, + position=self.position, + ) + + def _select_builder(self) -> Type[TableBuilderAbstract]: + """Select proper table builder.""" + if self.longtable: + return LongTableBuilder + if any([self.caption, self.label, self.position]): + return RegularTableBuilder + return TabularBuilder + + @property + def column_format(self) -> Optional[str]: + """Column format.""" + return self._column_format + + @column_format.setter + def column_format(self, input_column_format: Optional[str]) -> None: + """Setter for column format.""" + if input_column_format is None: + self._column_format = ( + self._get_index_format() + self._get_column_format_based_on_dtypes() + ) + elif not isinstance(input_column_format, str): + raise ValueError( + f"column_format must be str or unicode, " + f"not {type(input_column_format)}" + ) else: - pass + self._column_format = input_column_format - def _write_longtable_begin(self, buf, column_format: str): - """ - Write the beginning of a longtable environment including caption and - label if provided by user. + def _get_column_format_based_on_dtypes(self) -> str: + """Get column format based on data type. - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. - column_format : str - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' - for 3 columns + Right alignment for numbers and left - for strings. 
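+
+        For example (illustrative), dtypes ``(int64, object, float64)`` map
+        to ``'rlr'``; the column-format setter then prepends one ``'l'`` per
+        index level when the index is shown.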
""" - buf.write(f"\\begin{{longtable}}{{{column_format}}}\n") - if self.caption is not None or self.label is not None: - if self.caption is None: - pass - else: - buf.write(f"\\caption{{{self.caption}}}") + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return "r" + return "l" - if self.label is None: - pass - else: - buf.write(f"\\label{{{self.label}}}") + dtypes = self.frame.dtypes._values + return "".join(map(get_col_type, dtypes)) - # a double-backslash is required at the end of the line - # as discussed here: - # https://tex.stackexchange.com/questions/219138 - buf.write("\\\\\n") - else: - pass + def _get_index_format(self) -> str: + """Get index column format.""" + return "l" * self.frame.index.nlevels if self.fmt.index else "" - @staticmethod - def _write_longtable_end(buf): - """ - Write the end of a longtable environment. - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. +def _escape_symbols(row: Sequence[str]) -> List[str]: + """Carry out string replacements for special symbols. - """ - buf.write("\\end{longtable}\n") + Parameters + ---------- + row : list + List of string, that may contain special symbols. + + Returns + ------- + list + list of strings with the special symbols replaced. + """ + return [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] + + +def _convert_to_bold(crow: Sequence[str], ilevels: int) -> List[str]: + """Convert elements in ``crow`` to bold.""" + return [ + f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x + for j, x in enumerate(crow) + ] + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 1cf79dc105901..128e50d84657c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -12,6 +12,7 @@ Mapping, Optional, Sequence, + Sized, Tuple, TypeVar, Union, @@ -205,7 +206,7 @@ def as_escaped_string( translate = escape_chars escape_chars = list(escape_chars.keys()) else: - escape_chars = escape_chars or tuple() + escape_chars = escape_chars or () result = str(thing) for c in escape_chars: @@ -243,7 +244,7 @@ def pprint_thing_encoded( return value.encode(encoding, errors) -def _enable_data_resource_formatter(enable: bool) -> None: +def enable_data_resource_formatter(enable: bool) -> None: if "IPython" not in sys.modules: # definitely not in IPython return @@ -307,7 +308,7 @@ def format_object_summary( name : name, optional defaults to the class name of the obj indent_for_name : bool, default True - Whether subsequent lines should be be indented to + Whether subsequent lines should be indented to align with the name. line_break_each_value : bool, default False If True, inserts a line break for each value of ``obj``. 
@@ -321,7 +322,7 @@ def format_object_summary( summary string """ from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import _get_adjustment + from pandas.io.formats.format import get_adjustment display_width, _ = get_console_size() if display_width is None: @@ -350,7 +351,7 @@ def format_object_summary( is_truncated = n > max_seq_items # adj can optionally handle unicode eastern asian width - adj = _get_adjustment() + adj = get_adjustment() def _extend_line( s: str, line: str, value: str, display_width: int, next_line_prefix: str @@ -499,11 +500,11 @@ def _justify( # error: Incompatible return value type (got "Tuple[List[Sequence[str]], # List[Sequence[str]]]", expected "Tuple[List[Tuple[str, ...]], # List[Tuple[str, ...]]]") - return head, tail # type: ignore + return head, tail # type: ignore[return-value] def format_object_attrs( - obj: Sequence, include_dtype: bool = True + obj: Sized, include_dtype: bool = True ) -> List[Tuple[str, Union[str, int]]]: """ Return a list of tuples of the (attr, formatted_value) @@ -512,7 +513,7 @@ def format_object_attrs( Parameters ---------- obj : object - must be iterable + Must be sized. include_dtype : bool If False, dtype won't be in the returned list @@ -523,15 +524,17 @@ def format_object_attrs( """ attrs: List[Tuple[str, Union[str, int]]] = [] if hasattr(obj, "dtype") and include_dtype: - # error: "Sequence[Any]" has no attribute "dtype" - attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore + # error: "Sized" has no attribute "dtype" + attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined] if getattr(obj, "name", None) is not None: - # error: "Sequence[Any]" has no attribute "name" - attrs.append(("name", default_pprint(obj.name))) # type: ignore - # error: "Sequence[Any]" has no attribute "names" - elif getattr(obj, "names", None) is not None and any(obj.names): # type: ignore - # error: "Sequence[Any]" has no attribute "names" - attrs.append(("names", default_pprint(obj.names))) # type: ignore + # error: "Sized" has no attribute "name" + attrs.append(("name", default_pprint(obj.name))) # type: ignore[attr-defined] + # error: "Sized" has no attribute "names" + elif getattr(obj, "names", None) is not None and any( + obj.names # type: ignore[attr-defined] + ): + # error: "Sized" has no attribute "names" + attrs.append(("names", default_pprint(obj.names))) # type: ignore[attr-defined] max_seq_items = get_option("display.max_seq_items") or len(obj) if len(obj) > max_seq_items: attrs.append(("length", len(obj))) diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py new file mode 100644 index 0000000000000..4ebb78f29c739 --- /dev/null +++ b/pandas/io/formats/string.py @@ -0,0 +1,201 @@ +""" +Module for formatting output data in console (to string). 
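+
+A minimal sketch of intended usage (a ``DataFrameFormatter`` built directly
+from a frame is assumed)::
+
+    import pandas as pd
+
+    from pandas.io.formats.format import DataFrameFormatter
+    from pandas.io.formats.string import StringFormatter
+
+    formatter = DataFrameFormatter(pd.DataFrame({"a": [1, 2]}))
+    text = StringFormatter(formatter, line_width=80).to_string()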
+""" +from shutil import get_terminal_size +from typing import Iterable, List, Optional + +import numpy as np + +from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.printing import pprint_thing + + +class StringFormatter: + """Formatter for string representation of a dataframe.""" + + def __init__(self, fmt: DataFrameFormatter, line_width: Optional[int] = None): + self.fmt = fmt + self.adj = fmt.adj + self.frame = fmt.frame + self.line_width = line_width + + def to_string(self) -> str: + text = self._get_string_representation() + if self.fmt.should_show_dimensions: + text = "".join([text, self.fmt.dimensions_info]) + return text + + def _get_strcols(self) -> List[List[str]]: + strcols = self.fmt.get_strcols() + if self.fmt.is_truncated: + strcols = self._insert_dot_separators(strcols) + return strcols + + def _get_string_representation(self) -> str: + if self.fmt.frame.empty: + return self._empty_info_line + + strcols = self._get_strcols() + + if self.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self._need_to_wrap_around: + return self._join_multiline(strcols) + + return self._fit_strcols_to_terminal_width(strcols) + + @property + def _empty_info_line(self) -> str: + return ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" + ) + + @property + def _need_to_wrap_around(self) -> bool: + return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) + + def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: + str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) + index_length = len(str_index) + + if self.fmt.is_truncated_horizontally: + strcols = self._insert_dot_separator_horizontal(strcols, index_length) + + if self.fmt.is_truncated_vertically: + strcols = self._insert_dot_separator_vertical(strcols, index_length) + + return strcols + + def _insert_dot_separator_horizontal( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) + return strcols + + def _insert_dot_separator_vertical( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + n_header_rows = index_length - len(self.fmt.tr_frame) + row_num = self.fmt.tr_row_num + for ix, col in enumerate(strcols): + cwidth = self.adj.len(col[row_num]) + + if self.fmt.is_truncated_horizontally: + is_dot_col = ix == self.fmt.tr_col_num + 1 + else: + is_dot_col = False + + if cwidth > 3 or is_dot_col: + dots = "..." + else: + dots = ".." 
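+
+            # Justification of the dot string: the index column (ix == 0) is
+            # left-aligned, data columns are right-aligned, and the inserted
+            # "..." truncation column is padded to a fixed width of 4.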
+ + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + + dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] + col.insert(row_num + n_header_rows, dot_str) + return strcols + + def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(strcols_input) + + if self.fmt.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.fmt.is_truncated_vertically: + assert self.fmt.max_rows_fitted is not None + nrows = self.fmt.max_rows_fitted + 1 + else: + nrows = len(self.frame) + + str_lst = [] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] + if self.fmt.index: + row.insert(0, idx) + if nbins > 1: + if end <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + start = end + return "\n\n".join(str_lst) + + def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: + from pandas import Series + + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + width, _ = get_terminal_size() + dif = max_len - width + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + + # subtract index column + max_cols_fitted = n_cols - self.fmt.index + # GH-21180. Ensure that we print at least two. + max_cols_fitted = max(max_cols_fitted, 2) + self.fmt.max_cols_fitted = max_cols_fitted + + # Call again _truncate to cut frame appropriately + # and then generate string representation + self.fmt.truncate() + strcols = self._get_strcols() + return self.adj.adjoin(1, *strcols) + + +def _binify(cols: List[int], line_width: int) -> List[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d11144938eb26..4557c10927a15 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1,7 +1,6 @@ """ Module for applying conditional formatting to DataFrames and Series. 
""" - from collections import defaultdict from contextlib import contextmanager import copy @@ -18,7 +17,7 @@ Tuple, Union, ) -from uuid import uuid1 +from uuid import uuid4 import numpy as np @@ -33,17 +32,18 @@ import pandas as pd from pandas.api.types import is_dict_like, is_list_like +from pandas.core import generic import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice +from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") try: - import matplotlib.pyplot as plt from matplotlib import colors + import matplotlib.pyplot as plt has_mpl = True except ImportError: @@ -89,6 +89,12 @@ class Styler: .. versionadded:: 1.0.0 + uuid_len : int, default 5 + If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate + expressed in hex characters, in range [0, 32]. + + .. versionadded:: 1.2.0 + Attributes ---------- env : Jinja2 jinja2.Environment @@ -144,6 +150,7 @@ def __init__( table_attributes: Optional[str] = None, cell_ids: bool = True, na_rep: Optional[str] = None, + uuid_len: int = 5, ): self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) self._todo: List[Tuple[Callable, Tuple, Dict]] = [] @@ -159,7 +166,10 @@ def __init__( self.index = data.index self.columns = data.columns - self.uuid = uuid + if not isinstance(uuid_len, int) or not uuid_len >= 0: + raise TypeError("``uuid_len`` must be an integer in range [0, 32].") + self.uuid_len = min(32, uuid_len) + self.uuid = (uuid or uuid4().hex[: self.uuid_len]) + "_" self.table_styles = table_styles self.caption = caption if precision is None: @@ -171,6 +181,8 @@ def __init__( self.cell_ids = cell_ids self.na_rep = na_rep + self.cell_context: Dict[str, Any] = {} + # display_funcs maps (row, col) -> formatting function def default_display_func(x): @@ -192,7 +204,11 @@ def _repr_html_(self) -> str: """ return self.render() - @doc(NDFrame.to_excel, klass="Styler") + @doc( + NDFrame.to_excel, + klass="Styler", + storage_options=generic._shared_docs["storage_options"], + ) def to_excel( self, excel_writer, @@ -246,7 +262,7 @@ def _translate(self): precision = self.precision hidden_index = self.hidden_index hidden_columns = self.hidden_columns - uuid = self.uuid or str(uuid1()).replace("-", "_") + uuid = self.uuid ROW_HEADING_CLASS = "row_heading" COL_HEADING_CLASS = "col_heading" INDEX_NAME_CLASS = "index_name" @@ -262,7 +278,7 @@ def format_attr(pair): idx_lengths = _get_level_lengths(self.index) col_lengths = _get_level_lengths(self.columns, hidden_columns) - cell_context = dict() + cell_context = self.cell_context n_rlvls = self.data.index.nlevels n_clvls = self.data.columns.nlevels @@ -327,7 +343,7 @@ def format_attr(pair): colspan = col_lengths.get((r, c), 0) if colspan > 1: es["attributes"] = [ - format_attr({"key": "colspan", "value": colspan}) + format_attr({"key": "colspan", "value": f'"{colspan}"'}) ] row_es.append(es) head.append(row_es) @@ -390,16 +406,16 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style - if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): + props = [] + if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) + for x in ctx[r, c]: + # have to handle empty styles like [''] + if x.count(":"): + props.append(tuple(x.split(":"))) + else: + props.append(("", "")) 
row_es.append(row_dict) - props = [] - for x in ctx[r, c]: - # have to handle empty styles like [''] - if x.count(":"): - props.append(tuple(x.split(":"))) - else: - props.append(("", "")) cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) @@ -417,16 +433,16 @@ def format_attr(pair): else: table_attr += ' class="tex2jax_ignore"' - return dict( - head=head, - cellstyle=cellstyle, - body=body, - uuid=uuid, - precision=precision, - table_styles=table_styles, - caption=caption, - table_attributes=table_attr, - ) + return { + "head": head, + "cellstyle": cellstyle, + "body": body, + "uuid": uuid, + "precision": precision, + "table_styles": table_styles, + "caption": caption, + "table_attributes": table_attr, + } def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Styler": """ @@ -475,7 +491,7 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) else: - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) if len(subset) == 1: subset = subset, self.data.columns @@ -499,6 +515,69 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style self._display_funcs[(i, j)] = formatter return self + def set_td_classes(self, classes: DataFrame) -> "Styler": + """ + Add string based CSS class names to data cells that will appear within the + `Styler` HTML result. These classes are added within specified `
          ` elements. + + Parameters + ---------- + classes : DataFrame + DataFrame containing strings that will be translated to CSS classes, + mapped by identical column and index values that must exist on the + underlying `Styler` data. None, NaN values, and empty strings will + be ignored and not affect the rendered HTML. + + Returns + ------- + self : Styler + + Examples + -------- + >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + >>> classes = pd.DataFrame([ + ... ["min-val red", "", "blue"], + ... ["red", None, "blue max-val"] + ... ], index=df.index, columns=df.columns) + >>> df.style.set_td_classes(classes) + + Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the + underlying, + + >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"], + ... columns=[["level0", "level0"], ["level1a", "level1b"]]) + >>> classes = pd.DataFrame(["min-val"], index=["a"], + ... columns=[["level0"],["level1a"]]) + >>> df.style.set_td_classes(classes) + + Form of the output with new additional css classes, + + >>> df = pd.DataFrame([[1]]) + >>> css = pd.DataFrame(["other-class"]) + >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css) + >>> s.hide_index().render() + '' + '' + ' ' + ' ' + ' ' + ' ' + ' ' + ' ' + '
+        '    <tr><th class="col_heading level0 col0" >0</th></tr>'
+        '    <tr><td  class="data row0 col0 other-class" >1</td></tr>'
          ' + """ + classes = classes.reindex_like(self.data) + + mask = (classes.isna()) | (classes.eq("")) + self.cell_context["data"] = { + r: {c: [str(classes.iloc[r, c])]} + for r, rn in enumerate(classes.index) + for c, cn in enumerate(classes.columns) + if not mask.iloc[r, c] + } + + return self + def render(self, **kwargs) -> str: """ Render the built up styles to HTML. @@ -609,6 +688,7 @@ def clear(self) -> None: Returns None. """ self.ctx.clear() + self.cell_context = {} self._todo = [] def _compute(self): @@ -633,7 +713,7 @@ def _apply( **kwargs, ) -> "Styler": subset = slice(None) if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: result = data.apply(func, axis=axis, result_type="expand", **kwargs) @@ -725,7 +805,7 @@ def _applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: subset = pd.IndexSlice[:] - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) result = self.data.loc[subset].applymap(func) self._update_ctx(result) return self @@ -752,7 +832,8 @@ def applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": See Also -------- - Styler.where + Styler.where: Updates the HTML representation with a style which is + selected in accordance with the return value of a function. """ self._todo.append( (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) @@ -793,7 +874,7 @@ def where( See Also -------- - Styler.applymap + Styler.applymap: Updates the HTML representation with the result. """ if other is None: other = "" @@ -822,7 +903,7 @@ def set_table_attributes(self, attributes: str) -> "Styler": Set the table attributes. These are the items that show up in the opening ```` tag - in addition to to automatic (by default) id. + in addition to automatic (by default) id. Parameters ---------- @@ -853,7 +934,7 @@ def export(self) -> List[Tuple[Callable, Tuple, Dict]]: See Also -------- - Styler.use + Styler.use: Set the styles on the current Styler. """ return self._todo @@ -874,7 +955,7 @@ def use(self, styles: List[Tuple[Callable, Tuple, Dict]]) -> "Styler": See Also -------- - Styler.export + Styler.export : Export the styles to applied to the current Styler. """ self._todo.extend(styles) return self @@ -909,20 +990,46 @@ def set_caption(self, caption: str) -> "Styler": self.caption = caption return self - def set_table_styles(self, table_styles) -> "Styler": + def set_table_styles(self, table_styles, axis=0, overwrite=True) -> "Styler": """ Set the table styles on a Styler. These are placed in a ``