Merging master (pandas-dev#35498)
alexhlim committed Aug 10, 2020
2 parents e083962 + 9a8152c commit f14a9e5
Showing 207 changed files with 2,687 additions and 1,255 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
@@ -4,7 +4,9 @@ on:
push:
branches: master
pull_request:
branches: master
branches:
- master
- 1.1.x

env:
ENV_FILE: environment.yml
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/frame_ctor.py
@@ -6,7 +6,7 @@
from .pandas_vb_common import tm

try:
from pandas.tseries.offsets import Nano, Hour
from pandas.tseries.offsets import Hour, Nano
except ImportError:
# For compatibility with older versions
from pandas.core.datetools import * # noqa
8 changes: 4 additions & 4 deletions asv_bench/benchmarks/gil.py
@@ -7,14 +7,14 @@

try:
from pandas import (
rolling_median,
rolling_kurt,
rolling_max,
rolling_mean,
rolling_median,
rolling_min,
rolling_max,
rolling_var,
rolling_skew,
rolling_kurt,
rolling_std,
rolling_var,
)

have_rolling_methods = True
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/io/parsers.py
@@ -2,8 +2,8 @@

try:
from pandas._libs.tslibs.parsing import (
concat_date_cols,
_does_string_look_like_datetime,
concat_date_cols,
)
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/tslibs/normalize.py
@@ -1,5 +1,5 @@
try:
from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized
from pandas._libs.tslibs import is_date_array_normalized, normalize_i8_timestamps
except ImportError:
from pandas._libs.tslibs.conversion import (
normalize_i8_timestamps,
2 changes: 2 additions & 0 deletions azure-pipelines.yml
@@ -1,9 +1,11 @@
# Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml
trigger:
- master
- 1.1.x

pr:
- master
- 1.1.x

variables:
PYTEST_WORKERS: auto
7 changes: 6 additions & 1 deletion ci/code_checks.sh
@@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then

# Imports - Check formatting using isort see setup.cfg for settings
MSG='Check import format using isort' ; echo $MSG
ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts"
ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts"
if [[ "$GITHUB_ACTIONS" == "true" ]]; then
eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]}))
else
@@ -230,6 +230,11 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"

# https://github.com/python/mypy/issues/7384
# MSG='Check for missing error codes with # type: ignore' ; echo $MSG
# invgrep -R --include="*.py" -P '# type: ignore(?!\[)' pandas
# RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG
invgrep -R --include=*.{py,pyx} '\.__class__' pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"
2 changes: 1 addition & 1 deletion ci/deps/azure-36-locale.yaml
@@ -7,7 +7,7 @@ dependencies:

# tools
- cython>=0.29.16
- pytest>=5.0.1
- pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620
- pytest-xdist>=1.21
- pytest-asyncio
- hypothesis>=3.58.0
36 changes: 29 additions & 7 deletions doc/source/development/contributing.rst
@@ -153,14 +153,38 @@ to build the documentation locally before pushing your changes.
Using a Docker container
~~~~~~~~~~~~~~~~~~~~~~~~

Instead of manually setting up a development environment, you can use Docker to
automatically create the environment with just several commands. Pandas provides a `DockerFile`
in the root directory to build a Docker image with a full pandas development environment.
Instead of manually setting up a development environment, you can use `Docker
<https://docs.docker.com/get-docker/>`_ to automatically create the environment with just several
commands. Pandas provides a `DockerFile` in the root directory to build a Docker image
with a full pandas development environment.

Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code,
**Docker Commands**

Pass your GitHub username in the `DockerFile` to use your own fork::

# Build the image pandas-yourname-env
docker build --tag pandas-yourname-env .
# Run a container and bind your local forked repo, pandas-yourname, to the container
docker run -it --rm -v path-to-pandas-yourname:/home/pandas-yourname pandas-yourname-env

Even easier, you can integrate Docker with the following IDEs:

**Visual Studio Code**

You can use the DockerFile to launch a remote session with Visual Studio Code,
a popular free IDE, using the `.devcontainer.json` file.
See https://code.visualstudio.com/docs/remote/containers for details.

**PyCharm (Professional)**

Enable Docker support and use the Services tool window to build and manage images as well as
run and interact with containers.
See https://www.jetbrains.com/help/pycharm/docker.html for details.

Note that you might need to rebuild the C extensions if/when you merge with upstream/master using::

python setup.py build_ext --inplace -j 4

.. _contributing.dev_c:

Installing a C compiler
@@ -751,7 +775,7 @@ Imports are alphabetically sorted within these sections.

As part of :ref:`Continuous Integration <contributing.ci>` checks we run::

isort --recursive --check-only pandas
isort --check-only pandas

to check that imports are correctly formatted as per the `setup.cfg`.

@@ -770,8 +794,6 @@ You should run::

to automatically format imports correctly. This will modify your local copy of the files.

The `--recursive` flag can be passed to sort all files in a directory.

Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above <contributing.code-formatting>`::

git diff upstream/master --name-only -- "*.py" | xargs -r isort
7 changes: 7 additions & 0 deletions doc/source/ecosystem.rst
@@ -80,6 +80,11 @@ ML pipeline.

Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community.

`Compose <https://github.com/FeatureLabs/compose>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning.

.. _ecosystem.visualization:

Visualization
@@ -445,6 +450,7 @@ Library Accessor Classes Description
`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library.
`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series.
`pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames.
`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing.
=============== ========== ========================= ===============================================================

.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
@@ -453,3 +459,4 @@
.. _pandas_path: https://github.com/drivendataorg/pandas-path/
.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
.. _pint-pandas: https://github.com/hgrecco/pint-pandas
.. _composeml: https://github.com/FeatureLabs/compose
26 changes: 26 additions & 0 deletions doc/source/user_guide/dsintro.rst
@@ -397,6 +397,32 @@ The result will be a DataFrame with the same index as the input Series, and
with one column whose name is the original name of the Series (only if no other
column name provided).


.. _basics.dataframe.from_list_namedtuples:

From a list of namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~

The field names of the first ``namedtuple`` in the list determine the columns
of the ``DataFrame``. The remaining namedtuples (or tuples) are simply unpacked
and their values are fed into the rows of the ``DataFrame``. If any of those
tuples is shorter than the first ``namedtuple`` then the later columns in the
corresponding row are marked as missing values. If any are longer than the
first ``namedtuple``, a ``ValueError`` is raised.

.. ipython:: python

    from collections import namedtuple

    Point = namedtuple('Point', 'x y')

    pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)])

    Point3D = namedtuple('Point3D', 'x y z')

    pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])

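Reusing ``Point`` and ``Point3D`` from above, a minimal sketch of the error case
(a namedtuple longer than the first)::

    try:
        pd.DataFrame([Point(0, 0), Point3D(1, 2, 3)])  # Point3D has more fields than Point
    except ValueError as err:
        print(err)
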
.. _basics.dataframe.from_list_dataclasses:

From a list of dataclasses
8 changes: 2 additions & 6 deletions doc/source/user_guide/indexing.rst
@@ -1532,12 +1532,8 @@ Setting metadata
~~~~~~~~~~~~~~~~

Indexes are "mostly immutable", but it is possible to set and change their
metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and
``codes``).

You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes``
to set these attributes directly. They default to returning a copy; however,
you can specify ``inplace=True`` to have the data change in place.
``name`` attribute. You can use ``rename`` or ``set_names`` to set these attributes
directly; both default to returning a copy.
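
A minimal sketch of this copy-returning behaviour, using a plain ``pd.Index``::

    import pandas as pd

    ind = pd.Index([1, 2, 3], name='original')
    renamed = ind.rename('renamed')   # returns a new Index
    ind.name                          # still 'original'
    ind.set_names(['apple'])          # also returns a copy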

See :ref:`Advanced Indexing <advanced>` for usage of MultiIndexes.

26 changes: 22 additions & 4 deletions doc/source/user_guide/io.rst
@@ -1064,6 +1064,23 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
    pd.read_csv('tmp.csv', parse_dates=[0])
    pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0])

Writing CSVs to binary file objects
+++++++++++++++++++++++++++++++++++

.. versionadded:: 1.2.0

``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object
opened binary mode. For this to work, it is necessary that ``mode``
contains a "b":

.. ipython:: python

    import io

    data = pd.DataFrame([0, 1, 2])
    buffer = io.BytesIO()
    data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")

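As a usage sketch, the same buffer can be read back after rewinding it, assuming the
gzip compression is passed explicitly::

    buffer.seek(0)
    pd.read_csv(buffer, compression="gzip", index_col=0)
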
.. _io.float_precision:

Specifying method for floating-point conversion
@@ -3441,10 +3458,11 @@ for some advanced strategies

.. warning::

pandas requires ``PyTables`` >= 3.0.0.
There is a indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index.
If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2.
Stores created previously will need to be rewritten using the updated version.
Pandas uses PyTables for reading and writing HDF5 files, which allows
serializing object-dtype data with pickle. Loading pickled data received from
untrusted sources can be unsafe.

See: https://docs.python.org/3/library/pickle.html for more.

.. ipython:: python
:suppress:
6 changes: 3 additions & 3 deletions doc/source/whatsnew/v0.22.0.rst
@@ -1,7 +1,7 @@
.. _whatsnew_0220:

v0.22.0 (December 29, 2017)
---------------------------
Version 0.22.0 (December 29, 2017)
----------------------------------

{{ header }}

@@ -96,7 +96,7 @@ returning ``1`` instead.
These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well.
Finally, a few less obvious places in pandas are affected by this change.

Grouping by a categorical
Grouping by a Categorical
^^^^^^^^^^^^^^^^^^^^^^^^^

Grouping by a ``Categorical`` and summing now returns ``0`` instead of
22 changes: 11 additions & 11 deletions doc/source/whatsnew/v0.23.0.rst
@@ -86,8 +86,8 @@ Please note that the string `index` is not supported with the round trip format,
.. _whatsnew_0230.enhancements.assign_dependent:


``.assign()`` accepts dependent arguments
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Method ``.assign()`` accepts dependent arguments
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468
<https://www.python.org/dev/peps/pep-0468/>`_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the
@@ -244,7 +244,7 @@ documentation. If you build an extension array, publicize it on our

.. _whatsnew_0230.enhancements.categorical_grouping:

New ``observed`` keyword for excluding unobserved categories in ``groupby``
New ``observed`` keyword for excluding unobserved categories in ``GroupBy``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Grouping by a categorical includes the unobserved categories in the output.
@@ -360,8 +360,8 @@ Fill all consecutive outside values in both directions
.. _whatsnew_0210.enhancements.get_dummies_dtype:

``get_dummies`` now supports ``dtype`` argument
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Function ``get_dummies`` now supports ``dtype`` argument
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`)
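
A minimal sketch of the new argument::

    import pandas as pd

    pd.get_dummies(pd.Series(list('abca')), dtype=bool)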

@@ -388,8 +388,8 @@ See the :ref:`documentation here <timedeltas.mod_divmod>`. (:issue:`19365`)
.. _whatsnew_0230.enhancements.ran_inf:

``.rank()`` handles ``inf`` values when ``NaN`` are present
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Method ``.rank()`` handles ``inf`` values when ``NaN`` are present
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their ranks. Now ranks are calculated properly. (:issue:`6945`)
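
A minimal sketch of the corrected behaviour::

    import numpy as np
    import pandas as pd

    s = pd.Series([-np.inf, 0.0, 1.0, np.nan, np.inf])
    s.rank()  # inf/-inf now receive the highest/lowest ranks; NaN stays NaN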

@@ -587,7 +587,7 @@ If installed, we now require:

.. _whatsnew_0230.api_breaking.dict_insertion_order:

Instantiation from dicts preserves dict insertion order for python 3.6+
Instantiation from dicts preserves dict insertion order for Python 3.6+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Until Python 3.6, dicts in Python had no formally defined ordering. For Python
@@ -1365,8 +1365,8 @@ MultiIndex
- Bug in indexing where nested indexers having only numpy arrays are handled incorrectly (:issue:`19686`)


I/O
^^^
IO
^^

- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`)
- :meth:`DataFrame.to_html` now has an option to add an id to the leading `<table>` tag (:issue:`8496`)
@@ -1403,7 +1403,7 @@ Plotting
- :func:`DataFrame.plot` now supports multiple columns to the ``y`` argument (:issue:`19699`)


Groupby/resample/rolling
GroupBy/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^

- Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`)