Merging master (pandas-dev#35498)

alexhlim · Sep 14, 2020 · 7b14cf6 · 7b14cf6
2 parents 89137ee + 1b2f1f4
commit 7b14cf6
Show file tree

Hide file tree

Showing 129 changed files with 2,258 additions and 1,178 deletions.
diff --git a/Makefile b/Makefile
@@ -32,3 +32,9 @@ check:
 		--included-file-extensions="py" \
 		--excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored \
 		pandas/
+
+	python3 scripts/validate_unwanted_patterns.py \
+		--validation-type="private_import_across_module" \
+		--included-file-extensions="py" \
+		--excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/
+		pandas/
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -627,49 +627,63 @@ def time_first(self):
 
 
 class TransformEngine:
-    def setup(self):
+
+    param_names = ["parallel"]
+    params = [[True, False]]
+
+    def setup(self, parallel):
         N = 10 ** 3
         data = DataFrame(
             {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
             columns=[0, 1],
         )
+        self.parallel = parallel
         self.grouper = data.groupby(0)
 
-    def time_series_numba(self):
+    def time_series_numba(self, parallel):
         def function(values, index):
             return values * 5
 
-        self.grouper[1].transform(function, engine="numba")
+        self.grouper[1].transform(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_series_cython(self):
+    def time_series_cython(self, parallel):
         def function(values):
             return values * 5
 
         self.grouper[1].transform(function, engine="cython")
 
-    def time_dataframe_numba(self):
+    def time_dataframe_numba(self, parallel):
         def function(values, index):
             return values * 5
 
-        self.grouper.transform(function, engine="numba")
+        self.grouper.transform(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_dataframe_cython(self):
+    def time_dataframe_cython(self, parallel):
         def function(values):
             return values * 5
 
         self.grouper.transform(function, engine="cython")
 
 
 class AggEngine:
-    def setup(self):
+
+    param_names = ["parallel"]
+    params = [[True, False]]
+
+    def setup(self, parallel):
         N = 10 ** 3
         data = DataFrame(
             {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
             columns=[0, 1],
         )
+        self.parallel = parallel
         self.grouper = data.groupby(0)
 
-    def time_series_numba(self):
+    def time_series_numba(self, parallel):
         def function(values, index):
             total = 0
             for i, value in enumerate(values):
@@ -679,9 +693,11 @@ def function(values, index):
                     total += value * 2
             return total
 
-        self.grouper[1].agg(function, engine="numba")
+        self.grouper[1].agg(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_series_cython(self):
+    def time_series_cython(self, parallel):
         def function(values):
             total = 0
             for i, value in enumerate(values):
@@ -693,7 +709,7 @@ def function(values):
 
         self.grouper[1].agg(function, engine="cython")
 
-    def time_dataframe_numba(self):
+    def time_dataframe_numba(self, parallel):
         def function(values, index):
             total = 0
             for i, value in enumerate(values):
@@ -703,9 +719,11 @@ def function(values, index):
                     total += value * 2
             return total
 
-        self.grouper.agg(function, engine="numba")
+        self.grouper.agg(
+            function, engine="numba", engine_kwargs={"parallel": self.parallel}
+        )
 
-    def time_dataframe_cython(self):
+    def time_dataframe_cython(self, parallel):
         def function(values):
             total = 0
             for i, value in enumerate(values):

diff --git a/ci/build39.sh b/ci/build39.sh
@@ -3,16 +3,9 @@
 
 sudo apt-get install build-essential gcc xvfb
 pip install --no-deps -U pip wheel setuptools
-pip install python-dateutil pytz pytest pytest-xdist hypothesis
+pip install numpy python-dateutil pytz pytest pytest-xdist hypothesis
 pip install cython --pre # https://github.com/cython/cython/issues/3395
 
-git clone https://github.com/numpy/numpy
-cd numpy
-python setup.py build_ext --inplace
-python setup.py install
-cd ..
-rm -rf numpy
-
 python setup.py build_ext -inplace
 python -m pip install --no-build-isolation -e .
 

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -116,11 +116,19 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
     fi
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
-    MSG='Check for use of private module attribute access' ; echo $MSG
+    MSG='Check for import of private attributes across modules' ; echo $MSG
     if [[ "$GITHUB_ACTIONS" == "true" ]]; then
-        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/
+        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/
     else
-        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/
+        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/
+    fi
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+    MSG='Check for use of private functions across modules' ; echo $MSG
+    if [[ "$GITHUB_ACTIONS" == "true" ]]; then
+        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ --format="##[error]{source_path}:{line_number}:{msg}" pandas/
+    else
+        $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ pandas/
     fi
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 

diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
@@ -73,8 +73,6 @@ applies only to certain dtypes.
 Extension types
 ---------------
 
-.. versionadded:: 0.23.0
-
 .. warning::
 
    The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -301,8 +301,6 @@ Optional dependencies for parsing HTML
 One of the following combinations of libraries is needed to use the
 top-level :func:`~pandas.read_html` function:
 
-.. versionchanged:: 0.23.0
-
 * `BeautifulSoup4`_ and `html5lib`_
 * `BeautifulSoup4`_ and `lxml`_
 * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_

diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
@@ -1065,8 +1065,6 @@ are closed on.  Intervals are closed on the right side by default.
 
    pd.interval_range(start=0, end=4, closed='neither')
 
-.. versionadded:: 0.23.0
-
 Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
 intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements
 in the resulting ``IntervalIndex``:

diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
@@ -1877,8 +1877,6 @@ different columns.
 By indexes and values
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. versionadded:: 0.23.0
-
 Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may
 refer to either columns or index level names.
 

diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -112,8 +112,6 @@ only labels present in a given column are categories:
     df['B']
 
 
-.. versionadded:: 0.23.0
-
 Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`:
 
 .. ipython:: python

diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst
@@ -597,8 +597,6 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function
 of one argument to be called on the ``DataFrame``. A *copy* of the original
 DataFrame is returned, with the new values inserted.
 
-.. versionchanged:: 0.23.0
-
 Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows
 for *dependent* assignment, where an expression later in ``**kwargs`` can refer
 to a column created earlier in the same :meth:`~DataFrame.assign`.

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -930,7 +930,7 @@ take full advantage of the flexibility of the date parsing API:
 .. ipython:: python
 
    df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec,
-                    date_parser=pd.io.date_converters.parse_date_time)
+                    date_parser=pd.to_datetime)
    df
 
 Pandas will try to call the ``date_parser`` function in three different ways. If
@@ -942,11 +942,6 @@ an exception is raised, the next one is tried:
 2. If #1 fails, ``date_parser`` is called with all the columns
    concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``).
 
-3. If #2 fails, ``date_parser`` is called once for every row with one or more
-   string arguments from the columns indicated with `parse_dates`
-   (e.g., ``date_parser('2013', '1')`` for the first row, ``date_parser('2013', '2')``
-   for the second, etc.).
-
 Note that performance-wise, you should try these methods of parsing dates in order:
 
 1. Try to infer the format using ``infer_datetime_format=True`` (see section below).
@@ -958,14 +953,6 @@ Note that performance-wise, you should try these methods of parsing dates in ord
    For optimal performance, this should be vectorized, i.e., it should accept arrays
    as arguments.
 
-You can explore the date parsing functionality in
-`date_converters.py <https://github.com/pandas-dev/pandas/blob/master/pandas/io/date_converters.py>`__
-and add your own. We would love to turn this module into a community supported
-set of date/time parsers. To get you started, ``date_converters.py`` contains
-functions to parse dual date and time columns, year/month/day columns,
-and year/month/day/hour/minute/second columns. It also contains a
-``generic_parser`` function so you can curry it with a function that deals with
-a single date rather than the entire array.
 
 .. ipython:: python
    :suppress:
@@ -2373,8 +2360,6 @@ A few notes on the generated table schema:
       then ``level_<i>`` is used.
 
 
-.. versionadded:: 0.23.0
-
 ``read_json`` also accepts ``orient='table'`` as an argument. This allows for
 the preservation of metadata such as dtypes and index names in a
 round-trippable manner.

diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
@@ -175,8 +175,6 @@ behavior:
 
 .. warning::
 
-   .. versionchanged:: 0.23.0
-
    The default behavior with ``join='outer'`` is to sort the other axis
    (columns in this case). In a future version of pandas, the default will
    be to not sort. We specified ``sort=False`` to opt in to the new
@@ -1198,8 +1196,6 @@ done using the following code.
 Merging on a combination of columns and index levels
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. versionadded:: 0.23
-
 Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters
 may refer to either column names or index level names.  This enables merging
 ``DataFrame`` instances on a combination of index levels and columns without

diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
@@ -336,10 +336,6 @@ examined :ref:`in the API <api.dataframe.missing>`.
 Interpolation
 ~~~~~~~~~~~~~
 
-.. versionadded:: 0.23.0
-
-  The ``limit_area`` keyword argument was added.
-
 Both Series and DataFrame objects have :meth:`~DataFrame.interpolate`
 that, by default, performs linear interpolation at missing data points.
 
@@ -507,8 +503,8 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use
    ser.interpolate(limit_direction='both')
 
 By default, ``NaN`` values are filled whether they are inside (surrounded by)
-existing valid values, or outside existing valid values. Introduced in v0.23
-the ``limit_area`` parameter restricts filling to either inside or outside values.
+existing valid values, or outside existing valid values. The ``limit_area``
+parameter restricts filling to either inside or outside values.
 
 .. ipython:: python
 

diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
@@ -679,8 +679,6 @@ To choose another dtype, use the ``dtype`` argument:
 
     pd.get_dummies(df, dtype=bool).dtypes
 
-.. versionadded:: 0.23.0
-
 
 .. _reshaping.factorize:
 

diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -282,8 +282,6 @@ following code will cause trouble because of the regular expression meaning of
    # We need to escape the special character (for >1 len patterns)
    dollars.str.replace(r'-\$', '-')
 
-.. versionadded:: 0.23.0
-
 If you do want literal replacement of a string (equivalent to
 :meth:`str.replace`), you can set the optional ``regex`` parameter to
 ``False``, rather than escaping each character. In this case both ``pat``
@@ -390,8 +388,6 @@ Missing values on either side will result in missing values in the result as wel
 Concatenating a Series and something array-like into a Series
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. versionadded:: 0.23.0
-
 The parameter ``others`` can also be two-dimensional. In this case, the number or rows must match the lengths of the calling ``Series`` (or ``Index``).
 
 .. ipython:: python
@@ -404,8 +400,6 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o
 Concatenating a Series and an indexed object into a Series, with alignment
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. versionadded:: 0.23.0
-
 For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the indexes before concatenation by setting
 the ``join``-keyword.
 

diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst
@@ -18,7 +18,7 @@ parsing, and attributes.
 Parsing
 -------
 
-You can construct a ``Timedelta`` scalar through various arguments:
+You can construct a ``Timedelta`` scalar through various arguments, including `ISO 8601 Duration`_ strings.
 
 .. ipython:: python
 
@@ -53,10 +53,6 @@ You can construct a ``Timedelta`` scalar through various arguments:
    pd.Timedelta('P0DT0H1M0S')
    pd.Timedelta('P0DT0H0M0.000000123S')
 
-.. versionadded:: 0.23.0
-
-   Added constructor for `ISO 8601 Duration`_ strings
-
 :ref:`DateOffsets<timeseries.offsets>` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction.
 
 .. ipython:: python
@@ -387,8 +383,6 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases <timeseri
    pd.timedelta_range(start='1 days', periods=5, freq='2D5H')
 
 
-.. versionadded:: 0.23.0
-
 Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
 timedeltas from ``start`` to ``end`` inclusively, with ``periods`` number of elements
 in the resulting ``TimedeltaIndex``:

diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
@@ -461,8 +461,6 @@ of those specified will not be generated:
 
    pd.bdate_range(start=start, periods=20)
 
-.. versionadded:: 0.23.0
-
 Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
 dates from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the
 resulting ``DatetimeIndex``:
@@ -643,8 +641,6 @@ Slicing with string indexing also honors UTC offset.
 Slice vs. exact match
 ~~~~~~~~~~~~~~~~~~~~~
 
-.. versionchanged:: 0.20.0
-
 The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of the index. If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match.
 
 Consider a ``Series`` object with a minute resolution index: