Skip to content

Commit

Permalink
Merging master (pandas-dev#35498)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexhlim committed Sep 14, 2020
2 parents 89137ee + 1b2f1f4 commit 7b14cf6
Show file tree
Hide file tree
Showing 129 changed files with 2,258 additions and 1,178 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,9 @@ check:
--included-file-extensions="py" \
--excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored \
pandas/

python3 scripts/validate_unwanted_patterns.py \
--validation-type="private_import_across_module" \
--included-file-extensions="py" \
--excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ \
pandas/
46 changes: 32 additions & 14 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,49 +627,63 @@ def time_first(self):


class TransformEngine:
def setup(self):

param_names = ["parallel"]
params = [[True, False]]

def setup(self, parallel):
N = 10 ** 3
data = DataFrame(
{0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
columns=[0, 1],
)
self.parallel = parallel
self.grouper = data.groupby(0)

def time_series_numba(self):
def time_series_numba(self, parallel):
def function(values, index):
return values * 5

self.grouper[1].transform(function, engine="numba")
self.grouper[1].transform(
function, engine="numba", engine_kwargs={"parallel": self.parallel}
)

def time_series_cython(self):
def time_series_cython(self, parallel):
def function(values):
return values * 5

self.grouper[1].transform(function, engine="cython")

def time_dataframe_numba(self):
def time_dataframe_numba(self, parallel):
def function(values, index):
return values * 5

self.grouper.transform(function, engine="numba")
self.grouper.transform(
function, engine="numba", engine_kwargs={"parallel": self.parallel}
)

def time_dataframe_cython(self):
def time_dataframe_cython(self, parallel):
def function(values):
return values * 5

self.grouper.transform(function, engine="cython")


class AggEngine:
def setup(self):

param_names = ["parallel"]
params = [[True, False]]

def setup(self, parallel):
N = 10 ** 3
data = DataFrame(
{0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N},
columns=[0, 1],
)
self.parallel = parallel
self.grouper = data.groupby(0)

def time_series_numba(self):
def time_series_numba(self, parallel):
def function(values, index):
total = 0
for i, value in enumerate(values):
Expand All @@ -679,9 +693,11 @@ def function(values, index):
total += value * 2
return total

self.grouper[1].agg(function, engine="numba")
self.grouper[1].agg(
function, engine="numba", engine_kwargs={"parallel": self.parallel}
)

def time_series_cython(self):
def time_series_cython(self, parallel):
def function(values):
total = 0
for i, value in enumerate(values):
Expand All @@ -693,7 +709,7 @@ def function(values):

self.grouper[1].agg(function, engine="cython")

def time_dataframe_numba(self):
def time_dataframe_numba(self, parallel):
def function(values, index):
total = 0
for i, value in enumerate(values):
Expand All @@ -703,9 +719,11 @@ def function(values, index):
total += value * 2
return total

self.grouper.agg(function, engine="numba")
self.grouper.agg(
function, engine="numba", engine_kwargs={"parallel": self.parallel}
)

def time_dataframe_cython(self):
def time_dataframe_cython(self, parallel):
def function(values):
total = 0
for i, value in enumerate(values):
Expand Down
9 changes: 1 addition & 8 deletions ci/build39.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,9 @@

sudo apt-get install build-essential gcc xvfb
pip install --no-deps -U pip wheel setuptools
pip install python-dateutil pytz pytest pytest-xdist hypothesis
pip install numpy python-dateutil pytz pytest pytest-xdist hypothesis
pip install cython --pre # https://github.com/cython/cython/issues/3395

git clone https://github.com/numpy/numpy
cd numpy
python setup.py build_ext --inplace
python setup.py install
cd ..
rm -rf numpy

python setup.py build_ext --inplace
python -m pip install --no-build-isolation -e .

Expand Down
14 changes: 11 additions & 3 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,19 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
fi
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for use of private module attribute access' ; echo $MSG
MSG='Check for import of private attributes across modules' ; echo $MSG
if [[ "$GITHUB_ACTIONS" == "true" ]]; then
$BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/
$BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/
else
$BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/
$BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/
fi
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for use of private functions across modules' ; echo $MSG
if [[ "$GITHUB_ACTIONS" == "true" ]]; then
$BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ --format="##[error]{source_path}:{line_number}:{msg}" pandas/
else
$BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ pandas/
fi
RET=$(($RET + $?)) ; echo $MSG "DONE"

Expand Down
2 changes: 0 additions & 2 deletions doc/source/development/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ applies only to certain dtypes.
Extension types
---------------

.. versionadded:: 0.23.0

.. warning::

The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and
Expand Down
2 changes: 0 additions & 2 deletions doc/source/getting_started/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -301,8 +301,6 @@ Optional dependencies for parsing HTML
One of the following combinations of libraries is needed to use the
top-level :func:`~pandas.read_html` function:

.. versionchanged:: 0.23.0

* `BeautifulSoup4`_ and `html5lib`_
* `BeautifulSoup4`_ and `lxml`_
* `BeautifulSoup4`_ and `html5lib`_ and `lxml`_
Expand Down
2 changes: 0 additions & 2 deletions doc/source/user_guide/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1065,8 +1065,6 @@ are closed on. Intervals are closed on the right side by default.
pd.interval_range(start=0, end=4, closed='neither')
.. versionadded:: 0.23.0

Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements
in the resulting ``IntervalIndex``:
Expand Down
2 changes: 0 additions & 2 deletions doc/source/user_guide/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1877,8 +1877,6 @@ different columns.
By indexes and values
~~~~~~~~~~~~~~~~~~~~~

.. versionadded:: 0.23.0

Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may
refer to either columns or index level names.

Expand Down
2 changes: 0 additions & 2 deletions doc/source/user_guide/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@ only labels present in a given column are categories:
df['B']
.. versionadded:: 0.23.0

Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`:

.. ipython:: python
Expand Down
2 changes: 0 additions & 2 deletions doc/source/user_guide/dsintro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -597,8 +597,6 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function
of one argument to be called on the ``DataFrame``. A *copy* of the original
DataFrame is returned, with the new values inserted.

.. versionchanged:: 0.23.0

Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows
for *dependent* assignment, where an expression later in ``**kwargs`` can refer
to a column created earlier in the same :meth:`~DataFrame.assign`.
Expand Down
17 changes: 1 addition & 16 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -930,7 +930,7 @@ take full advantage of the flexibility of the date parsing API:
.. ipython:: python
df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec,
date_parser=pd.io.date_converters.parse_date_time)
date_parser=pd.to_datetime)
df
Pandas will try to call the ``date_parser`` function in three different ways. If
Expand All @@ -942,11 +942,6 @@ an exception is raised, the next one is tried:
2. If #1 fails, ``date_parser`` is called with all the columns
concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``).

3. If #2 fails, ``date_parser`` is called once for every row with one or more
string arguments from the columns indicated with `parse_dates`
(e.g., ``date_parser('2013', '1')`` for the first row, ``date_parser('2013', '2')``
for the second, etc.).

Note that performance-wise, you should try these methods of parsing dates in order:

1. Try to infer the format using ``infer_datetime_format=True`` (see section below).
Expand All @@ -958,14 +953,6 @@ Note that performance-wise, you should try these methods of parsing dates in ord
For optimal performance, this should be vectorized, i.e., it should accept arrays
as arguments.

You can explore the date parsing functionality in
`date_converters.py <https://github.com/pandas-dev/pandas/blob/master/pandas/io/date_converters.py>`__
and add your own. We would love to turn this module into a community supported
set of date/time parsers. To get you started, ``date_converters.py`` contains
functions to parse dual date and time columns, year/month/day columns,
and year/month/day/hour/minute/second columns. It also contains a
``generic_parser`` function so you can curry it with a function that deals with
a single date rather than the entire array.

.. ipython:: python
:suppress:
Expand Down Expand Up @@ -2373,8 +2360,6 @@ A few notes on the generated table schema:
then ``level_<i>`` is used.


.. versionadded:: 0.23.0

``read_json`` also accepts ``orient='table'`` as an argument. This allows for
the preservation of metadata such as dtypes and index names in a
round-trippable manner.
Expand Down
4 changes: 0 additions & 4 deletions doc/source/user_guide/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,6 @@ behavior:
.. warning::

.. versionchanged:: 0.23.0

The default behavior with ``join='outer'`` is to sort the other axis
(columns in this case). In a future version of pandas, the default will
be to not sort. We specified ``sort=False`` to opt in to the new
Expand Down Expand Up @@ -1198,8 +1196,6 @@ done using the following code.
Merging on a combination of columns and index levels
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. versionadded:: 0.23

Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters
may refer to either column names or index level names. This enables merging
``DataFrame`` instances on a combination of index levels and columns without
Expand Down
8 changes: 2 additions & 6 deletions doc/source/user_guide/missing_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,6 @@ examined :ref:`in the API <api.dataframe.missing>`.
Interpolation
~~~~~~~~~~~~~

.. versionadded:: 0.23.0

The ``limit_area`` keyword argument was added.

Both Series and DataFrame objects have :meth:`~DataFrame.interpolate`
that, by default, performs linear interpolation at missing data points.

Expand Down Expand Up @@ -507,8 +503,8 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use
ser.interpolate(limit_direction='both')
By default, ``NaN`` values are filled whether they are inside (surrounded by)
existing valid values, or outside existing valid values. Introduced in v0.23
the ``limit_area`` parameter restricts filling to either inside or outside values.
existing valid values, or outside existing valid values. The ``limit_area``
parameter restricts filling to either inside or outside values.

.. ipython:: python
Expand Down
2 changes: 0 additions & 2 deletions doc/source/user_guide/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -679,8 +679,6 @@ To choose another dtype, use the ``dtype`` argument:
pd.get_dummies(df, dtype=bool).dtypes
.. versionadded:: 0.23.0

.. _reshaping.factorize:

Expand Down
6 changes: 0 additions & 6 deletions doc/source/user_guide/text.rst
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,6 @@ following code will cause trouble because of the regular expression meaning of
# We need to escape the special character (for >1 len patterns)
dollars.str.replace(r'-\$', '-')
.. versionadded:: 0.23.0

If you do want literal replacement of a string (equivalent to
:meth:`str.replace`), you can set the optional ``regex`` parameter to
``False``, rather than escaping each character. In this case both ``pat``
Expand Down Expand Up @@ -390,8 +388,6 @@ Missing values on either side will result in missing values in the result as wel
Concatenating a Series and something array-like into a Series
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. versionadded:: 0.23.0

The parameter ``others`` can also be two-dimensional. In this case, the number of rows must match the lengths of the calling ``Series`` (or ``Index``).

.. ipython:: python
Expand All @@ -404,8 +400,6 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o
Concatenating a Series and an indexed object into a Series, with alignment
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. versionadded:: 0.23.0

For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the indexes before concatenation by setting
the ``join``-keyword.

Expand Down
8 changes: 1 addition & 7 deletions doc/source/user_guide/timedeltas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ parsing, and attributes.
Parsing
-------

You can construct a ``Timedelta`` scalar through various arguments:
You can construct a ``Timedelta`` scalar through various arguments, including `ISO 8601 Duration`_ strings.

.. ipython:: python
Expand Down Expand Up @@ -53,10 +53,6 @@ You can construct a ``Timedelta`` scalar through various arguments:
pd.Timedelta('P0DT0H1M0S')
pd.Timedelta('P0DT0H0M0.000000123S')
.. versionadded:: 0.23.0

Added constructor for `ISO 8601 Duration`_ strings

:ref:`DateOffsets<timeseries.offsets>` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction.

.. ipython:: python
Expand Down Expand Up @@ -387,8 +383,6 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases <timeseri
pd.timedelta_range(start='1 days', periods=5, freq='2D5H')
.. versionadded:: 0.23.0

Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
timedeltas from ``start`` to ``end`` inclusively, with ``periods`` number of elements
in the resulting ``TimedeltaIndex``:
Expand Down
4 changes: 0 additions & 4 deletions doc/source/user_guide/timeseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,6 @@ of those specified will not be generated:
pd.bdate_range(start=start, periods=20)
.. versionadded:: 0.23.0

Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced
dates from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the
resulting ``DatetimeIndex``:
Expand Down Expand Up @@ -643,8 +641,6 @@ Slicing with string indexing also honors UTC offset.
Slice vs. exact match
~~~~~~~~~~~~~~~~~~~~~

.. versionchanged:: 0.20.0

The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of the index. If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match.

Consider a ``Series`` object with a minute resolution index:
Expand Down
Loading

0 comments on commit 7b14cf6

Please sign in to comment.