Skip to content

Commit

Permalink
merging master
Browse files Browse the repository at this point in the history
  • Loading branch information
alexhlim committed Aug 1, 2021
2 parents 11a3790 + a5f8c9a commit 838ceb4
Show file tree
Hide file tree
Showing 85 changed files with 1,590 additions and 593 deletions.
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,18 @@ def time_category_size(self):
self.draws.groupby(self.cats).size()


class Shift:
    """Benchmarks for ``groupby(...).shift`` on a small two-group frame."""

    def setup(self):
        """Build an N-row frame whose group key alternates between "a" and "b"."""
        N = 18
        # Derive the number of ("a", "b") pairs from N so both columns keep
        # the same length if N is ever changed (N is expected to be even).
        self.df = DataFrame({"g": ["a", "b"] * (N // 2), "v": list(range(N))})

    def time_defaults(self):
        """Time a grouped shift with default arguments."""
        self.df.groupby("g").shift()

    def time_fill_value(self):
        """Time a grouped shift with an explicit ``fill_value``."""
        self.df.groupby("g").shift(fill_value=99)


class FillNA:
def setup(self):
N = 100
Expand Down
6 changes: 5 additions & 1 deletion asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def setup(self, dtype):
columns = np.arange(n)
if dtype == "int":
values = np.arange(m * m * n).reshape(m * m, n)
self.df = DataFrame(values, index, columns)
else:
# the category branch is ~20x slower than int. So we
# cut down the size a bit. Now it's only ~3x slower.
Expand All @@ -111,7 +112,10 @@ def setup(self, dtype):
values = np.take(list(string.ascii_letters), indices)
values = [pd.Categorical(v) for v in values.T]

self.df = DataFrame(values, index, columns)
self.df = DataFrame(
{i: cat for i, cat in enumerate(values)}, index, columns
)

self.df2 = self.df.iloc[:-1]

def time_full_product(self, dtype):
Expand Down
3 changes: 3 additions & 0 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
pandas/io/parsers/ \
pandas/io/sas/ \
pandas/io/sql.py \
pandas/io/formats/format.py \
pandas/io/formats/style.py \
pandas/io/stata.py \
pandas/tseries/
RET=$(($RET + $?)) ; echo $MSG "DONE"

Expand Down
1 change: 1 addition & 0 deletions ci/deps/actions-39-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- matplotlib
- moto>=1.3.14
- flask
- numba
- numexpr
- numpy
- openpyxl
Expand Down
1 change: 1 addition & 0 deletions ci/deps/actions-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- matplotlib
- moto>=1.3.14
- flask
- numba
- numexpr
- numpy
- openpyxl
Expand Down
1 change: 1 addition & 0 deletions ci/deps/azure-windows-39.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ dependencies:
- matplotlib
- moto>=1.3.14
- flask
- numba
- numexpr
- numpy
- openpyxl
Expand Down
Binary file added doc/source/_static/style/df_pipe.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
54 changes: 54 additions & 0 deletions doc/source/user_guide/visualization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,34 @@ The ``by`` keyword can be specified to plot grouped histograms:
@savefig grouped_hist.png
data.hist(by=np.random.randint(0, 4, 1000), figsize=(6, 4));
.. ipython:: python
:suppress:
plt.close("all")
np.random.seed(123456)
In addition, the ``by`` keyword can also be specified in :meth:`DataFrame.plot.hist`.

.. versionchanged:: 1.4.0

.. ipython:: python
data = pd.DataFrame(
{
"a": np.random.choice(["x", "y", "z"], 1000),
"b": np.random.choice(["e", "f", "g"], 1000),
"c": np.random.randn(1000),
"d": np.random.randn(1000) - 1,
},
)
@savefig grouped_hist_by.png
data.plot.hist(by=["a", "b"], figsize=(10, 5));
.. ipython:: python
:suppress:
plt.close("all")
.. _visualization.box:

Expand Down Expand Up @@ -448,6 +476,32 @@ columns:
plt.close("all")
You could also create groupings with :meth:`DataFrame.plot.box`, for instance:

.. versionchanged:: 1.4.0

.. ipython:: python
:suppress:
plt.close("all")
np.random.seed(123456)
.. ipython:: python
:okwarning:
df = pd.DataFrame(np.random.rand(10, 3), columns=["Col1", "Col2", "Col3"])
df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
plt.figure();
@savefig box_plot_ex4.png
bp = df.plot.box(column=["Col1", "Col2"], by="X")
.. ipython:: python
:suppress:
plt.close("all")
.. _visualization.box.return:

In ``boxplot``, the return type can be controlled by the ``return_type`` keyword. The valid choices are ``{"axes", "dict", "both", None}``.
Expand Down
11 changes: 9 additions & 2 deletions doc/source/whatsnew/v1.3.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
-
- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`)
- Regression in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`)
- Fixed regression in :meth:`DataFrame.shift` where TypeError occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`)
- Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`)
- Regression in :meth:`DataFrame.drop` where it does nothing if the :class:`MultiIndex` has duplicates and the indexer is a tuple or list of tuples (:issue:`42771`)
- Fixed regression where :meth:`pandas.read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to None (:issue:`42387`)
- Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`)
-

.. ---------------------------------------------------------------------------
Expand All @@ -23,7 +30,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
-
- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`)
-

.. ---------------------------------------------------------------------------
Expand Down
17 changes: 15 additions & 2 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ Other enhancements
- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
- :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`)
- :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
- Added ``sparse_index`` and ``sparse_columns`` keyword arguments to :meth:`.Styler.to_html` (:issue:`41946`)
- Added keyword argument ``environment`` to :meth:`.Styler.to_latex` also allowing a specific "longtable" entry with a separate jinja2 template (:issue:`41866`)
- :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -166,6 +169,10 @@ Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`)
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`)
- Performance improvement in constructing :class:`DataFrame` objects (:issue:`42631`)
- Performance improvement in :meth:`GroupBy.shift` when ``fill_value`` argument is provided (:issue:`26615`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`)
-

.. ---------------------------------------------------------------------------
Expand Down Expand Up @@ -202,7 +209,7 @@ Numeric
^^^^^^^
- Bug in :meth:`DataFrame.rank` raising ``ValueError`` with ``object`` columns and ``method="first"`` (:issue:`41931`)
- Bug in :meth:`DataFrame.rank` treating missing values and extreme values as equal (for example ``np.nan`` and ``np.inf``), causing incorrect results when ``na_option="bottom"`` or ``na_option="top"`` is used (:issue:`41931`)
-
- Bug in ``numexpr`` engine still being used when the option ``compute.use_numexpr`` is set to ``False`` (:issue:`32556`)

Conversion
^^^^^^^^^^
Expand All @@ -225,6 +232,8 @@ Indexing
- Bug in :meth:`Series.loc` with a :class:`MultiIndex` whose first level contains only ``np.nan`` values (:issue:`42055`)
- Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` when passing a string, the return type depended on whether the index was monotonic (:issue:`24892`)
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` failing to raise, or incorrectly raising, ``ValueError`` when passing an ``ascending`` value (:issue:`41634`)
- Bug in updating values of :class:`pandas.Series` using boolean index, created by using :meth:`pandas.DataFrame.pop` (:issue:`42530`)
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
-

Expand Down Expand Up @@ -261,11 +270,14 @@ Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
-
- Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
- Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)

Reshaping
^^^^^^^^^
- Improved error message when creating a :class:`DataFrame` column from a multi-dimensional :class:`numpy.ndarray` (:issue:`42463`)
- :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :class:`pandas.CategoricalIndex` (:issue:`42425`)
-

Sparse
Expand All @@ -285,6 +297,7 @@ Styler

Other
^^^^^
- Bug in :meth:`CustomBusinessMonthBegin.__add__` (:meth:`CustomBusinessMonthEnd.__add__`) not applying the extra ``offset`` parameter when beginning (end) of the target month is already a business day (:issue:`41356`)

.. ***DO NOT USE THIS SECTION***
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ dependencies:
- fsspec>=0.7.4, <2021.6.0 # for generic remote file operations
- gcsfs>=0.6.0 # file IO when using 'gcs://...' path
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
- xarray # DataFrame.to_xarray
- xarray<0.19 # DataFrame.to_xarray
- cftime # Needed for downstream xarray.CFTimeIndex test
- pyreadstat # pandas.read_spss
- tabulate>=0.8.3 # DataFrame.to_markdown
Expand Down
75 changes: 55 additions & 20 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,8 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
This is a reverse of the label factorization process.
"""
cdef:
Py_ssize_t i, loc, label, n
ndarray[intp_t] indexer, where, counts
Py_ssize_t i, label, n
intp_t[::1] indexer, where, counts

counts = np.zeros(ngroups + 1, dtype=np.intp)
n = len(index)
Expand All @@ -241,7 +241,7 @@ def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
indexer[where[label]] = i
where[label] += 1

return indexer, counts
return indexer.base, counts.base


cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
Expand Down Expand Up @@ -325,11 +325,14 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
cdef:
Py_ssize_t i, j, xi, yi, N, K
bint minpv
ndarray[float64_t, ndim=2] result
float64_t[:, ::1] result
# Initialize to None since we only use in the no missing value case
float64_t[::1] means=None, ssqds=None
ndarray[uint8_t, ndim=2] mask
bint no_nans
int64_t nobs = 0
float64_t vx, vy, meanx, meany, divisor, prev_meany, prev_meanx, ssqdmx
float64_t ssqdmy, covxy
float64_t mean, ssqd, val
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy

N, K = (<object>mat).shape

Expand All @@ -340,25 +343,57 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):

result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat).view(np.uint8)
no_nans = mask.all()

# Computing the online means and variances is expensive - so if possible we can
# precompute these and avoid repeating the computations each time we handle
# an (xi, yi) pair
if no_nans:
means = np.empty(K, dtype=np.float64)
ssqds = np.empty(K, dtype=np.float64)

with nogil:
for j in range(K):
ssqd = mean = 0
for i in range(N):
val = mat[i, j]
dx = val - mean
mean += 1 / (i + 1) * dx
ssqd += (val - mean) * dx

means[j] = mean
ssqds[j] = ssqd

with nogil:
for xi in range(K):
for yi in range(xi + 1):
# Welford's method for the variance-calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
for i in range(N):
if mask[i, xi] and mask[i, yi]:
covxy = 0
if no_nans:
for i in range(N):
vx = mat[i, xi]
vy = mat[i, yi]
nobs += 1
prev_meanx = meanx
prev_meany = meany
meanx = meanx + 1 / nobs * (vx - meanx)
meany = meany + 1 / nobs * (vy - meany)
ssqdmx = ssqdmx + (vx - meanx) * (vx - prev_meanx)
ssqdmy = ssqdmy + (vy - meany) * (vy - prev_meany)
covxy = covxy + (vx - meanx) * (vy - prev_meany)
covxy += (vx - means[xi]) * (vy - means[yi])

ssqdmx = ssqds[xi]
ssqdmy = ssqds[yi]
nobs = N

else:
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
for i in range(N):
# Welford's method for the variance-calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
if mask[i, xi] and mask[i, yi]:
vx = mat[i, xi]
vy = mat[i, yi]
nobs += 1
dx = vx - meanx
dy = vy - meany
meanx += 1 / nobs * dx
meany += 1 / nobs * dy
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy

if nobs < minpv:
result[xi, yi] = result[yi, xi] = NaN
Expand All @@ -370,7 +405,7 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
else:
result[xi, yi] = result[yi, xi] = NaN

return result
return result.base

# ----------------------------------------------------------------------
# Pairwise Spearman correlation
Expand Down
Loading

0 comments on commit 838ceb4

Please sign in to comment.