diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 3538652c9bded..ca9751569336c 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -454,6 +454,7 @@ non-null values:
     series[10:20] = 5
     series.nunique()
 
+.. _basics.describe:
 
 Summarizing data: describe
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -471,7 +472,13 @@ course):
     frame.ix[::2] = np.nan
     frame.describe()
 
-.. _basics.describe:
+You can select specific percentiles to include in the output:
+
+.. ipython:: python
+
+   series.describe(percentiles=[.05, .25, .75, .95])
+
+The median is always included in the output, even if it is not requested.
 
 For a non-numerical Series object, `describe` will give a simple summary of the
 number of unique values and most frequently occurring values:
 
@@ -482,6 +489,7 @@ number of unique values and most frequently occurring values:
 
     s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
     s.describe()
 
+
 There is also a utility function, ``value_range``, which takes a DataFrame and
 returns a series with the minimum/maximum values in the DataFrame.
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 2f6e6e7cd0fdc..c2cf938f6f806 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -204,6 +204,8 @@ API Changes
 - Produce :class:`~pandas.io.parsers.ParserWarning` on fallback to python parser
   when no options are ignored (:issue:`6607`)
 - Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)
+- :meth:`DataFrame.describe` on a DataFrame with a mix of Timestamp and string-like objects
+  returns a different Index (:issue:`7088`). Previously the index was unintentionally sorted.
 
 Deprecations
 ~~~~~~~~~~~~
@@ -250,6 +252,10 @@ Deprecations
 - The support for the 'mysql' flavor when using DBAPI connection objects has been deprecated.
   MySQL will be further supported with SQLAlchemy engines (:issue:`6900`).
 
+- The ``percentile_width`` keyword argument in :meth:`~DataFrame.describe` has been deprecated.
+  Use the ``percentiles`` keyword instead, which takes a list of percentiles to display. The
+  default output is unchanged.
+
 Prior Version Deprecations/Changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -339,6 +345,7 @@ Improvements to existing features
 - ``boxplot`` now supports ``layout`` keyword (:issue:`6769`)
 - Regression in the display of a MultiIndexed Series when ``display.max_rows`` is less
   than the length of the series (:issue:`7101`)
+- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index d063a7218be97..9f050633a3b0d 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -196,6 +196,8 @@ API changes
 - accept ``TextFileReader`` in ``concat``, which was affecting a common user idiom (:issue:`6583`),
   this was a regression from 0.13.1
 - Added ``factorize`` functions to ``Index`` and ``Series`` to get indexer and unique values (:issue:`7090`)
+- ``describe`` on a DataFrame with a mix of Timestamp and string-like objects returns a different Index (:issue:`7088`).
+  Previously the index was unintentionally sorted.
 
 .. _whatsnew_0140.display:
 
@@ -511,6 +513,10 @@ Deprecations
 - The support for the 'mysql' flavor when using DBAPI connection objects has been deprecated.
   MySQL will be further supported with SQLAlchemy engines (:issue:`6900`).
 
+- The ``percentile_width`` keyword argument in :meth:`~DataFrame.describe` has been deprecated.
+  Use the ``percentiles`` keyword instead, which takes a list of percentiles to display. The
+  default output is unchanged.
+
 .. _whatsnew_0140.enhancements:
 
 Enhancements
@@ -577,6 +583,7 @@ Enhancements
 - ``CustomBusinessMonthBegin`` and ``CustomBusinessMonthEnd`` are now available (:issue:`6866`)
 - :meth:`Series.quantile` and :meth:`DataFrame.quantile` now accept an array of quantiles.
+- :meth:`~DataFrame.describe` now accepts an array of percentiles to include in the summary statistics (:issue:`4196`)
 - ``pivot_table`` can now accept ``Grouper`` by ``index`` and ``columns`` keywords (:issue:`6913`)
 
 .. ipython:: python
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ec1529b66acd8..d20cec7aa79ee 100755
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3808,54 +3808,6 @@ def corrwith(self, other, axis=0, drop=False):
 
         return correl
 
-    def describe(self, percentile_width=50):
-        """
-        Generate various summary statistics of each column, excluding
-        NaN values. These include: count, mean, std, min, max, and
-        lower%/50%/upper% percentiles
-
-        Parameters
-        ----------
-        percentile_width : float, optional
-            width of the desired uncertainty interval, default is 50,
-            which corresponds to lower=25, upper=75
-
-        Returns
-        -------
-        DataFrame of summary statistics
-        """
-        numdata = self._get_numeric_data()
-
-        if len(numdata.columns) == 0:
-            return DataFrame(dict((k, v.describe())
-                                  for k, v in compat.iteritems(self)),
-                             columns=self.columns)
-
-        lb = .5 * (1. - percentile_width / 100.)
-        ub = 1. - lb
-
-        def pretty_name(x):
-            x *= 100
-            if x == int(x):
-                return '%.0f%%' % x
-            else:
-                return '%.1f%%' % x
-
-        destat_columns = ['count', 'mean', 'std', 'min',
-                          pretty_name(lb), '50%', pretty_name(ub),
-                          'max']
-
-        destat = []
-
-        for i in range(len(numdata.columns)):
-            series = numdata.iloc[:, i]
-            destat.append([series.count(), series.mean(), series.std(),
-                           series.min(), series.quantile(lb), series.median(),
-                           series.quantile(ub), series.max()])
-
-        return self._constructor(lmap(list, zip(*destat)),
-                                 index=destat_columns, columns=numdata.columns)
-
     #----------------------------------------------------------------------
     # ndarray-like stats methods
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b2e7120a21062..9172d174a1354 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -19,7 +19,7 @@
 import pandas.core.common as com
 import pandas.core.datetools as datetools
 from pandas import compat, _np_version_under1p7
-from pandas.compat import map, zip, lrange, string_types, isidentifier
+from pandas.compat import map, zip, lrange, string_types, isidentifier, lmap
 from pandas.core.common import (isnull, notnull, is_list_like, _values_from_object,
                                 _maybe_promote, _maybe_box_datetimelike, ABCSeries,
                                 SettingWithCopyError, SettingWithCopyWarning)
@@ -3478,6 +3478,154 @@ def _convert_timedeltas(x):
 
         return np.abs(self)
 
+    _shared_docs['describe'] = """
+        Generate various summary statistics, excluding NaN values.
+
+        Parameters
+        ----------
+        percentile_width : float, deprecated
+            Width of the desired uncertainty interval; default is 50,
+            which corresponds to lower=25, upper=75. The
+            ``percentile_width`` argument will be removed in a future
+            version. Use ``percentiles`` instead.
+        percentiles : array-like, optional
+            The percentiles to include in the output. They should all
+            be in the interval [0, 1]. By default ``percentiles`` is
+            [.25, .5, .75], returning the 25th, 50th, and 75th percentiles.
+
+        Returns
+        -------
+        summary : %(klass)s of summary statistics
+
+        Notes
+        -----
+        For numeric dtypes, the index includes: count, mean, std, min,
+        max, and the lower, 50%%, and upper percentiles.
+
+        If self is of object dtype (e.g. timestamps or strings), the output
+        will include the count, the number of unique values, the most common
+        value, and the frequency of the most common value. Timestamps also
+        include the first and last items.
+
+        If multiple values share the highest count, the `top` and `freq`
+        results will be arbitrarily chosen from among those with the
+        highest count.
+        """
+
+    @Appender(_shared_docs['describe'] % _shared_doc_kwargs)
+    def describe(self, percentile_width=None, percentiles=None):
+        if self.ndim >= 3:
+            msg = "describe is not implemented on Panel or PanelND objects."
+            raise NotImplementedError(msg)
+
+        if percentile_width is not None and percentiles is not None:
+            msg = "Cannot specify both 'percentile_width' and 'percentiles'."
+            raise ValueError(msg)
+        if percentiles is not None:
+            percentiles = np.asarray(percentiles)
+            if (percentiles > 1).any():
+                # rescale only to build a helpful suggestion; out-of-range
+                # percentiles are an error
+                percentiles = percentiles / 100.0
+                msg = ("percentiles should all be in the interval [0, 1]. "
+                       "Try {0} instead.")
+                raise ValueError(msg.format(list(percentiles)))
+        else:
+            # only warn if they change the default
+            do_warn = percentile_width is not None
+            percentile_width = percentile_width or 50
+            lb = .5 * (1. - percentile_width / 100.)
+            ub = 1. - lb
+            percentiles = np.array([lb, 0.5, ub])
+            if do_warn:
+                msg = ("The `percentile_width` keyword is deprecated. "
+                       "Use percentiles={0} instead.".format(list(percentiles)))
+                warnings.warn(msg, FutureWarning)
+
+        # the median should always be included
+        if (percentiles != 0.5).all():  # median isn't included
+            lh = percentiles[percentiles < .5]
+            uh = percentiles[percentiles > .5]
+            percentiles = np.hstack([lh, 0.5, uh])
+
+        # dtypes: numeric only, numeric mixed, objects only
+        data = self._get_numeric_data()
+        if self.ndim > 1:
+            is_object = len(data._info_axis) == 0
+        else:
+            is_object = not self._is_numeric_mixed_type
+
+        def pretty_name(x):
+            x *= 100
+            if x == int(x):
+                return '%.0f%%' % x
+            else:
+                return '%.1f%%' % x
+
+        def describe_numeric_1d(series, percentiles):
+            return ([series.count(), series.mean(), series.std(),
+                     series.min()] +
+                    [series.quantile(x) for x in percentiles] +
+                    [series.max()])
+
+        def describe_categorical_1d(data):
+            if data.dtype == object:
+                names = ['count', 'unique']
+                objcounts = data.value_counts()
+                result = [data.count(), len(objcounts)]
+                if result[1] > 0:
+                    names += ['top', 'freq']
+                    top, freq = objcounts.index[0], objcounts.iloc[0]
+                    result += [top, freq]
+
+            elif issubclass(data.dtype.type, np.datetime64):
+                names = ['count', 'unique']
+                asint = data.dropna().values.view('i8')
+                objcounts = compat.Counter(asint)
+                result = [data.count(), len(objcounts)]
+                if result[1] > 0:
+                    top, freq = objcounts.most_common(1)[0]
+                    names += ['first', 'last', 'top', 'freq']
+                    result += [lib.Timestamp(asint.min()),
+                               lib.Timestamp(asint.max()),
+                               lib.Timestamp(top), freq]
+
+            return pd.Series(result, index=names)
+
+        if is_object:
+            if data.ndim == 1:
+                return describe_categorical_1d(self)
+            else:
+                result = pd.DataFrame(dict((k, describe_categorical_1d(v))
+                                           for k, v in compat.iteritems(self)),
+                                      columns=self._info_axis,
+                                      index=['count', 'unique', 'first', 'last',
+                                             'top', 'freq'])
+                # just objects, no datetimes
+                if pd.isnull(result.loc['first']).all():
+                    result = result.drop(['first', 'last'], axis=0)
+                return result
+        else:
+            stat_index = (['count', 'mean', 'std', 'min'] +
+                          [pretty_name(x) for x in percentiles] +
+                          ['max'])
+            if data.ndim == 1:
+                return pd.Series(describe_numeric_1d(data, percentiles),
+                                 index=stat_index)
+            else:
+                destat = []
+                # TODO: avoid this positional loop over columns
+                for i in range(len(data._info_axis)):
+                    series = data.iloc[:, i]
+                    destat.append(describe_numeric_1d(series, percentiles))
+
+                return self._constructor(lmap(list, zip(*destat)),
+                                         index=stat_index,
+                                         columns=data._info_axis)
+
     _shared_docs['pct_change'] = """
         Percent change over given number of periods.
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d4b6039cd375e..d95f8da8097e9 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1267,67 +1267,6 @@ def multi(values, qs):
     def ptp(self, axis=None, out=None):
         return _values_from_object(self).ptp(axis, out)
 
-    def describe(self, percentile_width=50):
-        """
-        Generate various summary statistics of Series, excluding NaN
-        values. These include: count, mean, std, min, max, and
-        lower%/50%/upper% percentiles
-
-        Parameters
-        ----------
-        percentile_width : float, optional
-            width of the desired uncertainty interval, default is 50,
-            which corresponds to lower=25, upper=75
-
-        Returns
-        -------
-        desc : Series
-        """
-        from pandas.compat import Counter
-
-        if self.dtype == object:
-            names = ['count', 'unique']
-            objcounts = Counter(self.dropna().values)
-            data = [self.count(), len(objcounts)]
-            if data[1] > 0:
-                names += ['top', 'freq']
-                top, freq = objcounts.most_common(1)[0]
-                data += [top, freq]
-
-        elif issubclass(self.dtype.type, np.datetime64):
-            names = ['count', 'unique']
-            asint = self.dropna().values.view('i8')
-            objcounts = Counter(asint)
-            data = [self.count(), len(objcounts)]
-            if data[1] > 0:
-                top, freq = objcounts.most_common(1)[0]
-                names += ['first', 'last', 'top', 'freq']
-                data += [lib.Timestamp(asint.min()),
-                         lib.Timestamp(asint.max()),
-                         lib.Timestamp(top), freq]
-        else:
-
-            lb = .5 * (1. - percentile_width / 100.)
-            ub = 1. - lb
-
-            def pretty_name(x):
-                x *= 100
-                if x == int(x):
-                    return '%.0f%%' % x
-                else:
-                    return '%.1f%%' % x
-
-            names = ['count']
-            data = [self.count()]
-            names += ['mean', 'std', 'min', pretty_name(lb), '50%',
-                      pretty_name(ub), 'max']
-            data += [self.mean(), self.std(), self.min(),
-                     self.quantile(lb), self.median(), self.quantile(ub),
-                     self.max()]
-
-        return self._constructor(data, index=names).__finalize__(self)
-
     def corr(self, other, method='pearson', min_periods=None):
         """
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 7365e4be187b0..4264f5b7e0931 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -11224,40 +11224,6 @@ def test_rank_na_option(self):
         assert_almost_equal(ranks0.values, exp0)
         assert_almost_equal(ranks1.values, exp1)
 
-    def test_describe(self):
-        desc = self.tsframe.describe()
-        desc = self.mixed_frame.describe()
-        desc = self.frame.describe()
-
-    def test_describe_percentiles(self):
-        desc = self.frame.describe(percentile_width=50)
-        assert '75%' in desc.index
-        assert '25%' in desc.index
-
-        desc = self.frame.describe(percentile_width=95)
-        assert '97.5%' in desc.index
-        assert '2.5%' in desc.index
-
-    def test_describe_no_numeric(self):
-        df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
-                        'B': ['a', 'b', 'c', 'd'] * 6})
-        desc = df.describe()
-        expected = DataFrame(dict((k, v.describe())
-                                  for k, v in compat.iteritems(df)),
-                             columns=df.columns)
-        assert_frame_equal(desc, expected)
-
-        df = DataFrame({'time': self.tsframe.index})
-        desc = df.describe()
-        assert(desc.time['first'] == min(self.tsframe.index))
-
-    def test_describe_empty_int_columns(self):
-        df = DataFrame([[0, 1], [1, 2]])
-        desc = df[df[0] < 0].describe()  # works
-        assert_series_equal(desc.xs('count'),
-                            Series([0, 0], dtype=float, name='count'))
-        self.assert_(isnull(desc.ix[1:]).all().all())
-
     def test_axis_aliases(self):
         f = self.frame
 
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
index 42bb76930d783..57ec9d0eb8981 100644
--- a/pandas/tests/test_generic.py
+++ b/pandas/tests/test_generic.py
@@ -646,6 +646,59 @@ def test_interp_datetime64(self):
         expected = Series([1., 1., 3.],
                           index=date_range('1/1/2000', periods=3))
         assert_series_equal(result, expected)
 
+    def test_describe(self):
+        _ = self.series.describe()
+        _ = self.ts.describe()
+
+    def test_describe_percentiles(self):
+        with tm.assert_produces_warning(FutureWarning):
+            desc = self.series.describe(percentile_width=50)
+        assert '75%' in desc.index
+        assert '25%' in desc.index
+
+        with tm.assert_produces_warning(FutureWarning):
+            desc = self.series.describe(percentile_width=95)
+        assert '97.5%' in desc.index
+        assert '2.5%' in desc.index
+
+    def test_describe_objects(self):
+        s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
+        result = s.describe()
+        expected = Series({'count': 7, 'unique': 4,
+                           'top': 'a', 'freq': 3}, index=result.index)
+        assert_series_equal(result, expected)
+
+        dt = list(self.ts.index)
+        dt.append(dt[0])
+        ser = Series(dt)
+        rs = ser.describe()
+        min_date = min(dt)
+        max_date = max(dt)
+        xp = Series({'count': len(dt),
+                     'unique': len(self.ts.index),
+                     'first': min_date, 'last': max_date, 'freq': 2,
+                     'top': min_date}, index=rs.index)
+        assert_series_equal(rs, xp)
+
+    def test_describe_empty(self):
+        result = pd.Series().describe()
+        self.assertEqual(result['count'], 0)
+        self.assert_(result.drop('count').isnull().all())
+
+        nanSeries = Series([np.nan])
+        nanSeries.name = 'NaN'
+        result = nanSeries.describe()
+        self.assertEqual(result['count'], 0)
+        self.assert_(result.drop('count').isnull().all())
+
+    def test_describe_none(self):
+        noneSeries = Series([None])
+        noneSeries.name = 'None'
+        assert_series_equal(noneSeries.describe(),
+                            Series([0, 0], index=['count', 'unique']))
+
+
 class TestDataFrame(tm.TestCase, Generic):
     _typ = DataFrame
     _comparator = lambda self, x, y: assert_frame_equal(x,y)
@@ -708,7 +761,6 @@ def test_interp_combo(self):
         expected = Series([1, 2, 3, 4])
         assert_series_equal(result, expected)
 
-
     def test_interp_nan_idx(self):
         df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
         df = df.set_index('A')
@@ -859,6 +911,115 @@ def test_interp_ignore_all_good(self):
         result = df[['B', 'D']].interpolate(downcast=None)
         assert_frame_equal(result, df[['B', 'D']])
 
+    def test_describe(self):
+        desc = tm.makeDataFrame().describe()
+        desc = tm.makeMixedDataFrame().describe()
+        desc = tm.makeTimeDataFrame().describe()
+
+    def test_describe_percentiles(self):
+        with tm.assert_produces_warning(FutureWarning):
+            desc = tm.makeDataFrame().describe(percentile_width=50)
+        assert '75%' in desc.index
+        assert '25%' in desc.index
+
+        with tm.assert_produces_warning(FutureWarning):
+            desc = tm.makeDataFrame().describe(percentile_width=95)
+        assert '97.5%' in desc.index
+        assert '2.5%' in desc.index
+
+    def test_describe_quantiles_both(self):
+        with tm.assertRaises(ValueError):
+            tm.makeDataFrame().describe(percentile_width=50,
+                                        percentiles=[25, 75])
+
+    def test_describe_percentiles_percent_or_raw(self):
+        df = tm.makeDataFrame()
+        with tm.assertRaises(ValueError):
+            df.describe(percentiles=[10, 50, 100])
+
+    def test_describe_percentiles_equivalence(self):
+        df = tm.makeDataFrame()
+        d1 = df.describe()
+        d2 = df.describe(percentiles=[.25, .75])
+        assert_frame_equal(d1, d2)
+
+    def test_describe_percentiles_insert_median(self):
+        df = tm.makeDataFrame()
+        d1 = df.describe(percentiles=[.25, .75])
+        d2 = df.describe(percentiles=[.25, .5, .75])
+        assert_frame_equal(d1, d2)
+
+        # none above the median
+        d1 = df.describe(percentiles=[.25, .45])
+        d2 = df.describe(percentiles=[.25, .45, .5])
+        assert_frame_equal(d1, d2)
+
+        # none below the median
+        d1 = df.describe(percentiles=[.75, 1])
+        d2 = df.describe(percentiles=[.5, .75, 1])
+        assert_frame_equal(d1, d2)
+
+    def test_describe_no_numeric(self):
+        df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
+                        'B': ['a', 'b', 'c', 'd'] * 6})
+        desc = df.describe()
+        expected = DataFrame(dict((k, v.describe())
+                                  for k, v in compat.iteritems(df)),
+                             columns=df.columns)
+        assert_frame_equal(desc, expected)
+
+        ts = tm.makeTimeSeries()
+        df = DataFrame({'time': ts.index})
+        desc = df.describe()
+        self.assertEqual(desc.time['first'], min(ts.index))
+
+    def test_describe_empty_int_columns(self):
+        df = DataFrame([[0, 1], [1, 2]])
+        desc = df[df[0] < 0].describe()  # works
+        assert_series_equal(desc.xs('count'),
+                            Series([0, 0], dtype=float, name='count'))
+        self.assert_(isnull(desc.ix[1:]).all().all())
+
+    def test_describe_objects(self):
+        df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']})
+        result = df.describe()
+        expected = DataFrame({"C1": [3, 2, 'a', 2], "C2": [3, 2, 'd', 2]},
+                             index=['count', 'unique', 'top', 'freq'])
+        assert_frame_equal(result, expected)
+
+        df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')})
+        result = df.describe()
+        expected = DataFrame({"C1": [4, 4, pd.Timestamp('2010-01-01'),
+                                     pd.Timestamp('2010-01-04'),
+                                     pd.Timestamp('2010-01-01'), 1]},
+                             index=['count', 'unique', 'first', 'last', 'top',
+                                    'freq'])
+        assert_frame_equal(result, expected)
+
+        # mix of time and str
+        df['C2'] = ['a', 'a', 'b', 'c']
+        result = df.describe()
+        # with a mix of datetime / object columns, the index gets reordered
+        expected['C2'] = [4, 3, np.nan, np.nan, 'a', 2]
+        assert_frame_equal(result, expected)
+
+        # just str
+        expected = DataFrame({'C2': [4, 3, 'a', 2]},
+                             index=['count', 'unique', 'top', 'freq'])
+        result = df[['C2']].describe()
+        assert_frame_equal(result, expected)
+
+        # mix of time, str, numeric
+        df['C3'] = [2, 4, 6, 8]
+        result = df.describe()
+        expected = DataFrame({"C3": [4., 5., 2.5819889, 2., 3.5, 5., 6.5, 8.]},
+                             index=['count', 'mean', 'std', 'min', '25%',
+                                    '50%', '75%', 'max'])
+        assert_frame_equal(result, expected)
+        assert_frame_equal(df.describe(), df[['C3']].describe())
+
+        assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe())
+        assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe())
+
     def test_no_order(self):
         _skip_if_no_scipy()
         s = Series([0, 1, np.nan, 3])
@@ -1053,6 +1214,9 @@ def test_equals(self):
         df2 = df1.set_index(['floats'], append=True)
         self.assert_(df3.equals(df2))
 
+    def test_describe_raises(self):
+        with tm.assertRaises(NotImplementedError):
+            tm.makePanel().describe()
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 2bb720e1644ad..6e7c9edfc4025 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -2219,56 +2219,6 @@ def test_quantile_multi(self):
                                      Timestamp('2000-01-10 19:12:00')],
                                     index=[.2, .2]))
 
-    def test_describe(self):
-        _ = self.series.describe()
-        _ = self.ts.describe()
-
-    def test_describe_percentiles(self):
-        desc = self.series.describe(percentile_width=50)
-        assert '75%' in desc.index
-        assert '25%' in desc.index
-
-        desc = self.series.describe(percentile_width=95)
-        assert '97.5%' in desc.index
-        assert '2.5%' in desc.index
-
-    def test_describe_objects(self):
-        s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
-        result = s.describe()
-        expected = Series({'count': 7, 'unique': 4,
-                           'top': 'a', 'freq': 3}, index=result.index)
-        assert_series_equal(result, expected)
-
-        dt = list(self.ts.index)
-        dt.append(dt[0])
-        ser = Series(dt)
-        rs = ser.describe()
-        min_date = min(dt)
-        max_date = max(dt)
-        xp = Series({'count': len(dt),
-                     'unique': len(self.ts.index),
-                     'first': min_date, 'last': max_date, 'freq': 2,
-                     'top': min_date}, index=rs.index)
-        assert_series_equal(rs, xp)
-
-    def test_describe_empty(self):
-        result = self.empty.describe()
-
-        self.assertEqual(result['count'], 0)
-        self.assert_(result.drop('count').isnull().all())
-
-        nanSeries = Series([np.nan])
-        nanSeries.name = 'NaN'
-        result = nanSeries.describe()
-        self.assertEqual(result['count'], 0)
-        self.assert_(result.drop('count').isnull().all())
-
-    def test_describe_none(self):
-        noneSeries = Series([None])
-        noneSeries.name = 'None'
-        assert_series_equal(noneSeries.describe(),
-                            Series([0, 0], index=['count', 'unique']))
-
     def test_append(self):
         appendedSeries = self.series.append(self.objSeries)
         for idx, value in compat.iteritems(appendedSeries):
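
For reference, a minimal usage sketch of the ``percentiles`` / ``percentile_width`` behavior this patch introduces. This is illustrative only, not part of the diff: the frame and column names below are invented, and it assumes a build with this change applied.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'num': np.arange(10.),
                       'obj': list('aabbcccdde')})

    # Numeric data: the requested percentiles appear in the index, and the
    # median (50%) is inserted automatically even though it was not requested.
    df.describe(percentiles=[.05, .95])

    # With a mix of numeric and object columns, only the numeric columns
    # are summarized.
    df.describe()

    # An object Series gets count/unique/top/freq instead.
    df['obj'].describe()

    # The old keyword still works, but raises a FutureWarning and is
    # translated to percentiles=[.025, .5, .975].
    df.describe(percentile_width=95)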