Commit
Merge remote-tracking branch 'upstream/master' into io_csv_docstring_fixed

* upstream/master:
  DOC: Fixes to docstring to add validation to CI (pandas-dev#23560)
  DOC: Remove incorrect periods at the end of parameter types (pandas-dev#23600)
  MAINT: tm.assert_raises_regex --> pytest.raises (pandas-dev#23592)
  DOC: Updating Series.resample and DataFrame.resample docstrings (pandas-dev#23197)
  ENH: Support for partition_cols in to_parquet (pandas-dev#23321)
  TST: Use intp as expected dtype in IntervalIndex indexing tests (pandas-dev#23609)
thoo committed Nov 11, 2018
2 parents 5e85114 + 2cea659 commit d0600f9
Showing 239 changed files with 2,335 additions and 2,180 deletions.
2 changes: 1 addition & 1 deletion ci/code_checks.sh
@@ -151,7 +151,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then

MSG='Doctests generic.py' ; echo $MSG
pytest -q --doctest-modules pandas/core/generic.py \
-k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -to_json -transpose -values -xs"
-k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs"
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests top-level reshaping functions' ; echo $MSG
37 changes: 37 additions & 0 deletions doc/source/io.rst
@@ -4673,6 +4673,43 @@ Passing ``index=True`` will *always* write the index, even if that's not the
underlying engine's default behavior.


Partitioning Parquet files
''''''''''''''''''''''''''

.. versionadded:: 0.24.0

Parquet supports partitioning of data based on the values of one or more columns.

.. ipython:: python

   df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
   df.to_parquet(fname='test', engine='pyarrow',
                 partition_cols=['a'], compression=None)

The `fname` specifies the parent directory to which data will be saved.
The `partition_cols` are the column names by which the dataset will be partitioned.
Columns are partitioned in the order they are given. The partition splits are
determined by the unique values in the partition columns.
The above example creates a partitioned dataset that may look like:

.. code-block:: text

   test
   ├── a=0
   │   ├── 0bac803e32dc42ae83fddfd029cbdebc.parquet
   │   └── ...
   └── a=1
       ├── e6ab24a4f45147b49b54a662f0c412a3.parquet
       └── ...

.. ipython:: python
   :suppress:

   from shutil import rmtree
   try:
       rmtree('test')
   except Exception:
       pass

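The partition layout sketched in the tree above can be illustrated without a parquet engine at all. The following is an illustrative emulation only (plain CSV files stand in for parquet files, and the `a=<value>` directory names mimic the pyarrow-style convention), not the library's implementation:

```python
import tempfile
from pathlib import Path

import pandas as pd

# Emulate hive-style partitioning: one 'a=<value>' directory per unique
# value of the partition column, each holding that slice of the frame.
df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
root = Path(tempfile.mkdtemp())
for value, part in df.groupby('a'):
    subdir = root / 'a={}'.format(value)
    subdir.mkdir()
    # the partition column is encoded in the path, so drop it from the data
    part.drop(columns='a').to_csv(subdir / 'part-0.csv', index=False)

sorted(p.name for p in root.iterdir())  # → ['a=0', 'a=1']
```

This mirrors the rule stated above: the splits are determined by the unique values in the partition columns, one subdirectory per value.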
.. _io.sql:

SQL Queries
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -236,6 +236,7 @@ Other Enhancements
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
- :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`)
- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)

.. _whatsnew_0240.api_breaking:
22 changes: 11 additions & 11 deletions pandas/core/dtypes/inference.py
@@ -73,7 +73,7 @@ def is_string_like(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Examples
--------
@@ -127,7 +127,7 @@ def is_iterator(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -172,7 +172,7 @@ def is_file_like(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -203,7 +203,7 @@ def is_re(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -227,7 +227,7 @@ def is_re_compilable(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -261,7 +261,7 @@ def is_list_like(obj, allow_sets=True):
Parameters
----------
obj : The object to check.
obj : The object to check
allow_sets : boolean, default True
If this parameter is False, sets will not be considered list-like
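As an aside (not part of this diff), the behavior described for ``allow_sets`` can be exercised through the public wrapper ``pandas.api.types.is_list_like``:

```python
from pandas.api.types import is_list_like

# sequences and array-likes are list-like; strings and scalars are not
assert is_list_like([1, 2, 3])
assert is_list_like((1, 2))
assert not is_list_like('abc')
assert not is_list_like(1)

# sets count as list-like by default, but not when allow_sets=False
assert is_list_like({1, 2})
assert not is_list_like({1, 2}, allow_sets=False)
```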
@@ -310,7 +310,7 @@ def is_array_like(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -343,7 +343,7 @@ def is_nested_list_like(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -384,7 +384,7 @@ def is_dict_like(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -408,7 +408,7 @@ def is_named_tuple(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
@@ -468,7 +468,7 @@ def is_sequence(obj):
Parameters
----------
obj : The object to check.
obj : The object to check
Returns
-------
55 changes: 35 additions & 20 deletions pandas/core/frame.py
@@ -864,12 +864,17 @@ def iterrows(self):
data types, the iterator returns a copy and not a view, and writing
to it will have no effect.
Returns
-------
Yields
------
index : label or tuple of label
The index of the row. A tuple for a `MultiIndex`.
data : Series
The data of the row as a Series.
it : generator
A generator that iterates over the rows of the frame.
See also
See Also
--------
itertuples : Iterate over DataFrame rows as namedtuples of the values.
iteritems : Iterate over (column name, Series) pairs.
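The switch from a ``Returns`` to a ``Yields`` section matches how ``iterrows`` actually behaves; a minimal illustration (not part of the diff):

```python
import pandas as pd

df = pd.DataFrame({'x': [1, 2]}, index=['a', 'b'])

# iterrows yields (index label, row-as-Series) pairs
rows = list(df.iterrows())
label, row = rows[0]
assert label == 'a'
assert isinstance(row, pd.Series)
assert row['x'] == 1
```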
@@ -1970,7 +1975,7 @@ def to_feather(self, fname):
to_feather(self, fname)

def to_parquet(self, fname, engine='auto', compression='snappy',
index=None, **kwargs):
index=None, partition_cols=None, **kwargs):
"""
Write a DataFrame to the binary parquet format.
@@ -1984,7 +1989,11 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
Parameters
----------
fname : str
String file path.
File path or Root Directory path. Will be used as Root Directory
path while writing a partitioned dataset.
.. versionchanged:: 0.24.0
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
@@ -1999,6 +2008,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
.. versionadded:: 0.24.0
partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
.. versionadded:: 0.24.0
**kwargs
Additional arguments passed to the parquet library. See
:ref:`pandas io <io.parquet>` for more details.
@@ -2027,7 +2042,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
"""
from pandas.io.parquet import to_parquet
to_parquet(self, fname, engine,
compression=compression, index=index, **kwargs)
compression=compression, index=index,
partition_cols=partition_cols, **kwargs)

@Substitution(header='Write out the column names. If a list of strings '
'is given, it is assumed to be aliases for the '
@@ -3940,6 +3956,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
necessary. Setting to False will improve the performance of this
method
Returns
-------
DataFrame
Examples
--------
>>> df = pd.DataFrame({'month': [1, 4, 7, 10],
@@ -3980,10 +4000,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
2 2014 4 40
3 2013 7 84
4 2014 10 31
Returns
-------
dataframe : DataFrame
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not isinstance(keys, list):
@@ -6683,6 +6699,15 @@ def round(self, decimals=0, *args, **kwargs):
of `decimals` which are not columns of the input will be
ignored.
Returns
-------
DataFrame
See Also
--------
numpy.around
Series.round
Examples
--------
>>> df = pd.DataFrame(np.random.random([3, 3]),
@@ -6708,15 +6733,6 @@ def round(self, decimals=0, *args, **kwargs):
first 0.0 1 0.17
second 0.0 1 0.58
third 0.9 0 0.49
Returns
-------
DataFrame object
See Also
--------
numpy.around
Series.round
"""
from pandas.core.reshape.concat import concat

@@ -6782,7 +6798,6 @@ def corr(self, method='pearson', min_periods=1):
Examples
--------
>>> import numpy as np
>>> histogram_intersection = lambda a, b: np.minimum(a, b
... ).sum().round(decimals=1)
>>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
