Support for partition_cols in to_parquet #23321

Merged (19 commits) on Nov 10, 2018

Changes from 3 commits
2 changes: 2 additions & 0 deletions doc/source/io.rst
@@ -4574,6 +4574,8 @@ Several caveats.
* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
on an attempt at serialization.
* ``partition_cols`` will be used for partitioning the dataset, where the dataset will be written to multiple
  files in the path specified. Therefore, the path specified must be a directory path.

Contributor:

The rest of the items in this list feel more like limitations of pandas / these engines. Requiring that path be a directory when partition_cols is set doesn't seem to fit here.

I think this is important / different enough to deserve a new small section below "Handling Indexes", with

  1. A description of what partition_cols requires (list of column names, directory for file path)
  2. A description of why you might want to use partition_cols
  3. A small example.

Contributor Author:

done

You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
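To make the new caveat concrete, here is a minimal sketch of a partitioned write (the directory name and columns are illustrative, and the write assumes an engine that supports partitioned datasets, e.g. pyarrow >= 0.7.0 or fastparquet):

    import pandas as pd

    df = pd.DataFrame({'year': [2017, 2018, 2018],
                       'value': [1.0, 2.0, 3.0]})

    # the path is a directory; one sub-directory is created per partition value
    df.to_parquet('parquet_dir', engine='pyarrow',
                  partition_cols=['year'], compression=None)

    # expected hive-style layout (file names are engine-generated):
    # parquet_dir/year=2017/<...>.parquet
    # parquet_dir/year=2018/<...>.parquet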
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -213,6 +213,7 @@ Other Enhancements
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
- :func:`~DataFrame.to_parquet` now supports writing a DataFrame as a directory of parquet files partitioned by a subset of the columns. (:issue:`23283`).
Contributor:

Probably best to mention "with the pyarrow engine (this was previously supported with fastparquet)."

Contributor Author:

done

- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)

.. _whatsnew_0240.api_breaking:
14 changes: 11 additions & 3 deletions pandas/core/frame.py
@@ -1970,7 +1970,7 @@ def to_feather(self, fname):
to_feather(self, fname)

def to_parquet(self, fname, engine='auto', compression='snappy',
index=None, **kwargs):
index=None, partition_cols=None, **kwargs):
"""
Write a DataFrame to the binary parquet format.

@@ -1984,7 +1984,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
Parameters
----------
fname : str
String file path.
File path or Root Directory path. Will be used as Root Directory
path while writing a partitioned dataset.

Contributor:

Side issue: we use path elsewhere for IO routines. We should change this as well (out of scope here); we would have to deprecate (the name), unfortunately.

Contributor:

We actually use path on the top-level .to_parquet; not sure how this ended up named this way.
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
Expand All @@ -1998,6 +1999,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
the behavior depends on the chosen engine.

.. versionadded:: 0.24.0
partition_cols : list, optional, default None
Column names by which to partition the dataset
Columns are partitioned in the order they are given
The behaviour applies only to pyarrow >= 0.7.0 and fastparquet
For other versions, this argument will be ignored.
Contributor:

Is it actually ignored for older pyarrows? I would have hoped it would raise when pyarrow gets the unrecognized argument.

Contributor:

Actually it seems like we raise. Could you update this?

Contributor Author:

done

.. versionadded:: 0.24.0

**kwargs
Additional arguments passed to the parquet library. See
@@ -2027,7 +2034,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
"""
from pandas.io.parquet import to_parquet
to_parquet(self, fname, engine,
compression=compression, index=index, **kwargs)
compression=compression, index=index,
partition_cols=partition_cols, **kwargs)

@Substitution(header='Write out the column names. If a list of strings '
'is given, it is assumed to be aliases for the '
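To illustrate the docstring note above that columns are partitioned in the order they are given, a sketch with two partition columns (the frame and column names are illustrative; reading the directory back assumes a pyarrow build that understands partitioned datasets):

    import pandas as pd

    df = pd.DataFrame({'bool': [True, False],
                       'int': [1, 2],
                       'x': [0.1, 0.2]})

    # 'bool' becomes the outer directory level, 'int' the inner one
    df.to_parquet('out_dir', partition_cols=['bool', 'int'], compression=None)
    # out_dir/bool=True/int=1/<...>.parquet
    # out_dir/bool=False/int=2/<...>.parquet

    # round trip: read_parquet accepts the directory path
    result = pd.read_parquet('out_dir')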
41 changes: 30 additions & 11 deletions pandas/io/parquet.py
@@ -104,7 +104,8 @@ def __init__(self):
self.api = pyarrow

def write(self, df, path, compression='snappy',
coerce_timestamps='ms', index=None, **kwargs):
coerce_timestamps='ms', index=None, partition_cols=None,
**kwargs):
self.validate_dataframe(df)

# Only validate the index if we're writing it.
@@ -125,9 +126,15 @@ def write(self, df, path, compression='snappy',

else:
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
if partition_cols is not None:
self.api.parquet.write_to_dataset(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps,
partition_cols=partition_cols, **kwargs)
else:
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)

def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)
@@ -206,12 +213,16 @@ def __init__(self):
)
self.api = fastparquet

def write(self, df, path, compression='snappy', index=None, **kwargs):
def write(self, df, path, compression='snappy', index=None,
partition_cols=None, **kwargs):
self.validate_dataframe(df)
# thriftpy/protocol/compact.py:339:
# DeprecationWarning: tostring() is deprecated.
# Use tobytes() instead.

if partition_cols is not None:
kwargs['file_scheme'] = 'hive'

if is_s3_url(path):
# path is s3:// so we need to open the s3file in 'wb' mode.
# TODO: Support 'ab'
@@ -224,7 +235,8 @@ def write(self, df, path, compression='snappy', index=None, **kwargs):

with catch_warnings(record=True):
self.api.write(path, df, compression=compression,
write_index=index, **kwargs)
write_index=index, partition_on=partition_cols,
**kwargs)

def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
@@ -244,15 +256,15 @@ def read(self, path, columns=None, **kwargs):


def to_parquet(df, path, engine='auto', compression='snappy', index=None,
**kwargs):
partition_cols=None, **kwargs):
"""
Write a DataFrame to the parquet format.

Parameters
----------
df : DataFrame
path : string
File path
path : str
File path or Root Directory path. Will be used as Root Directory path
while writing a partitioned dataset.
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
@@ -266,11 +278,18 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
engine's default behavior will be used.

.. versionadded 0.24.0
partition_cols : list, optional
Column names by which to partition the dataset
Columns are partitioned in the order they are given
The behaviour applies only to pyarrow >= 0.7.0 and fastparquet
For other versions, this argument will be ignored.
.. versionadded:: 0.24.0
kwargs
Additional keyword arguments passed to the engine
"""
impl = get_engine(engine)
return impl.write(df, path, compression=compression, index=index, **kwargs)
return impl.write(df, path, compression=compression, index=index,
partition_cols=partition_cols, **kwargs)


def read_parquet(path, engine='auto', columns=None, **kwargs):
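Put differently, the two write paths added above roughly reduce to the following direct engine calls (a sketch; the frame, column names, and output directories are placeholders):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq
    import fastparquet

    df = pd.DataFrame({'bool': [True, False], 'int': [1, 2], 'x': [0.1, 0.2]})
    table = pa.Table.from_pandas(df)

    # pyarrow: partitioned writes go through write_to_dataset instead of write_table
    pq.write_to_dataset(table, 'pa_dir', compression=None,
                        partition_cols=['bool', 'int'])

    # fastparquet: partition_on together with the hive file scheme
    fastparquet.write('fp_dir', df, compression=None,
                      file_scheme='hive', partition_on=['bool', 'int'])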
33 changes: 33 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -1,4 +1,5 @@
""" test parquet compat """
import os

import pytest
import datetime
@@ -478,6 +479,26 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
check_round_trip(df_compat, pa,
path='s3://pandas-test/pyarrow.parquet')

def test_partition_cols_supported(self, pa_ge_070, df_full):
# GH #23283
partition_cols = ['bool', 'int']
df = df_full
with tm.ensure_clean_dir() as path:
df.to_parquet(path, partition_cols=partition_cols,
compression=None)
import pyarrow.parquet as pq
dataset = pq.ParquetDataset(path, validate_schema=False)
assert len(dataset.partitions.partition_names) == 2
assert dataset.partitions.partition_names == set(partition_cols)

def test_ignore_partition_cols_lt_070(self, pa_lt_070, df_full):
# GH #23283
partition_cols = ['bool', 'int']
pa = pa_lt_070
df = df_full
check_round_trip(df, pa,
write_kwargs={'partition_cols': partition_cols})


class TestParquetFastParquet(Base):

@@ -543,3 +564,15 @@ def test_s3_roundtrip(self, df_compat, s3_resource, fp):
# GH #19134
check_round_trip(df_compat, fp,
path='s3://pandas-test/fastparquet.parquet')

def test_partition_cols_supported(self, fp, df_full):
# GH #23283
partition_cols = ['bool', 'int']
df = df_full
with tm.ensure_clean_dir() as path:
df.to_parquet(path, partition_cols=partition_cols,
compression=None)
assert os.path.exists(path)
import fastparquet
actual_partition_cols = fastparquet.ParquetFile(path, False).cats
assert len(actual_partition_cols) == 2
9 changes: 9 additions & 0 deletions pandas/tests/util/test_testing.py
@@ -875,3 +875,12 @@ def test_datapath_missing(datapath, request):
)

assert result == expected


def test_create_temp_directory():
temppath = ''
with tm.ensure_clean_dir() as path:
assert os.path.exists(path)
assert os.path.isdir(path)
temppath = path
assert not os.path.exists(temppath)
anjsudh marked this conversation as resolved.
Show resolved Hide resolved
17 changes: 17 additions & 0 deletions pandas/util/testing.py
@@ -772,6 +772,23 @@ def ensure_clean(filename=None, return_filelike=False):
print("Exception on removing file: {error}".format(error=e))


@contextmanager
def ensure_clean_dir():
"""
Get a temporary directory path and agrees to remove on close.

Yields
----------
Temporary directory path
"""
directory_name = tempfile.mkdtemp(suffix='')
try:
yield directory_name
finally:
import shutil
shutil.rmtree(directory_name)


# -----------------------------------------------------------------------------
# Comparators

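Finally, a short usage sketch of the new ensure_clean_dir helper in a test (the partitioned write inside the block is just one example of what might be written to the temporary directory, and the exact partition directory name is an assumption):

    import os

    import pandas as pd
    import pandas.util.testing as tm

    df = pd.DataFrame({'bool': [True, False], 'x': [1, 2]})

    with tm.ensure_clean_dir() as path:
        df.to_parquet(path, partition_cols=['bool'], compression=None)
        assert os.path.isdir(os.path.join(path, 'bool=True'))
    # on exit the directory and everything written into it is removed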