ENH: Add support for excluding the index from Parquet files (GH20768)
dargueta authored and jorisvandenbossche committed Sep 21, 2018
1 parent 4612a82 commit bdb7a16
Showing 5 changed files with 109 additions and 11 deletions.
38 changes: 38 additions & 0 deletions doc/source/io.rst
@@ -4570,6 +4570,9 @@ dtypes, including extension dtypes such as datetime with tz.
Several caveats.

* Duplicate column names and non-string columns names are not supported.
* The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default
indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
* Index level names, if specified, must be strings.
* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
@@ -4633,6 +4636,41 @@ Read only certain columns of a parquet file.
os.remove('example_pa.parquet')
os.remove('example_fp.parquet')
Handling Indexes
''''''''''''''''

Serializing a ``DataFrame`` to parquet may include the implicit index as one or
more columns in the output file. Thus, this code:

.. ipython:: python

   df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
   df.to_parquet('test.parquet', engine='pyarrow')

creates a parquet file with *three* columns if you use ``pyarrow`` for serialization:
``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the
index `may or may not <https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write>`_
be written to the file.
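You can see the extra column by inspecting the schema of the file written above. A minimal
sketch, assuming ``pyarrow`` is installed and ``test.parquet`` was written with the
``pyarrow`` engine:

.. ipython:: python

   import pyarrow.parquet as pq

   pq.read_table('test.parquet').schema.names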

This unexpected extra column causes some databases like Amazon Redshift to reject
the file, because that column doesn't exist in the target table.

If you want to omit a dataframe's indexes when writing, pass ``index=False`` to
:func:`~pandas.DataFrame.to_parquet`:

.. ipython:: python

   df.to_parquet('test.parquet', index=False)

This creates a parquet file with just the two expected columns, ``a`` and ``b``.
If your ``DataFrame`` has a custom index, you won't get it back when you load
this file into a ``DataFrame``.
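
Reading the file back is a quick way to confirm this; only the two data columns come
back, with a fresh default integer index:

.. ipython:: python

   pd.read_parquet('test.parquet')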

Passing ``index=True`` will *always* write the index, even if that's not the
underlying engine's default behavior.
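For example, to force the index into the file when writing with ``fastparquet``
(a sketch, assuming that engine is installed):

.. ipython:: python

   df.to_parquet('test.parquet', engine='fastparquet', index=True)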


.. _io.sql:

SQL Queries
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -17,6 +17,10 @@ New features

- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)

- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing
the user to override the engine's default behavior to include or omit the
dataframe's indexes from the resulting Parquet file. (:issue:`20768`)

.. _whatsnew_0240.enhancements.extension_array_operators:

``ExtensionArray`` operator support
11 changes: 9 additions & 2 deletions pandas/core/frame.py
@@ -1902,7 +1902,7 @@ def to_feather(self, fname):
to_feather(self, fname)

def to_parquet(self, fname, engine='auto', compression='snappy',
**kwargs):
index=None, **kwargs):
"""
Write a DataFrame to the binary parquet format.
@@ -1924,6 +1924,13 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
'pyarrow' is unavailable.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output.
If ``False``, they will not be written to the file. If ``None``,
the behavior depends on the chosen engine.
.. versionadded:: 0.24.0
**kwargs
Additional arguments passed to the parquet library. See
:ref:`pandas io <io.parquet>` for more details.
@@ -1952,7 +1959,7 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
"""
from pandas.io.parquet import to_parquet
to_parquet(self, fname, engine,
compression=compression, **kwargs)
compression=compression, index=index, **kwargs)

@Substitution(header='Write out the column names. If a list of strings '
'is given, it is assumed to be aliases for the '
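
With the forwarding above in place, the new keyword is available directly from the
``DataFrame`` method. A small usage sketch (the file name ``out.parquet`` is only
illustrative):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
# index=False asks the engine to leave the index out of the file,
# index=True forces it in, and the default None defers to the engine.
df.to_parquet('out.parquet', index=False)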
33 changes: 24 additions & 9 deletions pandas/io/parquet.py
@@ -103,19 +103,27 @@ def __init__(self):
self.api = pyarrow

def write(self, df, path, compression='snappy',
coerce_timestamps='ms', **kwargs):
coerce_timestamps='ms', index=None, **kwargs):
self.validate_dataframe(df)
if self._pyarrow_lt_070:

# Only validate the index if we're writing it.
if self._pyarrow_lt_070 and index is not False:
self._validate_write_lt_070(df)
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

if index is None:
from_pandas_kwargs = {}
else:
from_pandas_kwargs = {'preserve_index': index}

if self._pyarrow_lt_060:
table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
**from_pandas_kwargs)
self.api.parquet.write_table(
table, path, compression=compression, **kwargs)

else:
table = self.api.Table.from_pandas(df)
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
self.api.parquet.write_table(
table, path, compression=compression,
coerce_timestamps=coerce_timestamps, **kwargs)
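
For reference, the ``preserve_index`` keyword assembled into ``from_pandas_kwargs`` above
is pyarrow's own switch for carrying the pandas index into the Arrow table. A standalone
sketch of the same idea, assuming pyarrow is installed (``no_index.parquet`` is just an
illustrative path):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
# preserve_index=False drops the pandas index before writing, mirroring
# to_parquet(..., index=False); leaving it unset keeps pyarrow's default
# of writing the index as an extra column.
table = pa.Table.from_pandas(df, preserve_index=False)
pq.write_table(table, 'no_index.parquet')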
@@ -197,7 +205,7 @@ def __init__(self):
)
self.api = fastparquet

def write(self, df, path, compression='snappy', **kwargs):
def write(self, df, path, compression='snappy', index=None, **kwargs):
self.validate_dataframe(df)
# thriftpy/protocol/compact.py:339:
# DeprecationWarning: tostring() is deprecated.
@@ -214,8 +222,8 @@ def write(self, df, path, compression='snappy', **kwargs):
path, _, _, _ = get_filepath_or_buffer(path)

with catch_warnings(record=True):
self.api.write(path, df,
compression=compression, **kwargs)
self.api.write(path, df, compression=compression,
write_index=index, **kwargs)

def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
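
In the ``write`` method above, ``index`` is handed to fastparquet as its ``write_index``
flag, so ``None`` keeps fastparquet's default behavior. A rough standalone equivalent,
assuming fastparquet is installed (``no_index.parquet`` is just an illustrative path):

import fastparquet
import pandas as pd

df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
# write_index=False omits the index, write_index=True forces it, and
# write_index=None (the default) lets fastparquet decide.
fastparquet.write('no_index.parquet', df, write_index=False)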
@@ -234,7 +242,8 @@ def read(self, path, columns=None, **kwargs):
return parquet_file.to_pandas(columns=columns, **kwargs)


def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
def to_parquet(df, path, engine='auto', compression='snappy', index=None,
**kwargs):
"""
Write a DataFrame to the parquet format.
@@ -250,11 +259,17 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
'pyarrow' is unavailable.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output. If
``False``, they will not be written to the file. If ``None``, the
engine's default behavior will be used.
.. versionadded:: 0.24.0
kwargs
Additional keyword arguments passed to the engine
"""
impl = get_engine(engine)
return impl.write(df, path, compression=compression, **kwargs)
return impl.write(df, path, compression=compression, index=index, **kwargs)


def read_parquet(path, engine='auto', columns=None, **kwargs):
34 changes: 34 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -368,6 +368,40 @@ def test_multiindex_with_columns(self, pa_ge_070):
check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']},
expected=df[['A', 'B']])

def test_write_ignoring_index(self, engine):
# ENH 20768
# Ensure index=False omits the index from the written Parquet file.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']})

write_kwargs = {
'compression': None,
'index': False,
}

# Because we're dropping the index, we expect the loaded dataframe to
# have the default integer index.
expected = df.reset_index(drop=True)

check_round_trip(df, engine, write_kwargs=write_kwargs,
expected=expected)

# Ignore custom index
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
index=['zyx', 'wvu', 'tsr'])

check_round_trip(df, engine, write_kwargs=write_kwargs,
expected=expected)

# Ignore multi-indexes as well.
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
df = pd.DataFrame({'one': [i for i in range(8)],
'two': [-i for i in range(8)]}, index=arrays)

expected = df.reset_index(drop=True)
check_round_trip(df, engine, write_kwargs=write_kwargs,
expected=expected)


class TestParquetPyArrow(Base):

