BUG: Fix read_excel w/parse_cols & empty dataset (pandas-dev#23661)

Closes pandas-devgh-9208.
Pingviinituutti · Feb 28, 2019 · 791981f · 791981f
1 parent d1c5b7f
commit 791981f
Show file tree

Hide file tree

Showing 6 changed files with 24 additions and 11 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -1264,9 +1264,6 @@ MultiIndex
 I/O
 ^^^
 
-- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
-- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
-
 .. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
 
 Proper handling of `np.NaN` in a string data-typed column with the Python engine
@@ -1302,6 +1299,9 @@ Current Behavior:
 
 Notice how we now instead output ``np.nan`` itself instead of a stringified form of it.
 
+- Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
+- Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
+- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)
 - :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
 - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
 - :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -634,14 +634,17 @@ def _parse_cell(cell_contents, cell_typ):
                 else:
                     offset = 1 + max(header)
 
-                for col in index_col:
-                    last = data[offset][col]
-
-                    for row in range(offset + 1, len(data)):
-                        if data[row][col] == '' or data[row][col] is None:
-                            data[row][col] = last
-                        else:
-                            last = data[row][col]
+                # Check if we have an empty dataset
+                # before trying to collect data.
+                if offset < len(data):
+                    for col in index_col:
+                        last = data[offset][col]
+
+                        for row in range(offset + 1, len(data)):
+                            if data[row][col] == '' or data[row][col] is None:
+                                data[row][col] = last
+                            else:
+                                last = data[row][col]
 
             has_index_names = is_list_like(header) and len(header) > 1
 

diff --git a/pandas/tests/io/data/test1.xls b/pandas/tests/io/data/test1.xls
diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm
diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -235,6 +235,16 @@ def test_index_col_label_error(self, ext):
             self.get_exceldf("test1", ext, "Sheet1", index_col=["A"],
                              usecols=["A", "C"])
 
+    def test_index_col_empty(self, ext):
+        # see gh-9208
+        result = self.get_exceldf("test1", ext, "Sheet3",
+                                  index_col=["A", "B", "C"])
+        expected = DataFrame(columns=["D", "E", "F"],
+                             index=MultiIndex(levels=[[]] * 3,
+                                              labels=[[]] * 3,
+                                              names=["A", "B", "C"]))
+        tm.assert_frame_equal(result, expected)
+
     def test_usecols_pass_non_existent_column(self, ext):
         msg = ("Usecols do not match columns, "
                "columns expected but not found: " + r"\['E'\]")