TST: Test unnamed columns with index_col for Excel (pandas-dev#23874)

Pingviinituutti · Feb 28, 2019 · 46044a2 · 46044a2
1 parent a0532d3
commit 46044a2
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 12 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1428,7 +1428,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
 - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
 - Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
-- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`)
+- Bug in :meth:`read_excel()` in which column names were sometimes not properly converted to string in Python 2.x (:issue:`23874`)
+- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and index columns were being parsed anyway (:issue:`18792`, :issue:`20480`)
 - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
 - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`)
 

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -662,10 +662,14 @@ def _parse_cell(cell_contents, cell_typ):
 
                 output[asheetname] = parser.read(nrows=nrows)
 
-                if ((not squeeze or isinstance(output[asheetname], DataFrame))
-                        and header_names):
-                    output[asheetname].columns = output[
-                        asheetname].columns.set_names(header_names)
+                if not squeeze or isinstance(output[asheetname], DataFrame):
+                    if header_names:
+                        output[asheetname].columns = output[
+                            asheetname].columns.set_names(header_names)
+                    elif compat.PY2:
+                        output[asheetname].columns = _maybe_convert_to_string(
+                            output[asheetname].columns)
+
             except EmptyDataError:
                 # No Data, return an empty DataFrame
                 output[asheetname] = DataFrame()
@@ -810,6 +814,39 @@ def _trim_excel_header(row):
     return row
 
 
+def _maybe_convert_to_string(row):
+    """
+    Convert elements in a row to string from Unicode.
+
+    This is purely a Python 2.x patch and is performed ONLY when all
+    elements of the row are string-like.
+
+    Parameters
+    ----------
+    row : array-like
+        The row of data to convert.
+
+    Returns
+    -------
+    converted : array-like
+    """
+    if compat.PY2:
+        converted = []
+
+        for i in range(len(row)):
+            if isinstance(row[i], compat.string_types):
+                try:
+                    converted.append(str(row[i]))
+                except UnicodeEncodeError:
+                    break
+            else:
+                break
+        else:
+            row = converted
+
+    return row
+
+
 def _fill_mi_header(row, control_row):
     """Forward fills blank entries in row, but only inside the same parent index
 
@@ -838,7 +875,7 @@ def _fill_mi_header(row, control_row):
             control_row[i] = False
             last = row[i]
 
-    return row, control_row
+    return _maybe_convert_to_string(row), control_row
 
 # fill blank if index_col not None
 

diff --git a/pandas/tests/io/data/test1.xls b/pandas/tests/io/data/test1.xls
diff --git a/pandas/tests/io/data/test1.xlsm b/pandas/tests/io/data/test1.xlsm
diff --git a/pandas/tests/io/data/test1.xlsx b/pandas/tests/io/data/test1.xlsx
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -264,6 +264,18 @@ def test_index_col_empty(self, ext):
                                               names=["A", "B", "C"]))
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("index_col", [None, 2])
+    def test_index_col_with_unnamed(self, ext, index_col):
+        # see gh-18792
+        result = self.get_exceldf("test1", ext, "Sheet4",
+                                  index_col=index_col)
+        expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
+                             columns=["Unnamed: 0", "col1", "col2"])
+        if index_col:
+            expected = expected.set_index(expected.columns[index_col])
+
+        tm.assert_frame_equal(result, expected)
+
     def test_usecols_pass_non_existent_column(self, ext):
         msg = ("Usecols do not match columns, "
                "columns expected but not found: " + r"\['E'\]")
@@ -923,9 +935,9 @@ def test_read_excel_multiindex_empty_level(self, ext):
             })
 
             expected = DataFrame({
-                ("One", u"x"): {0: 1},
-                ("Two", u"X"): {0: 3},
-                ("Two", u"Y"): {0: 7},
+                ("One", "x"): {0: 1},
+                ("Two", "X"): {0: 3},
+                ("Two", "Y"): {0: 7},
                 ("Zero", "Unnamed: 4_level_1"): {0: 0}
             })
 
@@ -942,9 +954,9 @@ def test_read_excel_multiindex_empty_level(self, ext):
 
             expected = pd.DataFrame({
                 ("Beg", "Unnamed: 1_level_1"): {0: 0},
-                ("Middle", u"x"): {0: 1},
-                ("Tail", u"X"): {0: 3},
-                ("Tail", u"Y"): {0: 7}
+                ("Middle", "x"): {0: 1},
+                ("Tail", "X"): {0: 3},
+                ("Tail", "Y"): {0: 7}
             })
 
             df.to_excel(path)