Skip to content

Commit

Permalink
TST: Test unnamed columns with index_col for Excel (pandas-dev#23874)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and Pingviinituutti committed Feb 28, 2019
1 parent a0532d3 commit 46044a2
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 12 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1428,7 +1428,8 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`)
- Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`)
- Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`)
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and index columns were parsed anyway (:issue:`20480`)
- Bug in :meth:`read_excel()` in which column names were sometimes not properly converted to string in Python 2.x (:issue:`23874`)
- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and index columns were parsed anyway (:issue:`18792`, :issue:`20480`)
- Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`)
- :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`)

Expand Down
47 changes: 42 additions & 5 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,10 +662,14 @@ def _parse_cell(cell_contents, cell_typ):

output[asheetname] = parser.read(nrows=nrows)

if ((not squeeze or isinstance(output[asheetname], DataFrame))
and header_names):
output[asheetname].columns = output[
asheetname].columns.set_names(header_names)
if not squeeze or isinstance(output[asheetname], DataFrame):
if header_names:
output[asheetname].columns = output[
asheetname].columns.set_names(header_names)
elif compat.PY2:
output[asheetname].columns = _maybe_convert_to_string(
output[asheetname].columns)

except EmptyDataError:
# No Data, return an empty DataFrame
output[asheetname] = DataFrame()
Expand Down Expand Up @@ -810,6 +814,39 @@ def _trim_excel_header(row):
return row


def _maybe_convert_to_string(row):
    """
    Convert every element of a row from Unicode to ``str`` on Python 2.

    The conversion is all-or-nothing: if any element is not string-like,
    or cannot be converted without a ``UnicodeEncodeError``, the row is
    returned unchanged. Outside of Python 2 the row is always returned
    unchanged.

    Parameters
    ----------
    row : array-like
        The row of data to convert.

    Returns
    -------
    converted : array-like
        A new list of ``str`` elements when every element was converted,
        otherwise the original ``row`` object.
    """
    if not compat.PY2:
        return row

    as_str = []

    for position in range(len(row)):
        element = row[position]

        # A single non-string element leaves the whole row untouched.
        if not isinstance(element, compat.string_types):
            return row

        try:
            as_str.append(str(element))
        except UnicodeEncodeError:
            # The element cannot be encoded as str, so keep the original
            # row rather than returning a partially converted one.
            return row

    return as_str


def _fill_mi_header(row, control_row):
"""Forward fills blank entries in row, but only inside the same parent index
Expand Down Expand Up @@ -838,7 +875,7 @@ def _fill_mi_header(row, control_row):
control_row[i] = False
last = row[i]

return row, control_row
return _maybe_convert_to_string(row), control_row

# fill blank if index_col not None

Expand Down
Binary file modified pandas/tests/io/data/test1.xls
Binary file not shown.
Binary file modified pandas/tests/io/data/test1.xlsm
Binary file not shown.
Binary file modified pandas/tests/io/data/test1.xlsx
Binary file not shown.
24 changes: 18 additions & 6 deletions pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,18 @@ def test_index_col_empty(self, ext):
names=["A", "B", "C"]))
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("index_col", [None, 2])
def test_index_col_with_unnamed(self, ext, index_col):
    # see gh-18792
    #
    # Read "Sheet4" of the "test1" workbook and check the parsed frame,
    # once without an index column and once with the column at
    # position 2 used as the index.
    result = self.get_exceldf("test1", ext, "Sheet4",
                              index_col=index_col)
    expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
                         columns=["Unnamed: 0", "col1", "col2"])

    # Compare against None explicitly: a column position of 0 is valid
    # but falsy, so a plain truthiness check would skip set_index for it.
    if index_col is not None:
        expected = expected.set_index(expected.columns[index_col])

    tm.assert_frame_equal(result, expected)

def test_usecols_pass_non_existent_column(self, ext):
msg = ("Usecols do not match columns, "
"columns expected but not found: " + r"\['E'\]")
Expand Down Expand Up @@ -923,9 +935,9 @@ def test_read_excel_multiindex_empty_level(self, ext):
})

expected = DataFrame({
("One", u"x"): {0: 1},
("Two", u"X"): {0: 3},
("Two", u"Y"): {0: 7},
("One", "x"): {0: 1},
("Two", "X"): {0: 3},
("Two", "Y"): {0: 7},
("Zero", "Unnamed: 4_level_1"): {0: 0}
})

Expand All @@ -942,9 +954,9 @@ def test_read_excel_multiindex_empty_level(self, ext):

expected = pd.DataFrame({
("Beg", "Unnamed: 1_level_1"): {0: 0},
("Middle", u"x"): {0: 1},
("Tail", u"X"): {0: 3},
("Tail", u"Y"): {0: 7}
("Middle", "x"): {0: 1},
("Tail", "X"): {0: 3},
("Tail", "Y"): {0: 7}
})

df.to_excel(path)
Expand Down

0 comments on commit 46044a2

Please sign in to comment.