diff --git a/pandas/io/excel.py b/pandas/io/excel.py index a7e0e48de0a754..3ce8953a6edb2d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -175,12 +175,16 @@ convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally +mangle_dupe_cols : boolean, default True + Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than + 'X'...'X'. Passing in False will cause data to be overwritten if there + are duplicate names in the columns. Returns ------- parsed : DataFrame or Dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheet_name - argument for more information on when a Dict of Dataframes is returned. + DataFrame from the passed in Excel file. See notes in sheet_name + argument for more information on when a dict of DataFrames is returned. Examples -------- @@ -314,6 +318,7 @@ def read_excel(io, comment=None, skipfooter=0, convert_float=True, + mangle_dupe_cols=True, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning @@ -349,6 +354,7 @@ def read_excel(io, comment=comment, skipfooter=skipfooter, convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, **kwds) @@ -441,6 +447,7 @@ def parse(self, comment=None, skipfooter=0, convert_float=True, + mangle_dupe_cols=True, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -476,6 +483,7 @@ def parse(self, comment=comment, skipfooter=skipfooter, convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, **kwds) def _parse_excel(self, @@ -498,6 +506,7 @@ def _parse_excel(self, comment=None, skipfooter=0, convert_float=True, + mangle_dupe_cols=True, **kwds): _validate_header_arg(header) @@ -667,6 +676,7 @@ def _parse_cell(cell_contents, cell_typ): comment=comment, skipfooter=skipfooter, usecols=usecols, + mangle_dupe_cols=mangle_dupe_cols, **kwds) output[asheetname] = parser.read(nrows=nrows) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index c79affaff3c1f8..a097e0adbeb7ac 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1846,33 +1846,41 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): def test_duplicated_columns(self, *_): # see gh-5235 - write_frame = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]]) - col_names = ["A", "B", "B"] - - write_frame.columns = col_names - write_frame.to_excel(self.path, "test1") + df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], + columns=["A", "B", "B"]) + df.to_excel(self.path, "test1") + expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], + columns=["A", "B", "B.1"]) - read_frame = read_excel(self.path, "test1", index_col=0) - read_frame.columns = col_names + # By default, we mangle. + result = read_excel(self.path, "test1", index_col=0) + tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(write_frame, read_frame) + # Explicitly, we pass in the parameter. + result = read_excel(self.path, "test1", index_col=0, + mangle_dupe_cols=True) + tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 - write_frame = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A", "B"]) - write_frame.to_excel(self.path, "test1") - - read_frame = read_excel(self.path, "test1", index_col=0) - read_frame.columns = ["A", "B", "A", "B"] + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "A", "B"]) + df.to_excel(self.path, "test1") - tm.assert_frame_equal(write_frame, read_frame) + result = read_excel(self.path, "test1", index_col=0) + expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "A.1", "B.1"]) + tm.assert_frame_equal(result, expected) # see gh-10982 - write_frame.to_excel(self.path, "test1", index=False, header=False) - read_frame = read_excel(self.path, "test1", header=None) + df.to_excel(self.path, "test1", index=False, header=False) + result = read_excel(self.path, "test1", header=None) - write_frame.columns = [0, 1, 2, 3] - tm.assert_frame_equal(write_frame, read_frame) + expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + tm.assert_frame_equal(result, expected) + + msg = "Setting mangle_dupe_cols=False is not supported yet" + with pytest.raises(ValueError, match=msg): + read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) def test_swapped_columns(self, merge_cells, engine, ext): # Test for issue #5427.