Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Thoroughly dedup columns in read_csv #17060

Merged
merged 1 commit into from
Jul 25, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,11 +265,11 @@ Indexing
I/O
^^^

- Bug in :func:`read_csv` in which columns were not being thoroughly de-duplicated (:issue:`17060`)
- Bug in :func:`read_csv` in which non-integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)

- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)

Plotting
Expand Down
13 changes: 8 additions & 5 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -788,11 +788,14 @@ cdef class TextReader:
unnamed_count += 1

count = counts.get(name, 0)
if (count > 0 and self.mangle_dupe_cols
and not self.has_mi_columns):
this_header.append('%s.%d' % (name, count))
else:
this_header.append(name)

if not self.has_mi_columns and self.mangle_dupe_cols:
while count > 0:
counts[name] = count + 1
name = '%s.%d' % (name, count)
count = counts.get(name, 0)

this_header.append(name)
counts[name] = count + 1

if self.has_mi_columns:
Expand Down
10 changes: 8 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2331,10 +2331,16 @@ def _infer_columns(self):

if not have_mi_columns and self.mangle_dupe_cols:
counts = {}

for i, col in enumerate(this_columns):
cur_count = counts.get(col, 0)
if cur_count > 0:
this_columns[i] = '%s.%d' % (col, cur_count)

while cur_count > 0:
counts[col] = cur_count + 1
col = "%s.%d" % (col, cur_count)
cur_count = counts.get(col, 0)

this_columns[i] = col
counts[col] = cur_count + 1
elif have_mi_columns:

Expand Down
19 changes: 0 additions & 19 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,25 +224,6 @@ def test_unnamed_columns(self):
Index(['A', 'B', 'C', 'Unnamed: 3',
'Unnamed: 4']))

def test_duplicate_columns(self):
    """Check that repeated header names come back with numeric suffixes.

    Both ``read_csv`` and ``read_table`` (looked up on ``self``) are
    exercised, once with default arguments and once with
    ``mangle_dupe_cols=True``.
    """
    # TODO: add test for condition 'mangle_dupe_cols=False'
    # once it is actually supported (gh-12935)
    data = ("A,A,B,B,B\n"
            "1,2,3,4,5\n"
            "6,7,8,9,10\n"
            "11,12,13,14,15\n")
    expected = ['A', 'A.1', 'B', 'B.1', 'B.2']

    for method in ('read_csv', 'read_table'):
        reader = getattr(self, method)

        # First the default behavior, then mangling requested explicitly.
        for extra in ({}, {'mangle_dupe_cols': True}):
            df = reader(StringIO(data), sep=',', **extra)
            assert list(df.columns) == expected

def test_csv_mixed_type(self):
data = """A,B,C
a,1,2
Expand Down
42 changes: 42 additions & 0 deletions pandas/tests/io/parser/mangle_dupes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-

"""
Tests that duplicate columns are handled appropriately when parsed by the
CSV engine. In general, the expected result is that they are either thoroughly
de-duplicated (if mangling requested) or ignored otherwise.
"""

from pandas.compat import StringIO


class DupeColumnTests(object):
    """Checks that duplicate column names are de-duplicated on parsing.

    The class defines no reader of its own: ``read_csv`` and ``read_table``
    are looked up on ``self``.
    """

    def test_basic(self):
        """Plain repeats get ``.1``, ``.2``, ... suffixes from both readers."""
        # TODO: add test for condition "mangle_dupe_cols=False"
        # once it is actually supported (gh-12935)
        data = "a,a,b,b,b\n1,2,3,4,5"
        expected = ["a", "a.1", "b", "b.1", "b.2"]

        for method in ("read_csv", "read_table"):
            reader = getattr(self, method)

            # Check default behavior, then the explicitly requested one.
            for extra in ({}, {"mangle_dupe_cols": True}):
                df = reader(StringIO(data), sep=",", **extra)
                assert list(df.columns) == expected

    def test_thorough_mangle(self):
        """A mangled name must not collide with a name already in the header."""
        # see gh-17060
        cases = [
            ("a,a,a.1\n1,2,3",
             ["a", "a.1", "a.1.1"]),
            ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
             ["a", "a.1", "a.1.1", "a.1.1.1",
              "a.1.1.1.1", "a.1.1.1.1.1"]),
            ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
             ["a", "a.1", "a.3", "a.1.1",
              "a.2", "a.2.1", "a.3.1"]),
        ]

        for data, expected in cases:
            df = self.read_csv(StringIO(data), sep=",",
                               mangle_dupe_cols=True)
            assert list(df.columns) == expected
4 changes: 3 additions & 1 deletion pandas/tests/io/parser/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@
from .c_parser_only import CParserTests
from .parse_dates import ParseDatesTests
from .compression import CompressionTests
from .mangle_dupes import DupeColumnTests
from .multithread import MultithreadTests
from .python_parser_only import PythonParserTests
from .dtypes import DtypeTests


class BaseParser(CommentTests, CompressionTests,
ConverterTests, DialectTests,
DtypeTests, DupeColumnTests,
HeaderTests, IndexColTests,
MultithreadTests, NAvaluesTests,
ParseDatesTests, ParserTests,
SkipRowsTests, UsecolsTests,
QuotingTests, DtypeTests):
QuotingTests):

def read_csv(self, *args, **kwargs):
raise NotImplementedError
Expand Down