From 5983820f37e7f5415ee23a1e420114a9e20533ad Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Sun, 25 Sep 2022 20:40:45 +0200 Subject: [PATCH 1/4] Raise ParserError instead of IndexError when specifying an incorrect number of columns with index_col for the read_csv C parser. --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/io/parsers/c_parser_wrapper.py | 8 ++++++++ pandas/tests/io/parser/common/test_read_errors.py | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index d726f69286469..d9ed5ab02924a 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -35,7 +35,7 @@ Other enhancements - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) -- +- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser. .. --------------------------------------------------------------------------- .. _whatsnew_160.notable_bug_fixes: diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 6e4ea85548230..dc104b3020f14 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -33,6 +33,7 @@ from pandas.io.parsers.base_parser import ( ParserBase, + ParserError, is_index_col, ) @@ -270,6 +271,13 @@ def read( # implicit index, no index names arrays = [] + if self.index_col and self._reader.leading_cols != len(self.index_col): + raise ParserError( + "Could not construct index. 
Requested to use " + f"{len(self.index_col)} number of columns, but " + f"{self._reader.leading_cols} left to parse." + ) + for i in range(self._reader.leading_cols): if self.index_col is None: values = data.pop(i) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index f52af109626e9..fc30ebff0d93a 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -292,6 +292,18 @@ def test_conflict_on_bad_line(all_parsers, error_bad_lines, warn_bad_lines): parser.read_csv(StringIO(data), on_bad_lines="error", **kwds) +def test_bad_header_uniform_error(all_parsers): + parser = all_parsers + data = "+++123456789...\ncol1,col2,col3,col4\n1,2,3,4\n" + msg = "Expected 2 fields in line 2, saw 4" + if parser.engine == "c": + msg = "Could not construct index. Requested to use 1 " + "number of columns, but 3 left to parse." + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") + + def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): # see gh-15925 parser = all_parsers From 0a2d4c6312812530f460cef7ac9a76de3565c0b1 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 27 Sep 2022 08:39:55 +0200 Subject: [PATCH 2/4] Move whatsnew entry --- doc/source/whatsnew/v1.6.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index d9ed5ab02924a..1789df6b48073 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -35,7 +35,6 @@ Other enhancements - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, 
:class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) -- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser. .. --------------------------------------------------------------------------- .. _whatsnew_160.notable_bug_fixes: @@ -115,7 +114,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Other API changes ^^^^^^^^^^^^^^^^^ -- +- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` now raises ``ParserError`` instead of ``IndexError`` when using the c parser. - .. --------------------------------------------------------------------------- From e7a68d4a8aeb5408609069d027e0ae586e1d7c6a Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 27 Sep 2022 08:41:40 +0200 Subject: [PATCH 3/4] Cleanup after moving whatsnew --- doc/source/whatsnew/v1.6.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 1789df6b48073..df6fe02f19c16 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -147,7 +147,6 @@ Performance improvements - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). - Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`) -- .. --------------------------------------------------------------------------- .. _whatsnew_160.bug_fixes: From 82dda087702f26b43d823b2eabd0f5c1af13e750 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 27 Sep 2022 08:43:19 +0200 Subject: [PATCH 4/4] Cleanup after moving whatsnew (pt. 
2) --- doc/source/whatsnew/v1.6.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index df6fe02f19c16..79d498c4d1669 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -35,6 +35,7 @@ Other enhancements - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) - Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`) - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`) +- .. --------------------------------------------------------------------------- .. _whatsnew_160.notable_bug_fixes: @@ -147,6 +148,7 @@ Performance improvements - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). - Performance improvement to :func:`read_sas` with ``blank_missing=True`` (:issue:`48502`) +- .. --------------------------------------------------------------------------- .. _whatsnew_160.bug_fixes: