From d4c0cb73c0d291234053dbd2a9ae5e6104731156 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 31 Dec 2021 12:08:00 -0800 Subject: [PATCH 01/13] Add doc and validation --- pandas/io/parsers/python_parser.py | 5 ++++- pandas/io/parsers/readers.py | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 12d5e4599cee0..cf815d673ca6f 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -197,6 +197,7 @@ class MyDialect(csv.Dialect): skipinitialspace = self.skipinitialspace quoting = self.quoting lineterminator = "\n" + strict = not isinstance(self.on_bad_lines, callable) dia = MyDialect @@ -990,7 +991,9 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: actual_len = len(l) if actual_len > col_len: - if ( + if callable(self.on_bad_lines): + content.append(self.on_bad_lines(l)) + elif ( self.on_bad_lines == self.BadLineHandleMethod.ERROR or self.on_bad_lines == self.BadLineHandleMethod.WARN ): diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 464a9b0b9f88e..c9879addb9f25 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -9,6 +9,7 @@ from textwrap import fill from typing import ( Any, + Callable, NamedTuple, ) import warnings @@ -354,7 +355,7 @@ .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' +on_bad_lines : string, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : @@ -364,6 +365,12 @@ .. versionadded:: 1.3.0 + - callable, function with signature ``(bad_line: str) -> str`` that will + process a single bad line. Only supported when ``engine="python"`` + and + + .. 
versionadded:: 1.4.0 + delim_whitespace : bool, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\\s+'``. If this option @@ -1367,7 +1374,7 @@ def _refine_defaults_read( sep: str | object, error_bad_lines: bool | None, warn_bad_lines: bool | None, - on_bad_lines: str | None, + on_bad_lines: str | Callable | None, names: ArrayLike | None | object, prefix: str | None | object, defaults: dict[str, Any], @@ -1399,7 +1406,7 @@ def _refine_defaults_read( Whether to error on a bad line or not. warn_bad_lines : str or None Whether to warn on a bad line or not. - on_bad_lines : str or None + on_bad_lines : str, callable or None An option for handling bad lines or a sentinel value(None). names : array-like, optional List of column names to use. If the file contains a header row, @@ -1503,6 +1510,12 @@ def _refine_defaults_read( kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN elif on_bad_lines == "skip": kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP + elif callable(on_bad_lines): + if engine != "python": + raise ValueError( + "on_bad_line can only be a callable function if engine='python'" + ) + kwds["on_bad_lines"] = on_bad_lines else: raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines") else: From f654e39110e031767e32c5e08a94ab0df282bd72 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 31 Dec 2021 15:14:18 -0800 Subject: [PATCH 02/13] Add whatsnew, testing, and docs --- doc/source/user_guide/io.rst | 23 ++++++-- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/io/parsers/python_parser.py | 2 +- pandas/io/parsers/readers.py | 6 +- .../io/parser/test_python_parser_only.py | 59 +++++++++++++++++++ pandas/tests/io/parser/test_unsupported.py | 12 ++++ 6 files changed, 95 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 9faef9b15bfb4..5cae40e833ac3 100644 --- a/doc/source/user_guide/io.rst 
+++ b/doc/source/user_guide/io.rst @@ -1305,14 +1305,29 @@ You can elect to skip bad lines: 0 1 2 3 1 8 9 10 +Or pass a callable function to handle the bad line if ``engine="python"``. +The bad line will be a list of strings that was split by the ``sep``: + +.. code-block:: ipython + + In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python") + Out[30]: + a b c + 0 1 2 3 + 1 5 6 7 + 2 8 9 10 + + .. versionadded:: 1.4.0 + + You can also use the ``usecols`` parameter to eliminate extraneous column data that appear in some lines but not others: .. code-block:: ipython - In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) + In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) - Out[30]: + Out[31]: a b c 0 1 2 3 1 4 5 6 @@ -1324,9 +1339,9 @@ fields are filled with ``NaN``. .. code-block:: ipython - In [31]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) + In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) - Out[31]: + Out[32]: a b c d 0 1 2 3 NaN 1 4 5 6 7 diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index a47cee645bf4b..ee1e17634e19e 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -256,6 +256,7 @@ Other enhancements - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. 
Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) +- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"` for custom handling of bad lines (:issue:`5686`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index cf815d673ca6f..514737220f9b4 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -197,7 +197,7 @@ class MyDialect(csv.Dialect): skipinitialspace = self.skipinitialspace quoting = self.quoting lineterminator = "\n" - strict = not isinstance(self.on_bad_lines, callable) + strict = not callable(self.on_bad_lines) dia = MyDialect diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index c9879addb9f25..9861c654c379a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -365,9 +365,9 @@ .. versionadded:: 1.3.0 - - callable, function with signature ``(bad_line: str) -> str`` that will - process a single bad line. Only supported when ``engine="python"`` - and + - callable, function with signature ``(bad_line: list[str]) -> list[str]`` + that will process a single bad line. ``bad_line`` is a list of strings + split by the ``sep``. Only supported when ``engine="python"`` .. 
versionadded:: 1.4.0 diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index df8be721ec871..67091153bd77b 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -329,3 +329,62 @@ def readline(self): return self.data parser.read_csv(NoNextBuffer("a\n1")) + + +@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]]) +def test_on_bad_lines_callable(python_parser_only, bad_line_func): + # GH 5686 + parser = python_parser_only + bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) + expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_callable_write_to_external_list(python_parser_only): + # GH 5686 + parser = python_parser_only + bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + lst = [] + + def bad_line_func(bad_line): + lst.append(bad_line) + return ["2", "3"] + + result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) + expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) + tm.assert_frame_equal(result, expected) + assert lst == [["2", "3", "4", "5", "6"]] + + +@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]]) +@pytest.mark.parametrize("sep", [",", "111"]) +def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep): + # GH 5686 + # iterator=True has a separate code path than iterator=False + parser = python_parser_only + bad_sio = StringIO(f"0{sep}1\nhi{sep}there\nfoo{sep}bar{sep}baz\ngood{sep}bye") + result_iter = parser.read_csv( + bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep + ) + expecteds = [ + {"0": "hi", "1": "there"}, + {"0": "foo", "1": "bar"}, + {"0": "good", "1": "bye"}, + ] + for i, (result, expected) in enumerate(zip(result_iter, expecteds)): + expected = DataFrame(expected, 
index=range(i, i + 1)) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only): + # GH 5686 + parser = python_parser_only + bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + msg = "This function is buggy." + + def bad_line_func(bad_line): + raise ValueError(msg) + + with pytest.raises(ValueError, match=msg): + parser.read_csv(bad_sio, on_bad_lines=bad_line_func) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 46ae629d09112..f359a73382f32 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -149,3 +149,15 @@ def test_pyarrow_engine(self): kwargs[default] = "warn" with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) + + def test_on_bad_lines_callable_python_only(self, all_parsers): + # GH 5686 + sio = StringIO("a,b\n1,2") + bad_lines_func = lambda x: x + parser = all_parsers + if all_parsers.engine != "python": + msg = "on_bad_line can only be a callable function if engine='python'" + with pytest.raises(ValueError, match=msg): + parser.read_csv(sio, on_bad_lines=bad_lines_func) + else: + parser.read_csv(sio, on_bad_lines=bad_lines_func) From 6c12102c0bed38eaedbde9bfed0de8070d59b0ca Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 31 Dec 2021 15:18:43 -0800 Subject: [PATCH 03/13] Fix whatsnew formatting --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ee1e17634e19e..540709aa76098 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -256,7 +256,7 @@ Other enhancements - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. 
Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) -- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"` for custom handling of bad lines (:issue:`5686`) +- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`) From 1aee16cecab7d303de90db4138afce5dba45adbc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 31 Dec 2021 15:30:09 -0800 Subject: [PATCH 04/13] Update doc --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 9861c654c379a..4d2d1c2222ff7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -355,7 +355,7 @@ .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : string, default 'error' +on_bad_lines : string or callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). 
Allowed values are : From d759a88adbcdae619b8479971bb8b6b9ea0a2ad8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 31 Dec 2021 17:24:18 -0800 Subject: [PATCH 05/13] fix docstring validation --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 4d2d1c2222ff7..b8b1c41bf8907 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -355,7 +355,7 @@ .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : string or callable, default 'error' +on_bad_lines : str or callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : From 9b73ae4a1d576b163cf1236092b7529f362e5194 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 2 Jan 2022 10:42:26 -0800 Subject: [PATCH 06/13] Test is callable returns a row longer than expected length --- .../tests/io/parser/test_python_parser_only.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 67091153bd77b..7c54d8e05c171 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -13,7 +13,10 @@ import pytest -from pandas.errors import ParserError +from pandas.errors import ( + ParserError, + ParserWarning, +) from pandas import ( DataFrame, @@ -388,3 +391,14 @@ def bad_line_func(bad_line): with pytest.raises(ValueError, match=msg): parser.read_csv(bad_sio, on_bad_lines=bad_line_func) + + +def test_on_bad_lines_callable_not_expected_length(python_parser_only): + # GH 5686 + parser = python_parser_only + bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + + with tm.assert_produces_warning(ParserWarning, match="Length of header or 
names"): + result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x) + expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) + tm.assert_frame_equal(result, expected) From b77da025e7027af065e301af2cf5e87d224de860 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 3 Jan 2022 14:45:11 -0800 Subject: [PATCH 07/13] Address comments --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/io/parsers/python_parser.py | 1 - pandas/io/parsers/readers.py | 7 ++-- .../io/parser/test_python_parser_only.py | 36 ++++++++++++++++--- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 0b7dd3e89f87d..e36a637454f06 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -207,7 +207,7 @@ Other enhancements - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`) - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`) -- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`) +- :func:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`) diff --git a/pandas/io/parsers/python_parser.py 
b/pandas/io/parsers/python_parser.py index 514737220f9b4..53be3e3fd08cd 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -197,7 +197,6 @@ class MyDialect(csv.Dialect): skipinitialspace = self.skipinitialspace quoting = self.quoting lineterminator = "\n" - strict = not callable(self.on_bad_lines) dia = MyDialect diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index b8b1c41bf8907..ef17a2bec7c88 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -355,7 +355,7 @@ .. deprecated:: 1.3.0 The ``on_bad_lines`` parameter should be used instead to specify behavior upon encountering a bad line instead. -on_bad_lines : str or callable, default 'error' +on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error' Specifies what to do upon encountering a bad line (a line with too many fields). Allowed values are : @@ -367,7 +367,10 @@ - callable, function with signature ``(bad_line: list[str]) -> list[str]`` that will process a single bad line. ``bad_line`` is a list of strings - split by the ``sep``. Only supported when ``engine="python"`` + split by the ``sep``. If the function returns a new list of strings + with more elements than expected, a ``ParserWarning`` will be emitted + while dropping extra elements. + Only supported when ``engine="python"`` .. 
versionadded:: 1.4.0 diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 7c54d8e05c171..737e762908938 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -338,7 +338,12 @@ def readline(self): def test_on_bad_lines_callable(python_parser_only, bad_line_func): # GH 5686 parser = python_parser_only - bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func) expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) tm.assert_frame_equal(result, expected) @@ -347,7 +352,12 @@ def test_on_bad_lines_callable(python_parser_only, bad_line_func): def test_on_bad_lines_callable_write_to_external_list(python_parser_only): # GH 5686 parser = python_parser_only - bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) lst = [] def bad_line_func(bad_line): @@ -366,7 +376,13 @@ def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, # GH 5686 # iterator=True has a separate code path than iterator=False parser = python_parser_only - bad_sio = StringIO(f"0{sep}1\nhi{sep}there\nfoo{sep}bar{sep}baz\ngood{sep}bye") + data = f""" +0{sep}1 +hi{sep}there +foo{sep}bar{sep}baz +good{sep}bye +""" + bad_sio = StringIO(data) result_iter = parser.read_csv( bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep ) @@ -383,7 +399,12 @@ def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only): # GH 5686 parser = python_parser_only - bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) msg = "This function is buggy." 
def bad_line_func(bad_line): @@ -396,7 +417,12 @@ def bad_line_func(bad_line): def test_on_bad_lines_callable_not_expected_length(python_parser_only): # GH 5686 parser = python_parser_only - bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4") + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) with tm.assert_produces_warning(ParserWarning, match="Length of header or names"): result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x) From 39a83b45872cdb4c119a65c5163a4ed0fc4ae089 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 3 Jan 2022 15:30:29 -0800 Subject: [PATCH 08/13] Allow callable behavior returning None --- pandas/io/parsers/python_parser.py | 4 +++- pandas/io/parsers/readers.py | 11 ++++++----- pandas/tests/io/parser/test_python_parser_only.py | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 53be3e3fd08cd..55ad6be3100e7 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -991,7 +991,9 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]: if actual_len > col_len: if callable(self.on_bad_lines): - content.append(self.on_bad_lines(l)) + new_l = self.on_bad_lines(l) + if new_l is not None: + content.append(new_l) elif ( self.on_bad_lines == self.BadLineHandleMethod.ERROR or self.on_bad_lines == self.BadLineHandleMethod.WARN diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ef17a2bec7c88..c417a511a1d81 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -365,11 +365,12 @@ .. versionadded:: 1.3.0 - - callable, function with signature ``(bad_line: list[str]) -> list[str]`` - that will process a single bad line. ``bad_line`` is a list of strings - split by the ``sep``. 
If the function returns a new list of strings - with more elements than expected, a ``ParserWarning`` will be emitted - while dropping extra elements. + - callable, function with signature + ``(bad_line: list[str]) -> list[str] | None`` that will process a single + bad line. ``bad_line`` is a list of strings split by the ``sep``. + If the function returns ``None``, the bad line will be ignored. + If the function returns a new list of strings with more elements than + expected, a ``ParserWarning`` will be emitted while dropping extra elements. Only supported when ``engine="python"`` .. versionadded:: 1.4.0 diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 737e762908938..2d086e0a1b6a1 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -428,3 +428,18 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only): result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x) expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_callable_returns_none(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2 +2,3,4,5,6 +3,4 +""" + bad_sio = StringIO(data) + + result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None) + expected = DataFrame({"a": [1, 3], "b": [2, 4]}) + tm.assert_frame_equal(result, expected) From a5f3656b2c7a111ba3f69bdfb24045e14e6a3b8e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 3 Jan 2022 15:37:30 -0800 Subject: [PATCH 09/13] Add test for index_col inferred --- pandas/tests/io/parser/test_python_parser_only.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 2d086e0a1b6a1..ca9051b8148db 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ 
b/pandas/tests/io/parser/test_python_parser_only.py @@ -443,3 +443,17 @@ def test_on_bad_lines_callable_returns_none(python_parser_only): result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None) expected = DataFrame({"a": [1, 3], "b": [2, 4]}) tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_index_col_inferred(python_parser_only): + # GH 5686 + parser = python_parser_only + data = """a,b +1,2,3 +4,5,6 +""" + bad_sio = StringIO(data) + + result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"]) + expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4]) + tm.assert_frame_equal(result, expected) From 743b83b3580d3e726a32ee23b74f28fb1d38d842 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 5 Jan 2022 17:35:15 -0800 Subject: [PATCH 10/13] improve docs --- doc/source/user_guide/io.rst | 48 ++++++++++++------------------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5cae40e833ac3..6ccf41a0d3c54 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1295,57 +1295,41 @@ too many fields will raise an error by default: You can elect to skip bad lines: -.. code-block:: ipython - - In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn") - Skipping line 3: expected 3 fields, saw 4 +.. ipython:: ipython - Out[29]: - a b c - 0 1 2 3 - 1 8 9 10 + pd.read_csv(StringIO(data), on_bad_lines="warn") Or pass a callable function to handle the bad line if ``engine="python"``. The bad line will be a list of strings that was split by the ``sep``: -.. code-block:: ipython +.. versionadded:: 1.4.0 + +.. ipython:: ipython + + external_list = [] - In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python") - Out[30]: - a b c - 0 1 2 3 - 1 5 6 7 - 2 8 9 10 + def func(line): + external_list.append(line) + return line[-3:] - .. 
versionadded:: 1.4.0 + pd.read_csv(StringIO(data), on_bad_lines=func, engine="python") + external_list You can also use the ``usecols`` parameter to eliminate extraneous column data that appear in some lines but not others: -.. code-block:: ipython - - In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) +.. ipython:: ipython - Out[31]: - a b c - 0 1 2 3 - 1 4 5 6 - 2 8 9 10 + pd.read_csv(StringIO(data), usecols=[0, 1, 2]) In case you want to keep all data including the lines with too many fields, you can specify a sufficient number of ``names``. This ensures that lines with not enough fields are filled with ``NaN``. -.. code-block:: ipython - - In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) +.. ipython:: ipython - Out[32]: - a b c d - 0 1 2 3 NaN - 1 4 5 6 7 - 2 8 9 10 NaN + pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) .. _io.dialect: From bd6715272915de6e363546eea2e4121a567a5e53 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 5 Jan 2022 17:36:13 -0800 Subject: [PATCH 11/13] type --- pandas/tests/io/parser/test_python_parser_only.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index ca9051b8148db..73a6c8226b554 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -4,6 +4,7 @@ these tests out of this module as soon as the C parser can accept further arguments when parsing. 
""" +from __future__ import annotations import csv from io import ( @@ -360,7 +361,7 @@ def test_on_bad_lines_callable_write_to_external_list(python_parser_only): bad_sio = StringIO(data) lst = [] - def bad_line_func(bad_line): + def bad_line_func(bad_line: list[str]) -> list[str]: lst.append(bad_line) return ["2", "3"] From e04124ae5ee614a4b29dacbc96bf8e0fc545e301 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 5 Jan 2022 20:22:51 -0800 Subject: [PATCH 12/13] Revert "improve docs" This reverts commit 743b83b3580d3e726a32ee23b74f28fb1d38d842. --- doc/source/user_guide/io.rst | 48 ++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6ccf41a0d3c54..5cae40e833ac3 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1295,41 +1295,57 @@ too many fields will raise an error by default: You can elect to skip bad lines: -.. ipython:: ipython +.. code-block:: ipython + + In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn") + Skipping line 3: expected 3 fields, saw 4 - pd.read_csv(StringIO(data), on_bad_lines="warn") + Out[29]: + a b c + 0 1 2 3 + 1 8 9 10 Or pass a callable function to handle the bad line if ``engine="python"``. The bad line will be a list of strings that was split by the ``sep``: -.. versionadded:: 1.4.0 - -.. ipython:: ipython - - external_list = [] +.. code-block:: ipython - def func(line): - external_list.append(line) - return line[-3:] + In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python") + Out[30]: + a b c + 0 1 2 3 + 1 5 6 7 + 2 8 9 10 - pd.read_csv(StringIO(data), on_bad_lines=func, engine="python") + .. versionadded:: 1.4.0 - external_list You can also use the ``usecols`` parameter to eliminate extraneous column data that appear in some lines but not others: -.. ipython:: ipython +.. 
code-block:: ipython + + In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) - pd.read_csv(StringIO(data), usecols=[0, 1, 2]) + Out[31]: + a b c + 0 1 2 3 + 1 4 5 6 + 2 8 9 10 In case you want to keep all data including the lines with too many fields, you can specify a sufficient number of ``names``. This ensures that lines with not enough fields are filled with ``NaN``. -.. ipython:: ipython +.. code-block:: ipython + + In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) - pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) + Out[32]: + a b c d + 0 1 2 3 NaN + 1 4 5 6 7 + 2 8 9 10 NaN .. _io.dialect: From 4817770eb4588fcd52bab07a9f91ae781966264e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 6 Jan 2022 11:52:12 -0800 Subject: [PATCH 13/13] Add example of writing to an external list --- doc/source/user_guide/io.rst | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5cae40e833ac3..f3be3277003ee 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1310,13 +1310,22 @@ The bad line will be a list of strings that was split by the ``sep``: .. code-block:: ipython - In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python") - Out[30]: + In [29]: external_list = [] + + In [30]: def bad_lines_func(line): + ...: external_list.append(line) + ...: return line[-3:] + + In [31]: pd.read_csv(StringIO(data), on_bad_lines=bad_lines_func, engine="python") + Out[31]: a b c 0 1 2 3 1 5 6 7 2 8 9 10 + In [32]: external_list + Out[32]: [['4', '5', '6', '7']] + .. versionadded:: 1.4.0 @@ -1325,9 +1334,9 @@ data that appear in some lines but not others: .. code-block:: ipython - In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) + In [33]: pd.read_csv(StringIO(data), usecols=[0, 1, 2]) - Out[31]: + Out[33]: a b c 0 1 2 3 1 4 5 6 @@ -1339,9 +1348,9 @@ fields are filled with ``NaN``. .. 
code-block:: ipython - In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) + In [34]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd']) - Out[32]: + Out[34]: a b c d 0 1 2 3 NaN 1 4 5 6 7