From d4c0cb73c0d291234053dbd2a9ae5e6104731156 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Fri, 31 Dec 2021 12:08:00 -0800
Subject: [PATCH 01/13] Add doc and validation

---
 pandas/io/parsers/python_parser.py |  5 ++++-
 pandas/io/parsers/readers.py       | 19 ++++++++++++++++---
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 12d5e4599cee0..cf815d673ca6f 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -197,6 +197,7 @@ class MyDialect(csv.Dialect):
                 skipinitialspace = self.skipinitialspace
                 quoting = self.quoting
                 lineterminator = "\n"
+                strict = not isinstance(self.on_bad_lines, callable)
 
             dia = MyDialect
 
@@ -990,7 +991,9 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
                 actual_len = len(l)
 
                 if actual_len > col_len:
-                    if (
+                    if callable(self.on_bad_lines):
+                        content.append(self.on_bad_lines(l))
+                    elif (
                         self.on_bad_lines == self.BadLineHandleMethod.ERROR
                         or self.on_bad_lines == self.BadLineHandleMethod.WARN
                     ):
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 464a9b0b9f88e..c9879addb9f25 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -9,6 +9,7 @@
 from textwrap import fill
 from typing import (
     Any,
+    Callable,
     NamedTuple,
 )
 import warnings
@@ -354,7 +355,7 @@
     .. deprecated:: 1.3.0
        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
        encountering a bad line instead.
-on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error'
+on_bad_lines : string, default 'error'
     Specifies what to do upon encountering a bad line (a line with too many fields).
     Allowed values are :
 
@@ -364,6 +365,12 @@
 
     .. versionadded:: 1.3.0
 
+        - callable, function with signature ``(bad_line: str) -> str`` that will
+          process a single bad line. Only supported when ``engine="python"``
+          and
+
+    .. versionadded:: 1.4.0
+
 delim_whitespace : bool, default False
     Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
     used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
@@ -1367,7 +1374,7 @@ def _refine_defaults_read(
     sep: str | object,
     error_bad_lines: bool | None,
     warn_bad_lines: bool | None,
-    on_bad_lines: str | None,
+    on_bad_lines: str | Callable | None,
     names: ArrayLike | None | object,
     prefix: str | None | object,
     defaults: dict[str, Any],
@@ -1399,7 +1406,7 @@ def _refine_defaults_read(
         Whether to error on a bad line or not.
     warn_bad_lines : str or None
         Whether to warn on a bad line or not.
-    on_bad_lines : str or None
+    on_bad_lines : str, callable or None
         An option for handling bad lines or a sentinel value(None).
     names : array-like, optional
         List of column names to use. If the file contains a header row,
@@ -1503,6 +1510,12 @@ def _refine_defaults_read(
             kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
         elif on_bad_lines == "skip":
             kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
+        elif callable(on_bad_lines):
+            if engine != "python":
+                raise ValueError(
+                    "on_bad_line can only be a callable function if engine='python'"
+                )
+            kwds["on_bad_lines"] = on_bad_lines
         else:
             raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
     else:

From f654e39110e031767e32c5e08a94ab0df282bd72 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Fri, 31 Dec 2021 15:14:18 -0800
Subject: [PATCH 02/13] Add whatsnew, testing, and docs

---
 doc/source/user_guide/io.rst                  | 23 ++++++--
 doc/source/whatsnew/v1.4.0.rst                |  1 +
 pandas/io/parsers/python_parser.py            |  2 +-
 pandas/io/parsers/readers.py                  |  6 +-
 .../io/parser/test_python_parser_only.py      | 59 +++++++++++++++++++
 pandas/tests/io/parser/test_unsupported.py    | 12 ++++
 6 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 9faef9b15bfb4..5cae40e833ac3 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1305,14 +1305,29 @@ You can elect to skip bad lines:
     0  1  2   3
     1  8  9  10
 
+Or pass a callable function to handle the bad line if ``engine="python"``.
+The bad line will be a list of strings that was split by the ``sep``:
+
+.. code-block:: ipython
+
+    In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python")
+    Out[30]:
+       a  b   c
+    0  1  2   3
+    1  5  6   7
+    2  8  9  10
+
+    .. versionadded:: 1.4.0
+
+
 You can also use the ``usecols`` parameter to eliminate extraneous column
 data that appear in some lines but not others:
 
 .. code-block:: ipython
 
-   In [30]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
+   In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
 
-    Out[30]:
+    Out[31]:
        a  b   c
     0  1  2   3
     1  4  5   6
@@ -1324,9 +1339,9 @@ fields are filled with ``NaN``.
 
 .. code-block:: ipython
 
-   In [31]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
+   In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
 
-   Out[31]:
+   Out[32]:
        a  b   c  d
     0  1  2   3  NaN
     1  4  5   6  7
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index a47cee645bf4b..ee1e17634e19e 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -256,6 +256,7 @@ Other enhancements
 - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
+- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"` for custom handling of bad lines (:issue:`5686`)
 - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
 - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index cf815d673ca6f..514737220f9b4 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -197,7 +197,7 @@ class MyDialect(csv.Dialect):
                 skipinitialspace = self.skipinitialspace
                 quoting = self.quoting
                 lineterminator = "\n"
-                strict = not isinstance(self.on_bad_lines, callable)
+                strict = not callable(self.on_bad_lines)
 
             dia = MyDialect
 
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index c9879addb9f25..9861c654c379a 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -365,9 +365,9 @@
 
     .. versionadded:: 1.3.0
 
-        - callable, function with signature ``(bad_line: str) -> str`` that will
-          process a single bad line. Only supported when ``engine="python"``
-          and
+        - callable, function with signature ``(bad_line: list[str]) -> list[str]``
+          that will process a single bad line. ``bad_line`` is a list of strings
+          split by the ``sep``. Only supported when ``engine="python"``
 
     .. versionadded:: 1.4.0
 
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index df8be721ec871..67091153bd77b 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -329,3 +329,62 @@ def readline(self):
             return self.data
 
     parser.read_csv(NoNextBuffer("a\n1"))
+
+
+@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
+def test_on_bad_lines_callable(python_parser_only, bad_line_func):
+    # GH 5686
+    parser = python_parser_only
+    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
+    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
+    # GH 5686
+    parser = python_parser_only
+    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    lst = []
+
+    def bad_line_func(bad_line):
+        lst.append(bad_line)
+        return ["2", "3"]
+
+    result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
+    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
+    tm.assert_frame_equal(result, expected)
+    assert lst == [["2", "3", "4", "5", "6"]]
+
+
+@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
+@pytest.mark.parametrize("sep", [",", "111"])
+def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
+    # GH 5686
+    # iterator=True has a separate code path than iterator=False
+    parser = python_parser_only
+    bad_sio = StringIO(f"0{sep}1\nhi{sep}there\nfoo{sep}bar{sep}baz\ngood{sep}bye")
+    result_iter = parser.read_csv(
+        bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
+    )
+    expecteds = [
+        {"0": "hi", "1": "there"},
+        {"0": "foo", "1": "bar"},
+        {"0": "good", "1": "bye"},
+    ]
+    for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
+        expected = DataFrame(expected, index=range(i, i + 1))
+        tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
+    # GH 5686
+    parser = python_parser_only
+    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    msg = "This function is buggy."
+
+    def bad_line_func(bad_line):
+        raise ValueError(msg)
+
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
index 46ae629d09112..f359a73382f32 100644
--- a/pandas/tests/io/parser/test_unsupported.py
+++ b/pandas/tests/io/parser/test_unsupported.py
@@ -149,3 +149,15 @@ def test_pyarrow_engine(self):
                 kwargs[default] = "warn"
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)
+
+    def test_on_bad_lines_callable_python_only(self, all_parsers):
+        # GH 5686
+        sio = StringIO("a,b\n1,2")
+        bad_lines_func = lambda x: x
+        parser = all_parsers
+        if all_parsers.engine != "python":
+            msg = "on_bad_line can only be a callable function if engine='python'"
+            with pytest.raises(ValueError, match=msg):
+                parser.read_csv(sio, on_bad_lines=bad_lines_func)
+        else:
+            parser.read_csv(sio, on_bad_lines=bad_lines_func)

From 6c12102c0bed38eaedbde9bfed0de8070d59b0ca Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Fri, 31 Dec 2021 15:18:43 -0800
Subject: [PATCH 03/13] Fix whatsnew formatting

---
 doc/source/whatsnew/v1.4.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index ee1e17634e19e..540709aa76098 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -256,7 +256,7 @@ Other enhancements
 - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
-- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"` for custom handling of bad lines (:issue:`5686`)
+- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`)
 - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
 - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`)

From 1aee16cecab7d303de90db4138afce5dba45adbc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Fri, 31 Dec 2021 15:30:09 -0800
Subject: [PATCH 04/13] Update doc

---
 pandas/io/parsers/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 9861c654c379a..4d2d1c2222ff7 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -355,7 +355,7 @@
     .. deprecated:: 1.3.0
        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
        encountering a bad line instead.
-on_bad_lines : string, default 'error'
+on_bad_lines : string or callable, default 'error'
     Specifies what to do upon encountering a bad line (a line with too many fields).
     Allowed values are :
 

From d759a88adbcdae619b8479971bb8b6b9ea0a2ad8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Fri, 31 Dec 2021 17:24:18 -0800
Subject: [PATCH 05/13] fix docstring validation

---
 pandas/io/parsers/readers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index 4d2d1c2222ff7..b8b1c41bf8907 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -355,7 +355,7 @@
     .. deprecated:: 1.3.0
        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
        encountering a bad line instead.
-on_bad_lines : string or callable, default 'error'
+on_bad_lines : str or callable, default 'error'
     Specifies what to do upon encountering a bad line (a line with too many fields).
     Allowed values are :
 

From 9b73ae4a1d576b163cf1236092b7529f362e5194 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Sun, 2 Jan 2022 10:42:26 -0800
Subject: [PATCH 06/13] Test if callable returns a row longer than expected
 length

---
 .../tests/io/parser/test_python_parser_only.py   | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 67091153bd77b..7c54d8e05c171 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -13,7 +13,10 @@
 
 import pytest
 
-from pandas.errors import ParserError
+from pandas.errors import (
+    ParserError,
+    ParserWarning,
+)
 
 from pandas import (
     DataFrame,
@@ -388,3 +391,14 @@ def bad_line_func(bad_line):
 
     with pytest.raises(ValueError, match=msg):
         parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
+
+
+def test_on_bad_lines_callable_not_expected_length(python_parser_only):
+    # GH 5686
+    parser = python_parser_only
+    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+
+    with tm.assert_produces_warning(ParserWarning, match="Length of header or names"):
+        result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x)
+    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
+    tm.assert_frame_equal(result, expected)

From b77da025e7027af065e301af2cf5e87d224de860 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Mon, 3 Jan 2022 14:45:11 -0800
Subject: [PATCH 07/13] Address comments

---
 doc/source/whatsnew/v1.4.0.rst                |  2 +-
 pandas/io/parsers/python_parser.py            |  1 -
 pandas/io/parsers/readers.py                  |  7 ++--
 .../io/parser/test_python_parser_only.py      | 36 ++++++++++++++++---
 4 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 0b7dd3e89f87d..e36a637454f06 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -207,7 +207,7 @@ Other enhancements
 - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 - Added :meth:`DataFrameGroupBy.value_counts` (:issue:`43564`)
-- :meth:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`)
+- :func:`read_csv` now accepts a ``callable`` function in ``on_bad_lines`` when ``engine="python"`` for custom handling of bad lines (:issue:`5686`)
 - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
 - :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 514737220f9b4..53be3e3fd08cd 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -197,7 +197,6 @@ class MyDialect(csv.Dialect):
                 skipinitialspace = self.skipinitialspace
                 quoting = self.quoting
                 lineterminator = "\n"
-                strict = not callable(self.on_bad_lines)
 
             dia = MyDialect
 
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index b8b1c41bf8907..ef17a2bec7c88 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -355,7 +355,7 @@
     .. deprecated:: 1.3.0
        The ``on_bad_lines`` parameter should be used instead to specify behavior upon
        encountering a bad line instead.
-on_bad_lines : str or callable, default 'error'
+on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
     Specifies what to do upon encountering a bad line (a line with too many fields).
     Allowed values are :
 
@@ -367,7 +367,10 @@
 
         - callable, function with signature ``(bad_line: list[str]) -> list[str]``
           that will process a single bad line. ``bad_line`` is a list of strings
-          split by the ``sep``. Only supported when ``engine="python"``
+          split by the ``sep``. If the function returns a new list of strings
+          with more elements than expected, a ``ParserWarning`` will be emitted
+          while dropping extra elements.
+          Only supported when ``engine="python"``
 
     .. versionadded:: 1.4.0
 
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 7c54d8e05c171..737e762908938 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -338,7 +338,12 @@ def readline(self):
 def test_on_bad_lines_callable(python_parser_only, bad_line_func):
     # GH 5686
     parser = python_parser_only
-    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    data = """a,b
+1,2
+2,3,4,5,6
+3,4
+"""
+    bad_sio = StringIO(data)
     result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
     expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
     tm.assert_frame_equal(result, expected)
@@ -347,7 +352,12 @@ def test_on_bad_lines_callable(python_parser_only, bad_line_func):
 def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
     # GH 5686
     parser = python_parser_only
-    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    data = """a,b
+1,2
+2,3,4,5,6
+3,4
+"""
+    bad_sio = StringIO(data)
     lst = []
 
     def bad_line_func(bad_line):
@@ -366,7 +376,13 @@ def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func,
     # GH 5686
     # iterator=True has a separate code path than iterator=False
     parser = python_parser_only
-    bad_sio = StringIO(f"0{sep}1\nhi{sep}there\nfoo{sep}bar{sep}baz\ngood{sep}bye")
+    data = f"""
+0{sep}1
+hi{sep}there
+foo{sep}bar{sep}baz
+good{sep}bye
+"""
+    bad_sio = StringIO(data)
     result_iter = parser.read_csv(
         bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
     )
@@ -383,7 +399,12 @@ def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func,
 def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
     # GH 5686
     parser = python_parser_only
-    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    data = """a,b
+1,2
+2,3,4,5,6
+3,4
+"""
+    bad_sio = StringIO(data)
     msg = "This function is buggy."
 
     def bad_line_func(bad_line):
@@ -396,7 +417,12 @@ def bad_line_func(bad_line):
 def test_on_bad_lines_callable_not_expected_length(python_parser_only):
     # GH 5686
     parser = python_parser_only
-    bad_sio = StringIO("a,b\n1,2\n2,3,4,5,6\n3,4")
+    data = """a,b
+1,2
+2,3,4,5,6
+3,4
+"""
+    bad_sio = StringIO(data)
 
     with tm.assert_produces_warning(ParserWarning, match="Length of header or names"):
         result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x)

From 39a83b45872cdb4c119a65c5163a4ed0fc4ae089 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Mon, 3 Jan 2022 15:30:29 -0800
Subject: [PATCH 08/13] Allow callable behavior returning None

---
 pandas/io/parsers/python_parser.py                |  4 +++-
 pandas/io/parsers/readers.py                      | 11 ++++++-----
 pandas/tests/io/parser/test_python_parser_only.py | 15 +++++++++++++++
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 53be3e3fd08cd..55ad6be3100e7 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -991,7 +991,9 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
 
                 if actual_len > col_len:
                     if callable(self.on_bad_lines):
-                        content.append(self.on_bad_lines(l))
+                        new_l = self.on_bad_lines(l)
+                        if new_l is not None:
+                            content.append(new_l)
                     elif (
                         self.on_bad_lines == self.BadLineHandleMethod.ERROR
                         or self.on_bad_lines == self.BadLineHandleMethod.WARN
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index ef17a2bec7c88..c417a511a1d81 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -365,11 +365,12 @@
 
     .. versionadded:: 1.3.0
 
-        - callable, function with signature ``(bad_line: list[str]) -> list[str]``
-          that will process a single bad line. ``bad_line`` is a list of strings
-          split by the ``sep``. If the function returns a new list of strings
-          with more elements than expected, a ``ParserWarning`` will be emitted
-          while dropping extra elements.
+        - callable, function with signature
+          ``(bad_line: list[str]) -> list[str] | None`` that will process a single
+          bad line. ``bad_line`` is a list of strings split by the ``sep``.
+          If the function returns ``None``, the bad line will be ignored.
+          If the function returns a new list of strings with more elements than
+          expected, a ``ParserWarning`` will be emitted while dropping extra elements.
           Only supported when ``engine="python"``
 
     .. versionadded:: 1.4.0
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 737e762908938..2d086e0a1b6a1 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -428,3 +428,18 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
         result = parser.read_csv(bad_sio, on_bad_lines=lambda x: x)
     expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_callable_returns_none(python_parser_only):
+    # GH 5686
+    parser = python_parser_only
+    data = """a,b
+1,2
+2,3,4,5,6
+3,4
+"""
+    bad_sio = StringIO(data)
+
+    result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
+    expected = DataFrame({"a": [1, 3], "b": [2, 4]})
+    tm.assert_frame_equal(result, expected)

From a5f3656b2c7a111ba3f69bdfb24045e14e6a3b8e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Mon, 3 Jan 2022 15:37:30 -0800
Subject: [PATCH 09/13] Add test for index_col inferred

---
 pandas/tests/io/parser/test_python_parser_only.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index 2d086e0a1b6a1..ca9051b8148db 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -443,3 +443,17 @@ def test_on_bad_lines_callable_returns_none(python_parser_only):
     result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
     expected = DataFrame({"a": [1, 3], "b": [2, 4]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_index_col_inferred(python_parser_only):
+    # GH 5686
+    parser = python_parser_only
+    data = """a,b
+1,2,3
+4,5,6
+"""
+    bad_sio = StringIO(data)
+
+    result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
+    expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
+    tm.assert_frame_equal(result, expected)

From 743b83b3580d3e726a32ee23b74f28fb1d38d842 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 5 Jan 2022 17:35:15 -0800
Subject: [PATCH 10/13] improve docs

---
 doc/source/user_guide/io.rst | 48 ++++++++++++------------------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 5cae40e833ac3..6ccf41a0d3c54 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1295,57 +1295,41 @@ too many fields will raise an error by default:
 
 You can elect to skip bad lines:
 
-.. code-block:: ipython
-
-    In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn")
-    Skipping line 3: expected 3 fields, saw 4
+.. ipython:: ipython
 
-    Out[29]:
-       a  b   c
-    0  1  2   3
-    1  8  9  10
+    pd.read_csv(StringIO(data), on_bad_lines="warn")
 
 Or pass a callable function to handle the bad line if ``engine="python"``.
 The bad line will be a list of strings that was split by the ``sep``:
 
-.. code-block:: ipython
+.. versionadded:: 1.4.0
+
+.. ipython:: ipython
+
+    external_list = []
 
-    In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python")
-    Out[30]:
-       a  b   c
-    0  1  2   3
-    1  5  6   7
-    2  8  9  10
+    def func(line):
+        external_list.append(line)
+        return line[-3:]
 
-    .. versionadded:: 1.4.0
+    pd.read_csv(StringIO(data), on_bad_lines=func, engine="python")
 
+    external_list
 
 You can also use the ``usecols`` parameter to eliminate extraneous column
 data that appear in some lines but not others:
 
-.. code-block:: ipython
-
-   In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
+.. ipython:: ipython
 
-    Out[31]:
-       a  b   c
-    0  1  2   3
-    1  4  5   6
-    2  8  9  10
+   pd.read_csv(StringIO(data), usecols=[0, 1, 2])
 
 In case you want to keep all data including the lines with too many fields, you can
 specify a sufficient number of ``names``. This ensures that lines with not enough
 fields are filled with ``NaN``.
 
-.. code-block:: ipython
-
-   In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
+.. ipython:: ipython
 
-   Out[32]:
-       a  b   c  d
-    0  1  2   3  NaN
-    1  4  5   6  7
-    2  8  9  10  NaN
+   pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
 
 .. _io.dialect:
 

From bd6715272915de6e363546eea2e4121a567a5e53 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 5 Jan 2022 17:36:13 -0800
Subject: [PATCH 11/13] type

---
 pandas/tests/io/parser/test_python_parser_only.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
index ca9051b8148db..73a6c8226b554 100644
--- a/pandas/tests/io/parser/test_python_parser_only.py
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -4,6 +4,7 @@
 these tests out of this module as soon as the C parser can accept further
 arguments when parsing.
 """
+from __future__ import annotations
 
 import csv
 from io import (
@@ -360,7 +361,7 @@ def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
     bad_sio = StringIO(data)
     lst = []
 
-    def bad_line_func(bad_line):
+    def bad_line_func(bad_line: list[str]) -> list[str]:
         lst.append(bad_line)
         return ["2", "3"]
 

From e04124ae5ee614a4b29dacbc96bf8e0fc545e301 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 5 Jan 2022 20:22:51 -0800
Subject: [PATCH 12/13] Revert "improve docs"

This reverts commit 743b83b3580d3e726a32ee23b74f28fb1d38d842.
---
 doc/source/user_guide/io.rst | 48 ++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 6ccf41a0d3c54..5cae40e833ac3 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1295,41 +1295,57 @@ too many fields will raise an error by default:
 
 You can elect to skip bad lines:
 
-.. ipython:: ipython
+.. code-block:: ipython
+
+    In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn")
+    Skipping line 3: expected 3 fields, saw 4
 
-    pd.read_csv(StringIO(data), on_bad_lines="warn")
+    Out[29]:
+       a  b   c
+    0  1  2   3
+    1  8  9  10
 
 Or pass a callable function to handle the bad line if ``engine="python"``.
 The bad line will be a list of strings that was split by the ``sep``:
 
-.. versionadded:: 1.4.0
-
-.. ipython:: ipython
-
-    external_list = []
+.. code-block:: ipython
 
-    def func(line):
-        external_list.append(line)
-        return line[-3:]
+    In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python")
+    Out[30]:
+       a  b   c
+    0  1  2   3
+    1  5  6   7
+    2  8  9  10
 
-    pd.read_csv(StringIO(data), on_bad_lines=func, engine="python")
+    .. versionadded:: 1.4.0
 
-    external_list
 
 You can also use the ``usecols`` parameter to eliminate extraneous column
 data that appear in some lines but not others:
 
-.. ipython:: ipython
+.. code-block:: ipython
+
+   In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
 
-   pd.read_csv(StringIO(data), usecols=[0, 1, 2])
+    Out[31]:
+       a  b   c
+    0  1  2   3
+    1  4  5   6
+    2  8  9  10
 
 In case you want to keep all data including the lines with too many fields, you can
 specify a sufficient number of ``names``. This ensures that lines with not enough
 fields are filled with ``NaN``.
 
-.. ipython:: ipython
+.. code-block:: ipython
+
+   In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
 
-   pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
+   Out[32]:
+       a  b   c  d
+    0  1  2   3  NaN
+    1  4  5   6  7
+    2  8  9  10  NaN
 
 .. _io.dialect:
 

From 4817770eb4588fcd52bab07a9f91ae781966264e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Thu, 6 Jan 2022 11:52:12 -0800
Subject: [PATCH 13/13] Add example of writing to an external list

---
 doc/source/user_guide/io.rst | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 5cae40e833ac3..f3be3277003ee 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1310,13 +1310,22 @@ The bad line will be a list of strings that was split by the ``sep``:
 
 .. code-block:: ipython
 
-    In [30]: pd.read_csv(StringIO(data), on_bad_lines=lambda x: x[-3:], engine="python")
-    Out[30]:
+    In [29]: external_list = []
+
+    In [30]: def bad_lines_func(line):
+        ...:     external_list.append(line)
+        ...:     return line[-3:]
+
+    In [31]: pd.read_csv(StringIO(data), on_bad_lines=bad_lines_func, engine="python")
+    Out[31]:
        a  b   c
     0  1  2   3
     1  5  6   7
     2  8  9  10
 
+    In [32]: external_list
+    Out[32]: [['4', '5', '6', '7']]
+
     .. versionadded:: 1.4.0
 
 
@@ -1325,9 +1334,9 @@ data that appear in some lines but not others:
 
 .. code-block:: ipython
 
-   In [31]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
+   In [33]: pd.read_csv(StringIO(data), usecols=[0, 1, 2])
 
-    Out[31]:
+    Out[33]:
        a  b   c
     0  1  2   3
     1  4  5   6
@@ -1339,9 +1348,9 @@ fields are filled with ``NaN``.
 
 .. code-block:: ipython
 
-   In [32]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
+   In [34]: pd.read_csv(StringIO(data), names=['a', 'b', 'c', 'd'])
 
-   Out[32]:
+   Out[34]:
        a  b   c  d
     0  1  2   3  NaN
     1  4  5   6  7