diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 91cd7315f7213..115c0e7eaf8b0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1373,8 +1373,7 @@ Files with fixed width columns While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. The function parameters -to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and -a different usage of the ``delimiter`` parameter: +to ``read_fwf`` are largely the same as ``read_csv`` with five extra parameters: * ``colspecs``: A list of pairs (tuples) giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). @@ -1383,12 +1382,42 @@ a different usage of the ``delimiter`` parameter: behavior, if not specified, is to infer. * ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. -* ``delimiter``: Characters to consider as filler characters in the fixed-width file. - Can be used to specify the filler character of the fields - if it is not spaces (e.g., '~'). +* ``keep_whitespace``: A boolean or a tuple(bool,bool) indicating how whitespace + at the (start,end) of each field / column should be handled. +* ``whitespace_chars``: A string of characters to strip from the start and/or end + of fields / columns when 'keep_whitespace' contains a False value. +* ``delimiter``: Character(s) separating columns when inferring 'colspecs'. Consider a typical fixed-width data file: +.. 
ipython:: python + + data = ( + "Company One Alice Smythe 7567.89 5 A D B D F\n" + "Global Org Bob Jonstone 8765.43 6 F C A E BC\n" + ) + df = pd.read_fwf(StringIO(data), + header=None, + widths=[12,12,8,2,12], + keep_whitespace=(True,False), + names=["Company", "Contact", "Pay_sum", "Pay_count", "Credit_scores"], + dtypes=[str,str,float,int,str], + # Do not convert data to NaN: + na_filter=False, + ) + df + df.values + +Note that trailing whitespace was removed from every text field. However, the +*leading* whitespace in ``Credit_scores`` was +preserved. + +This is because ``keep_whitespace`` is set to ``(True, False)`` (start, end) and +``whitespace_chars`` defaults to ``' '`` and ``'\t'`` (space and tab). + + +Parsing a table is possible (see also ``read_table``): + .. ipython:: python data1 = ( @@ -1398,41 +1427,40 @@ Consider a typical fixed-width data file: "id1230 413.836124 184.375703 11916.8\n" "id1948 502.953953 173.237159 12468.3" ) - with open("bar.csv", "w") as f: - f.write(data1) -In order to parse this file into a ``DataFrame``, we simply need to supply the -column specifications to the ``read_fwf`` function along with the file name: +In order to parse this data set into a ``DataFrame``, we simply need to supply the +column specifications to the ``read_fwf`` function: .. ipython:: python # Column specifications are a list of half-intervals colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] - df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0) + df = pd.read_fwf(StringIO(data1), + colspecs=colspecs, + header=None, + index_col=0 + ) df Note how the parser automatically picks column names X. when -``header=None`` argument is specified. Alternatively, you can supply just the -column widths for contiguous columns: - -.. ipython:: python - - # Widths are a list of integers - widths = [6, 14, 13, 10] - df = pd.read_fwf("bar.csv", widths=widths, header=None) - df +``header=None`` argument is specified.
-The parser will take care of extra white spaces around the columns -so it's ok to have extra separation between the columns in the file. +The parser will take care of extra white spaces around the numeric data columns, and +trailing spaces on string data, so it's ok to have extra separation between the columns +in the file. By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the first 100 rows of the file. It can do it only in cases when the columns are aligned and correctly separated by the provided ``delimiter`` (default delimiter is whitespace). + .. ipython:: python - df = pd.read_fwf("bar.csv", header=None, index_col=0) + df = pd.read_fwf(StringIO(data1), + header=None, + index_col=0 + ) df ``read_fwf`` supports the ``dtype`` parameter for specifying the types of @@ -1440,10 +1468,16 @@ parsed columns to be different from the inferred type. .. ipython:: python - pd.read_fwf("bar.csv", header=None, index_col=0).dtypes - pd.read_fwf("bar.csv", header=None, dtype={2: "object"}).dtypes + pd.read_fwf(StringIO(data1), + header=None, + index_col=0).dtypes + + pd.read_fwf(StringIO(data1), + header=None, + dtype={2: "object"}).dtypes .. 
ipython:: python + :okexcept: :suppress: os.remove("bar.csv") diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3ddc8b8919228..9eae359dac377 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -290,6 +290,7 @@ Other enhancements - Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`) - Added support for SQLAlchemy 2.0 (:issue:`40686`) - Added support for ``decimal`` parameter when ``engine="pyarrow"`` in :func:`read_csv` (:issue:`51302`) +- Added new arguments ``keep_whitespace`` and ``whitespace_chars`` to :func:`read_fwf`, giving finer-grained and more intuitive control over whitespace handling (:issue:`51569`) - :class:`Index` set operations :meth:`Index.union`, :meth:`Index.intersection`, :meth:`Index.difference`, and :meth:`Index.symmetric_difference` now support ``sort=True``, which will always return a sorted result, unlike the default ``sort=None`` which does not sort in some cases (:issue:`25151`) - Added new escape mode "latex-math" to avoid escaping "$" in formatter (:issue:`50040`) @@ -828,8 +829,9 @@ Deprecations - Deprecated :meth:`Series.backfill` in favor of :meth:`Series.bfill` (:issue:`33396`) - Deprecated :meth:`DataFrame.pad` in favor of :meth:`DataFrame.ffill` (:issue:`33396`) - Deprecated :meth:`DataFrame.backfill` in favor of :meth:`DataFrame.bfill` (:issue:`33396`) +- Deprecated using the ``delimiter`` argument of :func:`read_fwf` to preserve whitespace in favor of ``keep_whitespace`` and ``whitespace_chars`` (:issue:`51569`) - Deprecated :meth:`~pandas.io.stata.StataReader.close`.
Use :class:`~pandas.io.stata.StataReader` as a context manager instead (:issue:`49228`) - Deprecated producing a scalar when iterating over a :class:`.DataFrameGroupBy` or a :class:`.SeriesGroupBy` that has been grouped by a ``level`` parameter that is a list of length 1; a tuple of length one will be returned instead (:issue:`51583`) .. --------------------------------------------------------------------------- .. _whatsnew_200.prior_deprecations: diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 4e1bcf54c0ae9..39abcb226f23d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -110,6 +110,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: self.decimal = kwds["decimal"] self.comment = kwds["comment"] + ## GH51569 + self.keep_whitespace = kwds.get("keep_whitespace") + self.whitespace_chars = kwds.get("whitespace_chars") # Set self.data to something that can read lines.
if isinstance(f, list): @@ -1194,11 +1197,20 @@ def __init__( comment: str | None, skiprows: set[int] | None = None, infer_nrows: int = 100, + ## GH51569 + keep_whitespace: bool | tuple[bool, bool] = (False, False), + whitespace_chars: str = " \t", ) -> None: self.f = f self.buffer: Iterator | None = None self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t " self.comment = comment + self.keep_whitespace = keep_whitespace + ## Backwards compatibility means supporting delimiter: + if delimiter: + whitespace_chars = whitespace_chars + delimiter + self.whitespace_chars = whitespace_chars + if colspecs == "infer": self.colspecs = self.detect_colspecs( infer_nrows=infer_nrows, skiprows=skiprows @@ -1224,6 +1236,33 @@ def __init__( "2 element tuple or list of integers" ) + ## GH51569 + ## Accept boolean, but convert to tuple(bool,bool) for (left,right) of fields: + if isinstance(self.keep_whitespace, bool): + self.keep_whitespace = (keep_whitespace, keep_whitespace) + ## Ensure tuple is (bool,bool): + if ( + isinstance(self.keep_whitespace, tuple) + and len(self.keep_whitespace) == 2 + and isinstance(self.keep_whitespace[0], bool) + and isinstance(self.keep_whitespace[1], bool) + ): + # Define custom lstrip & rstrip *once*, at __init__: + if self.keep_whitespace[0] is True: + self.ltrim = lambda x: x + else: + self.ltrim = lambda x: x.lstrip(self.whitespace_chars) + if self.keep_whitespace[1] is True: + self.rtrim = lambda x: x + else: + self.rtrim = lambda x: x.rstrip(self.whitespace_chars) + else: + raise ValueError( + "'keep_whitespace' must be a bool or tuple(bool,bool)." + f"\nReceived '{type(self.keep_whitespace).__name__}': " + f"'{self.keep_whitespace}'." + ) + def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]: """ Read rows from self.f, skipping as specified. 
@@ -1295,8 +1334,14 @@ def __next__(self) -> list[str]: line = next(self.f) # type: ignore[arg-type] else: line = next(self.f) # type: ignore[arg-type] + + line = line.rstrip("\r\n") + # Note: 'colspecs' is a sequence of half-open intervals. - return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs] + return [self.ltrim(self.rtrim(line[from_:to])) for (from_, to) in self.colspecs] + + +# return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs] class FixedWidthFieldParser(PythonParser): @@ -1319,6 +1364,9 @@ def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: self.comment, self.skiprows, self.infer_nrows, + ## GH51569 + self.keep_whitespace, + self.whitespace_chars, ) def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 8b2a02f0ac63a..03d126176ffd6 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -456,9 +456,19 @@ class _Fwf_Defaults(TypedDict): colspecs: Literal["infer"] infer_nrows: Literal[100] widths: None + keep_whitespace: Literal(False, False) + whitespace_chars: Literal(" \t") + + +_fwf_defaults = { + "colspecs": "infer", + "infer_nrows": 100, + "widths": None, + "keep_whitespace": (False, False), + "whitespace_chars": " \t", +} -_fwf_defaults: _Fwf_Defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} _pyarrow_unsupported = { @@ -1271,10 +1281,13 @@ def read_fwf( widths: Sequence[int] | None = None, infer_nrows: int = 100, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + ## GH51569 + keep_whitespace: bool | tuple[bool, bool] = (False, False), + whitespace_chars: str = " \t", **kwds, ) -> DataFrame | TextFileReader: r""" - Read a table of fixed-width formatted lines into DataFrame. + Read a file of fixed-width lines into DataFrame. 
Also supports optionally iterating or breaking of the file into chunks. @@ -1302,6 +1315,8 @@ def read_fwf( infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`. + delimiter : str, default ``' '`` and ``'\t'`` characters + Character(s) treated as the column / field separator when inferring `colspecs`. dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames Which dtype_backend to use, e.g. whether a DataFrame should have NumPy arrays, nullable dtypes are used for all dtypes that have a nullable @@ -1312,6 +1327,14 @@ def read_fwf( .. versionadded:: 2.0 + keep_whitespace : bool or tuple of (bool, bool), default (False, False) + Whether to keep whitespace at the (start, end) of each field / column. + whitespace_chars : str, default ``' '`` and ``'\t'`` characters + Characters stripped from each side of a field / column for which + ``keep_whitespace`` is False. + + .. versionadded:: 2.0 + **kwds : optional Optional keyword arguments can be passed to ``TextFileReader``. @@ -1323,6 +1346,7 @@ def read_fwf( See Also -------- + read_table : Read data from table (i.e. columns with delimiting spaces). DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. read_csv : Read a comma-separated values (csv) file into DataFrame.
@@ -1371,6 +1395,9 @@ def read_fwf( check_dtype_backend(dtype_backend) kwds["dtype_backend"] = dtype_backend + ## GH51569 + kwds["keep_whitespace"] = keep_whitespace + kwds["whitespace_chars"] = whitespace_chars return _read(filepath_or_buffer, kwds) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 2a05a3aa3297e..16e9195f7f036 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -645,8 +645,15 @@ def test_whitespace_preservation(): fwf_data = """ a bbb ccdd """ + ## This test is a mess: + ## It's trying to keep whitespace via passing in a non-space delimiter: result = read_fwf( - StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t" + StringIO(fwf_data), + widths=[3, 3], + header=header, + skiprows=[0], + # delimiter="\n\t", + keep_whitespace=True, ) expected = read_csv(StringIO(csv_data), header=header) tm.assert_frame_equal(result, expected) @@ -1058,3 +1065,89 @@ def test_url_urlopen(): ).columns tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize( + "keep_whitespace, data, expected", + [ + ( + # Preserve all whitespace: + True, + # 10-byte wide fields: + ["left ", " centre ", " right "], + DataFrame(["left ", " centre ", " right "]), + ), + ( + # Preserve no whitespace: + False, + # 10-byte wide fields: + ["left ", " centre ", " right "], + DataFrame(["left", "centre", "right"]), + ), + # Preserve leading whitespace only: + ( + (True, False), + ["left ", " centre ", " right"], + DataFrame(["left", " centre", " right"]), + ), + # Preserve trailing whitespace only: + ( + (False, True), + ["left ", " centre ", " right"], + DataFrame(["left ", "centre ", "right"]), + ), + ], +) +def test_fwf_keep_whitespace_true(keep_whitespace, data, expected): + # see GH51569 + + result = read_fwf( + StringIO("\n".join(data)), + header=None, + widths=[10], + keep_whitespace=keep_whitespace, + ) + tm.assert_frame_equal(result, expected) 
+ + +# @pytest.mark.parametrize( +# "keep_whitespace, data, expected", +# [ +# ( +# # Preserve all whitespace: +# True, +# # 10-byte wide fields: +# ["left ", " centre ", " right "], +# DataFrame(["left ", " centre ", " right "]), +# ), +# ( +# # Preserve no whitespace: +# False, +# # 10-byte wide fields: +# ["left ", " centre ", " right "], +# DataFrame(["left", "centre", "right"]), +# ), +# # Preserve leading whitespace only: +# ( +# (True, False), +# ["left ", " centre ", " right"], +# DataFrame(["left", " centre", " right"]), +# ), +# # Preserve trailing whitespace only: +# ( +# (False, True), +# ["left ", " centre ", " right"], +# DataFrame(["left ", "centre ", "right"]), +# ), +# ], +# ) +# def test_fwf_keep_whitespace_true(keep_whitespace, data, expected): +# # see GH51569 + +# result = read_fwf( +# StringIO("\n".join(data)), +# header=None, +# widths=[10], +# keep_whitespace=keep_whitespace, +# ) +# tm.assert_frame_equal(result, expected) diff --git a/pyproject.toml b/pyproject.toml index 6eef8e4fa9b7c..1508121901cbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -427,7 +427,8 @@ disable = [ [tool.pytest.ini_options] # sync minversion with pyproject.toml & install.rst minversion = "7.0" -addopts = "--strict-data-files --strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml" +## addopts = "--strict-data-files --strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml" +addopts = "--strict-markers --strict-config --capture=no --durations=30 --junitxml=test-data.xml" empty_parameter_set_mark = "fail_at_collect" xfail_strict = true testpaths = "pandas"