diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bce6a735b7b07..ee53b83c1764c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -96,6 +96,19 @@ For example: buffer = io.BytesIO() data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") +:.. _whatsnew_read_csv_table_precision_default: + +Change in default floating precision for ``read_csv`` and ``read_table`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` previously defaulted to a parser that +could read floating point numbers slightly incorrectly with respect to the last bit in precision. +The option ``floating_precision="high"`` has always been available to avoid this issue. +Beginning with this version, the default is now to use the more accurate parser by making +``floating_precision=None`` correspond to the high precision parser, and the new option +``floating_precision="legacy"`` to use the legacy parser. The change to using the higher precision +parser by default should have no impact on performance. (:issue:`17154`) + .. _whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 811e28b830921..b87e46f9b6648 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -476,10 +476,13 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 self.parser.double_converter = round_trip - elif float_precision == "high": + elif float_precision == "legacy": + self.parser.double_converter = xstrtod + elif float_precision == "high" or float_precision is None: self.parser.double_converter = precise_xstrtod else: - self.parser.double_converter = xstrtod + raise ValueError(f'Unrecognized float_precision option: ' + f'{float_precision}') if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4c619a636f057..fe94a722f2ec8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -338,9 +338,9 @@ option can improve performance because there is no longer any I/O overhead. float_precision : str, optional Specifies which converter the C engine should use for floating-point - values. The options are `None` for the ordinary converter, - `high` for the high-precision converter, and `round_trip` for the - round-trip converter. + values. The options are `None` or `high` for the ordinary converter, + `legacy` for the original lower precision pandas converter, and + `round_trip` for the round-trip converter. Returns ------- @@ -2299,6 +2299,7 @@ def TextParser(*args, **kwds): values. The options are None for the ordinary converter, 'high' for the high-precision converter, and 'round_trip' for the round-trip converter. + .. versionchanged:: 1.2 """ kwds["engine"] = "python" return TextFileReader(*args, **kwds) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 50d5fb3e49c2a..7c58afe867440 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only): # 25 decimal digits of precision text = f"a\n{num:.25}" - normal_val = float(parser.read_csv(StringIO(text))["a"][0]) + normal_val = float( + parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] + ) precise_val = float( parser.read_csv(StringIO(text), float_precision="high")["a"][0] ) @@ -608,7 +610,7 @@ def test_unix_style_breaks(c_parser_only): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -646,7 +648,7 @@ def test_1000_sep_with_decimal( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "value,expected", [ @@ -702,3 +704,22 @@ def test_1000_sep_decimal_float_precision( ) val = df.iloc[0, 0] assert val == expected + + +def test_float_precision_options(c_parser_only): + # GH 17154, 36228 + parser = c_parser_only + s = "foo\n243.164\n" + df = parser.read_csv(StringIO(s)) + df2 = parser.read_csv(StringIO(s), float_precision="high") + + tm.assert_frame_equal(df, df2) + + df3 = parser.read_csv(StringIO(s), float_precision="legacy") + + assert not df.iloc[0, 0] == df3.iloc[0, 0] + + msg = "Unrecognized float_precision option: junk" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(s), float_precision="junk")