From acffef2ee184251f25cbe05a1ecd1f8c184fb3ca Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 13:32:44 -0400 Subject: [PATCH 1/7] change read_csv and read_table to use high precision by default --- pandas/io/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a0466c5ac6b57..90a2105eb93d5 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -589,7 +589,7 @@ def read_csv( delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], memory_map=False, - float_precision=None, + float_precision="high", storage_options: StorageOptions = None, ): # gh-23761 @@ -747,7 +747,7 @@ def read_table( delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], memory_map=False, - float_precision=None, + float_precision="high", ): return read_csv(**locals()) From 68ecda33e75768a168cf13860985f031f7933ffb Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 15:33:19 -0400 Subject: [PATCH 2/7] Modify test, whatsnew --- doc/source/whatsnew/v1.2.0.rst | 11 +++++++++++ pandas/tests/io/parser/test_c_parser_only.py | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..797341877bd54 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -96,6 +96,17 @@ For example: buffer = io.BytesIO() data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") +.. _whatsnew_read_csv_table_precision_default: + +Change in default floating precision for ``read_csv`` and ``read_table`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` previously defaulted to a parser that +could read floating point numbers slightly incorrectly with respect to the last bit in precision. 
+The option ``float_precision="high"`` has always been available to avoid this issue. +Beginning with this version, the default is now to use the more accurate parser by making +``float_precision="high"`` the default, with no impact on performance. (:issue:`17154`) + .. _whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 50d5fb3e49c2a..042c2baf90450 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only): # 25 decimal digits of precision text = f"a\n{num:.25}" - normal_val = float(parser.read_csv(StringIO(text))["a"][0]) + normal_val = float( + parser.read_csv(StringIO(text), float_precision=None)["a"][0] + ) precise_val = float( parser.read_csv(StringIO(text), float_precision="high")["a"][0] ) From 9aa25daea343fdb13d576e5ddc3a63dbad1bab56 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 16:43:24 -0400 Subject: [PATCH 3/7] add legacy option for float_precision for C parser --- doc/source/whatsnew/v1.2.0.rst | 4 +++- pandas/_libs/parsers.pyx | 6 +++--- pandas/io/parsers.py | 10 +++++----- pandas/tests/io/parser/test_c_parser_only.py | 21 +++++++++++++++++--- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 797341877bd54..8fecf4f864b97 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -105,7 +105,9 @@ For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` pr could read floating point numbers slightly incorrectly with respect to the last bit in precision. The option ``float_precision="high"`` has always been available to avoid this issue. 
Beginning with this version, the default is now to use the more accurate parser by making -``float_precision="high"`` the default, with no impact on performance. (:issue:`17154`) +``float_precision=None`` correspond to the high precision parser, and the new option +``float_precision="legacy"`` to use the legacy parser. The change to using the higher precision +parser by default should have no impact on performance. (:issue:`17154`) .. _whatsnew_120.enhancements.other: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 811e28b830921..efb58e33777d7 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -476,10 +476,10 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 self.parser.double_converter = round_trip - elif float_precision == "high": - self.parser.double_converter = precise_xstrtod - else: + elif float_precision == "legacy": self.parser.double_converter = xstrtod + else: # float_precision == "high" or float_precision is None: + self.parser.double_converter = precise_xstrtod if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 90a2105eb93d5..1c177c88b0052 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -338,9 +338,9 @@ option can improve performance because there is no longer any I/O overhead. float_precision : str, optional Specifies which converter the C engine should use for floating-point - values. The options are `None` for the ordinary converter, - `high` for the high-precision converter, and `round_trip` for the - round-trip converter. + values. The options are `None` or `high` for the ordinary converter, + `legacy` for the original lower precision pandas converter, and + `round_trip` for the round-trip converter. 
Returns ------- @@ -589,7 +589,7 @@ def read_csv( delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], memory_map=False, - float_precision="high", + float_precision=None, storage_options: StorageOptions = None, ): # gh-23761 @@ -747,7 +747,7 @@ def read_table( delim_whitespace=False, low_memory=_c_parser_defaults["low_memory"], memory_map=False, - float_precision="high", + float_precision=None, ): return read_csv(**locals()) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 042c2baf90450..22d5ef22b2059 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -161,7 +161,7 @@ def test_precise_conversion(c_parser_only): text = f"a\n{num:.25}" normal_val = float( - parser.read_csv(StringIO(text), float_precision=None)["a"][0] + parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] ) precise_val = float( parser.read_csv(StringIO(text), float_precision="high")["a"][0] @@ -610,7 +610,7 @@ def test_unix_style_breaks(c_parser_only): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -648,7 +648,7 @@ def test_1000_sep_with_decimal( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "value,expected", [ @@ -704,3 +704,18 @@ def test_1000_sep_decimal_float_precision( ) val = df.iloc[0, 0] assert val == expected + + +def test_high_is_default(c_parser_only): + # GH 17154, 36228 + parser = c_parser_only + s = "foo\n243.164\n" + df = parser.read_csv(StringIO(s)) + df2 = parser.read_csv(StringIO(s), float_precision="high") + + tm.assert_frame_equal(df, df2) + + df3 = 
parser.read_csv(StringIO(s), float_precision="legacy") + + assert not df.iloc[0, 0] == df3.iloc[0, 0] + From afaf031d7f19929302ab53d0e7b2df7bdb9f5a23 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 16:45:20 -0400 Subject: [PATCH 4/7] remove blank line in tst file --- pandas/tests/io/parser/test_c_parser_only.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 22d5ef22b2059..953b936426602 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -718,4 +718,3 @@ def test_high_is_default(c_parser_only): df3 = parser.read_csv(StringIO(s), float_precision="legacy") assert not df.iloc[0, 0] == df3.iloc[0, 0] - From fa97aabb0d7d756316dfea5d40835e93baf99513 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 17:16:44 -0400 Subject: [PATCH 5/7] two spaces before inline comment --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index efb58e33777d7..eb0aa155d1c4a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -478,7 +478,7 @@ cdef class TextReader: self.parser.double_converter = round_trip elif float_precision == "legacy": self.parser.double_converter = xstrtod - else: # float_precision == "high" or float_precision is None: + else: # float_precision == "high" or float_precision is None: self.parser.double_converter = precise_xstrtod if isinstance(dtype, dict): From 7f4cf45066f259b29d39267d25431f535dc9ad20 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Fri, 11 Sep 2020 18:08:09 -0400 Subject: [PATCH 6/7] add test for invalid float_precision option --- pandas/_libs/parsers.pyx | 5 ++++- pandas/io/parsers.py | 1 + pandas/tests/io/parser/test_c_parser_only.py | 7 ++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx 
b/pandas/_libs/parsers.pyx index eb0aa155d1c4a..b87e46f9b6648 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -478,8 +478,11 @@ cdef class TextReader: self.parser.double_converter = round_trip elif float_precision == "legacy": self.parser.double_converter = xstrtod - else: # float_precision == "high" or float_precision is None: + elif float_precision == "high" or float_precision is None: self.parser.double_converter = precise_xstrtod + else: + raise ValueError(f'Unrecognized float_precision option: ' + f'{float_precision}') if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1c177c88b0052..7f6739d6d0022 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2299,6 +2299,7 @@ def TextParser(*args, **kwds): values. The options are None for the ordinary converter, 'high' for the high-precision converter, and 'round_trip' for the round-trip converter. + .. versionchanged:: 1.1.2 """ kwds["engine"] = "python" return TextFileReader(*args, **kwds) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 953b936426602..7c58afe867440 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -706,7 +706,7 @@ def test_1000_sep_decimal_float_precision( assert val == expected -def test_high_is_default(c_parser_only): +def test_float_precision_options(c_parser_only): # GH 17154, 36228 parser = c_parser_only s = "foo\n243.164\n" @@ -718,3 +718,8 @@ def test_high_is_default(c_parser_only): df3 = parser.read_csv(StringIO(s), float_precision="legacy") assert not df.iloc[0, 0] == df3.iloc[0, 0] + + msg = "Unrecognized float_precision option: junk" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(s), float_precision="junk") From be5910dcdd330b89f8f05620d477e58cce8e92be Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Fri, 11 Sep 2020 18:09:15 -0400 
Subject: [PATCH 7/7] correct versionadded for 1.2 --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7637f28e9ae49..fe94a722f2ec8 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2299,7 +2299,7 @@ def TextParser(*args, **kwds): values. The options are None for the ordinary converter, 'high' for the high-precision converter, and 'round_trip' for the round-trip converter. - .. versionchanged:: 1.1.2 + .. versionchanged:: 1.2 """ kwds["engine"] = "python" return TextFileReader(*args, **kwds)