From 1ba93e4c91eaf8ed8f7829ddce8df7a067e60f1d Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Fri, 30 Dec 2022 10:00:40 +0100
Subject: [PATCH 1/7] BUG: DataFrame constructor not tracking reference if
 called with df or mgr

---
 doc/source/whatsnew/v2.0.0.rst              |  4 ++++
 pandas/core/frame.py                        |  3 +++
 pandas/tests/copy_view/test_constructors.py | 25 +++++++++++++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 pandas/tests/copy_view/test_constructors.py

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index b1387e9717079..ca1b4bb093dd3 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -878,6 +878,10 @@ Indexing
 - Bug in :meth:`DataFrame.compare` does not recognize differences when comparing ``NA`` with value in nullable dtypes (:issue:`48939`)
 -
 
+Copy on write
+^^^^^^^^^^^^^
+- Bug in :class:`DataFrame` constructor not tracking reference if called with another :class:`DataFrame` (:issue:``)
+
 Missing
 ^^^^^^^
 - Bug in :meth:`Index.equals` raising ``TypeError`` when :class:`Index` consists of tuples that contain ``NA`` (:issue:`48446`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e671f45216968..f1e119b81a6d5 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -205,6 +205,7 @@
     to_arrays,
     treat_as_nested,
 )
+from pandas.core.internals.managers import _using_copy_on_write
 from pandas.core.reshape.melt import melt
 from pandas.core.series import Series
 from pandas.core.shared_docs import _shared_docs
@@ -643,6 +644,8 @@ def __init__(
         # -> use fastpath (without checking Manager type)
         if index is None and columns is None and dtype is None and not copy:
             # GH#33357 fastpath
+            if _using_copy_on_write():
+                data = data.copy(deep=False)
             NDFrame.__init__(self, data)
             return
 
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
new file mode 100644
index 0000000000000..e169bef916528
--- /dev/null
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -0,0 +1,25 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+@pytest.mark.parametrize("columns", [None, ["a"]])
+@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
+def test_dataframe_constructor_mgr(using_copy_on_write, func, columns):
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+
+    new_df = DataFrame(func(df))
+
+    assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
+    new_df.iloc[0] = 100
+
+    if using_copy_on_write:
+        assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
+        tm.assert_frame_equal(df, df_orig)
+    else:
+        assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
+        tm.assert_frame_equal(df, new_df)

From 24912bda509955c489ad90090175e48f11133e31 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Fri, 30 Dec 2022 10:02:13 +0100
Subject: [PATCH 2/7] Add gh ref

---
 doc/source/whatsnew/v2.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index ca1b4bb093dd3..7a87ea0b3872b 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -880,7 +880,7 @@ Indexing
 
 Copy on write
 ^^^^^^^^^^^^^
-- Bug in :class:`DataFrame` constructor not tracking reference if called with another :class:`DataFrame` (:issue:``)
+- Bug in :class:`DataFrame` constructor not tracking reference if called with another :class:`DataFrame` (:issue:`50499`)
 
 Missing
 ^^^^^^^

From 11a1db854d28396eca29c734c72bb72f23d8f764 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Fri, 30 Dec 2022 11:37:18 +0100
Subject: [PATCH 3/7] Fix tests

---
 pandas/tests/frame/methods/test_align.py | 7 +++++--
 pandas/tests/indexing/test_iloc.py       | 6 ++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py
index 88963dcc4b0f7..b1941dce53bf4 100644
--- a/pandas/tests/frame/methods/test_align.py
+++ b/pandas/tests/frame/methods/test_align.py
@@ -40,12 +40,15 @@ def test_frame_align_aware(self):
         assert new1.index.tz is timezone.utc
         assert new2.index.tz is timezone.utc
 
-    def test_align_float(self, float_frame):
+    def test_align_float(self, float_frame, using_copy_on_write):
         af, bf = float_frame.align(float_frame)
         assert af._mgr is not float_frame._mgr
 
         af, bf = float_frame.align(float_frame, copy=False)
-        assert af._mgr is float_frame._mgr
+        if using_copy_on_write:
+            assert not (af._mgr is float_frame._mgr)
+        else:
+            assert af._mgr is float_frame._mgr
 
         # axis = 0
         other = float_frame.iloc[:-5, :3]

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index 0f85cb4515e13..91f9ad3244f20 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -76,7 +76,9 @@ class TestiLocBaseIndependent:
         ],
     )
     @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
-    def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager):
+    def test_iloc_setitem_fullcol_categorical(
+        self, indexer, key, using_array_manager, using_copy_on_write
+    ):
         frame = DataFrame({0: range(3)}, dtype=object)
         cat = Categorical(["alpha", "beta", "gamma"])
 
@@ -90,7 +92,7 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage
         indexer(df)[key, 0] = cat
 
         expected = DataFrame({0: cat}).astype(object)
-        if not using_array_manager:
+        if not using_array_manager and not using_copy_on_write:
             assert np.shares_memory(df[0].values, orig_vals)
 
         tm.assert_frame_equal(df, expected)

From bd304fc3f33b061a02290a8f882501cca68bdabf Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Fri, 30 Dec 2022 12:09:35 +0100
Subject: [PATCH 4/7] Restrict to dataframes, not managers

---
 pandas/core/frame.py                        | 4 ++--
 pandas/core/generic.py                      | 8 +++++++-
 pandas/tests/copy_view/test_constructors.py | 5 ++---
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f1e119b81a6d5..3f8f02aa657ed 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -638,14 +638,14 @@ def __init__(
 
         if isinstance(data, DataFrame):
             data = data._mgr
+            if not copy and _using_copy_on_write():
+                data = data.copy(deep=False)
 
         if isinstance(data, (BlockManager, ArrayManager)):
             # first check if a Manager is passed without any other arguments
             # -> use fastpath (without checking Manager type)
             if index is None and columns is None and dtype is None and not copy:
                 # GH#33357 fastpath
-                if _using_copy_on_write():
-                    data = data.copy(deep=False)
                 NDFrame.__init__(self, data)
                 return
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index c893e9ce3d9a9..9a0bb55debe10 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -159,6 +159,7 @@
     SingleArrayManager,
 )
 from pandas.core.internals.construction import mgr_to_mgr
+from pandas.core.internals.managers import _using_copy_on_write
 from pandas.core.missing import (
     clean_fill_method,
     clean_reindex_fill_method,
@@ -5285,7 +5286,12 @@ def _reindex_with_indexers(
             # If we've made a copy once, no need to make another one
             copy = False
 
-        if (copy or copy is None) and new_data is self._mgr:
+        if (
+            (copy or copy is None)
+            and new_data is self._mgr
+            or not copy
+            and _using_copy_on_write()
+        ):
             new_data = new_data.copy(deep=copy)
 
         return self._constructor(new_data).__finalize__(self)

diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
index e169bef916528..bc4c9d91aee18 100644
--- a/pandas/tests/copy_view/test_constructors.py
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -7,12 +7,11 @@
 
 
 @pytest.mark.parametrize("columns", [None, ["a"]])
-@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
-def test_dataframe_constructor_mgr(using_copy_on_write, func, columns):
+def test_dataframe_constructor_mgr(using_copy_on_write, columns):
     df = DataFrame({"a": [1, 2, 3]})
     df_orig = df.copy()
 
-    new_df = DataFrame(func(df))
+    new_df = DataFrame(df)
 
     assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
     new_df.iloc[0] = 100

From c51a2046ff09c3499ff0a9a6e9f2c995d8f109c1 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Fri, 30 Dec 2022 14:03:38 +0100
Subject: [PATCH 5/7] Fix for now

---
 pandas/tests/frame/methods/test_align.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py
index b1941dce53bf4..89da9017e43af 100644
--- a/pandas/tests/frame/methods/test_align.py
+++ b/pandas/tests/frame/methods/test_align.py
@@ -40,13 +40,13 @@ def test_frame_align_aware(self):
         assert new1.index.tz is timezone.utc
         assert new2.index.tz is timezone.utc
 
-    def test_align_float(self, float_frame, using_copy_on_write):
+    def test_align_float(self, float_frame, using_copy_on_write, using_array_manager):
         af, bf = float_frame.align(float_frame)
         assert af._mgr is not float_frame._mgr
 
         af, bf = float_frame.align(float_frame, copy=False)
-        if using_copy_on_write:
-            assert not (af._mgr is float_frame._mgr)
+        if using_copy_on_write or using_array_manager:
+            assert af._mgr is not float_frame._mgr
         else:
             assert af._mgr is float_frame._mgr

From f221e0c3ca0498348f472508c2b5fb2aef365262 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Sat, 7 Jan 2023 22:53:47 +0100
Subject: [PATCH 6/7] Merge remote-tracking branch 'upstream/main' into
 cow_mgr_constructor

# Conflicts:
#	pandas/core/generic.py
---
 .github/workflows/codeql.yml | 3 +
 .github/workflows/macos-windows.yml | 2 +-
 .github/workflows/ubuntu.yml | 11 +-
 .github/workflows/wheels.yml | 3 +
 .pre-commit-config.yaml | 140 ++++-
 asv_bench/benchmarks/pandas_vb_common.py | 2 +-
 asv_bench/benchmarks/rolling.py | 5 +-
 asv_bench/benchmarks/series_methods.py | 19 +
 ci/deps/actions-310.yaml | 2 +-
 ci/deps/actions-38-downstream_compat.yaml | 2 +-
 ci/deps/actions-38.yaml | 2 +-
 ci/deps/actions-39.yaml | 2 +-
 ci/deps/circle-38-arm64.yaml | 2 +-
 ci/run_tests.sh | 7 +
 doc/scripts/eval_performance.py | 3 +-
 .../development/contributing_codebase.rst | 2 +-
 doc/source/reference/arrays.rst | 31 ++
 doc/source/user_guide/io.rst | 17 +-
 doc/source/whatsnew/v1.4.0.rst | 2 +-
 doc/source/whatsnew/v1.5.0.rst | 46 ++
 doc/source/whatsnew/v2.0.0.rst | 24 +-
 environment.yml | 3 +-
 pandas/__init__.py | 2 +-
 pandas/_libs/algos.pyx | 53 +-
 pandas/_libs/lib.pyi | 3 +
 pandas/_libs/lib.pyx | 31 +-
 pandas/_libs/ops.pyx | 2 +-
 pandas/_libs/tslibs/np_datetime.pxd | 6 +
 pandas/_libs/tslibs/np_datetime.pyx | 10 +-
 pandas/_libs/tslibs/offsets.pyx | 9 +-
 pandas/_libs/tslibs/parsing.pyx | 10 +-
 .../tslibs/src/datetime/np_datetime_strings.c | 147 ++++--
 .../tslibs/src/datetime/np_datetime_strings.h | 17 +-
 pandas/_libs/tslibs/strptime.pyx | 68 +--
 pandas/_libs/tslibs/timestamps.pyx | 7 +-
 pandas/_libs/tslibs/tzconversion.pyx | 26 +-
 pandas/_testing/__init__.py | 2 +-
 pandas/_typing.py | 5 +
 pandas/compat/__init__.py | 8 +-
 .../{_compressors.py => compressors.py} | 0
 pandas/core/arrays/arrow/array.py | 40 +-
 pandas/core/arrays/base.py | 2 +-
 pandas/core/arrays/datetimelike.py | 133 +++--
 pandas/core/arrays/datetimes.py | 25 +-
 pandas/core/arrays/interval.py | 135 +++--
 pandas/core/arrays/period.py | 4 +-
 pandas/core/arrays/string_.py | 24 +-
 pandas/core/arrays/string_arrow.py | 6 +-
 pandas/core/base.py | 19 +-
 pandas/core/dtypes/common.py | 2 +
 pandas/core/groupby/generic.py | 155 +++++-
 pandas/core/groupby/groupby.py | 88 ++--
 pandas/core/groupby/grouper.py | 35 +-
 pandas/core/groupby/ops.py | 6 +-
 pandas/core/indexes/base.py | 6 +-
 pandas/core/indexes/datetimelike.py | 12 +-
 pandas/core/indexes/multi.py | 8 +-
 pandas/core/indexes/range.py | 7 +-
 pandas/core/interchange/dataframe.py | 8 +-
 pandas/core/{ => methods}/describe.py | 0
 pandas/core/resample.py | 6 +
 pandas/core/series.py | 13 +-
 pandas/core/shared_docs.py | 8 +-
 pandas/core/strings/base.py | 2 +-
 pandas/core/strings/object_array.py | 2 +-
 pandas/core/tools/datetimes.py | 2 +-
 pandas/core/window/doc.py | 8 +-
 pandas/core/window/ewm.py | 2 +-
 pandas/core/window/rolling.py | 33 +-
 pandas/io/_util.py | 23 +
 pandas/io/common.py | 8 +-
 pandas/io/formats/printing.py | 8 +-
 pandas/io/json/_json.py | 4 +-
 pandas/io/orc.py | 24 +-
 pandas/io/parquet.py | 19 +-
 pandas/io/xml.py | 28 +-
 pandas/plotting/_matplotlib/hist.py | 17 +-
 pandas/tests/api/test_api.py | 2 +-
 pandas/tests/arithmetic/test_timedelta64.py | 10 +-
 pandas/tests/arrays/boolean/test_function.py | 2 +-
 pandas/tests/copy_view/test_methods.py | 244 +++++++++
 pandas/tests/dtypes/test_common.py | 18 +
 pandas/tests/extension/base/setitem.py | 8 +
 pandas/tests/extension/test_arrow.py | 45 +-
 pandas/tests/extension/test_string.py | 17 +
 pandas/tests/frame/indexing/test_mask.py | 11 +
 pandas/tests/frame/indexing/test_where.py | 6 +-
 pandas/tests/frame/methods/test_asfreq.py | 3 +-
 pandas/tests/frame/methods/test_asof.py | 8 +-
 .../tests/frame/methods/test_combine_first.py | 6 +-
 pandas/tests/frame/methods/test_equals.py | 3 +-
 pandas/tests/frame/methods/test_isetitem.py | 37 ++
 pandas/tests/frame/methods/test_truncate.py | 6 -
 pandas/tests/frame/test_query_eval.py | 6 +-
 pandas/tests/frame/test_reductions.py | 4 +
 pandas/tests/frame/test_stack_unstack.py | 6 +
 .../tests/groupby/aggregate/test_aggregate.py | 15 +
 .../tests/groupby/test_frame_value_counts.py | 38 ++
 pandas/tests/groupby/test_groupby.py | 10 +
 pandas/tests/groupby/test_timegrouper.py | 2 +
 pandas/tests/groupby/test_value_counts.py | 9 +-
 .../tests/groupby/transform/test_transform.py | 2 -
 .../indexes/datetimes/test_constructors.py | 9 +-
 .../indexing/multiindex/test_multiindex.py | 78 +--
 pandas/tests/io/formats/style/test_style.py | 4 +-
 .../json/test_json_table_schema_ext_dtype.py | 10 +-
 pandas/tests/io/parser/test_parse_dates.py | 2 +-
 pandas/tests/io/pytables/test_round_trip.py | 9 +-
 pandas/tests/io/pytables/test_store.py | 2 +-
 pandas/tests/io/test_common.py | 4 +-
 pandas/tests/io/test_compression.py | 2 +-
 pandas/tests/io/test_orc.py | 58 ++-
 pandas/tests/io/test_pickle.py | 4 +-
 pandas/tests/io/xml/test_xml.py | 82 ++-
 pandas/tests/libs/test_lib.py | 21 +
pandas/tests/plotting/test_hist_method.py | 30 ++ .../tests/resample/test_resampler_grouper.py | 22 + .../scalar/timestamp/test_constructors.py | 5 +- .../tests/scalar/timestamp/test_timestamp.py | 9 + pandas/tests/series/methods/test_replace.py | 3 +- pandas/tests/series/methods/test_to_numpy.py | 17 + .../tests/series/methods/test_tz_localize.py | 10 +- pandas/tests/tools/test_to_datetime.py | 54 ++ .../tseries/offsets/test_business_hour.py | 12 + pandas/tests/tslibs/test_parsing.py | 3 +- pandas/tests/window/test_api.py | 19 +- pandas/tests/window/test_dtypes.py | 2 +- pandas/tests/window/test_ewm.py | 8 +- pandas/tests/window/test_groupby.py | 39 +- pandas/tests/window/test_numba.py | 22 +- requirements-dev.txt | 3 +- ...check_for_inconsistent_pandas_namespace.py | 142 +++++ scripts/sync_flake8_versions.py | 13 +- .../test_inconsistent_namespace_check.py | 61 +++ scripts/tests/test_sync_flake8_versions.py | 3 - .../tests/test_validate_unwanted_patterns.py | 419 +++++++++++++++ scripts/validate_unwanted_patterns.py | 488 ++++++++++++++++++ setup.cfg | 16 - 138 files changed, 3156 insertions(+), 705 deletions(-) rename pandas/compat/{_compressors.py => compressors.py} (100%) rename pandas/core/{ => methods}/describe.py (100%) create mode 100644 pandas/io/_util.py create mode 100644 pandas/tests/frame/methods/test_isetitem.py create mode 100644 pandas/tests/series/methods/test_to_numpy.py create mode 100644 scripts/check_for_inconsistent_pandas_namespace.py create mode 100644 scripts/tests/test_inconsistent_namespace_check.py create mode 100644 scripts/tests/test_validate_unwanted_patterns.py create mode 100755 scripts/validate_unwanted_patterns.py diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 05a5d003c1dd1..23609f692df7c 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -8,6 +8,9 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + jobs: analyze: runs-on: ubuntu-22.04 diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index 5efc1aa67b4cd..d762e20db196a 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -16,7 +16,7 @@ env: PANDAS_CI: 1 PYTEST_TARGET: pandas PATTERN: "not slow and not db and not network and not single_cpu" - TEST_ARGS: "-W error:::pandas" + ERROR_ON_WARNINGS: "1" permissions: diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 7dbf74278d433..9c93725ea15ec 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -38,7 +38,7 @@ jobs: - name: "Minimum Versions" env_file: actions-38-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" - test_args: "" + error_on_warnings: "0" - name: "Locale: it_IT" env_file: actions-38.yaml pattern: "not slow and not network and not single_cpu" @@ -63,20 +63,22 @@ jobs: env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" - test_args: "" + error_on_warnings: "0" - name: "Data Manager" env_file: actions-38.yaml pattern: "not slow and not network and not single_cpu" pandas_data_manager: "array" - test_args: "" + error_on_warnings: "0" - name: "Pypy" env_file: actions-pypy-38.yaml pattern: "not slow and not network and not single_cpu" test_args: "--max-worker-restart 0" + error_on_warnings: "0" - name: "Numpy Dev" env_file: actions-310-numpydev.yaml pattern: "not slow 
and not network and not single_cpu" test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" + error_on_warnings: "0" exclude: - env_file: actions-38.yaml pyarrow_version: "7" @@ -96,11 +98,12 @@ jobs: ENV_FILE: ci/deps/${{ matrix.env_file }} PATTERN: ${{ matrix.pattern }} EXTRA_APT: ${{ matrix.extra_apt || '' }} + ERROR_ON_WARNINGS: ${{ matrix.error_on_warnings || '1' }} LANG: ${{ matrix.lang || '' }} LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }} PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} - TEST_ARGS: ${{ matrix.test_args || '-W error:::pandas' }} + TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 49d29c91f86cd..0e347b166e425 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -30,6 +30,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true +permissions: + contents: read + jobs: build_wheels: name: Build wheel for ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6b531215813d3..82043f79643e4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: types_or: [python, rst, markdown] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.9.1 + rev: v0.10.1 hooks: - id: cython-lint - id: double-quote-cython-strings @@ -63,25 +63,23 @@ repos: '--extensions=c,h', '--headers=h', --recursive, - '--filter=-readability/casting,-runtime/int,-build/include_subdir' + --linelength=88, + '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] - repo: https://github.com/PyCQA/flake8 rev: 6.0.0 hooks: - id: flake8 - # Need to patch os.remove rule in pandas-dev-flaker - exclude: ^ci/fix_wheels.py additional_dependencies: &flake8_dependencies - flake8==6.0.0 - flake8-bugbear==22.7.1 - - pandas-dev-flaker==0.5.0 - repo: https://github.com/pycqa/pylint - rev: v2.15.6 + rev: v2.15.9 hooks: - id: pylint stages: [manual] - repo: https://github.com/pycqa/pylint - rev: v2.15.6 + rev: v2.15.9 hooks: - id: pylint alias: redefined-outer-name @@ -94,15 +92,14 @@ repos: |^pandas/util/_test_decorators\.py # keep excluded |^pandas/_version\.py # keep excluded |^pandas/conftest\.py # keep excluded - |^pandas/core/generic\.py args: [--disable=all, --enable=redefined-outer-name] stages: [manual] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.11.4 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.2.2 + rev: v3.3.1 hooks: - id: pyupgrade args: [--py38-plus] @@ -183,6 +180,21 @@ repos: types: [rst] args: [--filename=*.rst] additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9] + - id: inconsistent-namespace-usage + name: 'Check for inconsistent use of pandas namespace' + entry: python scripts/check_for_inconsistent_pandas_namespace.py + exclude: ^pandas/core/interchange/ + language: python + types: [python] + - id: no-os-remove + name: Check code for instances of os.remove + entry: os\.remove + language: pygrep + types: [python] + files: ^pandas/tests/ + exclude: | + (?x)^ + pandas/tests/io/pytables/test_store\.py$ - id: unwanted-patterns 
name: Unwanted patterns language: pygrep @@ -192,6 +204,20 @@ repos: \#\ type:\ (?!ignore) |\#\ type:\s?ignore(?!\[) + # foo._class__ instead of type(foo) + |\.__class__ + + # np.bool/np.object instead of np.bool_/np.object_ + |np\.bool[^_8`] + |np\.object[^_8`] + + # imports from collections.abc instead of `from collections import abc` + |from\ collections\.abc\ import + + # Numpy + |from\ numpy\ import\ random + |from\ numpy\.random\ import + # Incorrect code-block / IPython directives |\.\.\ code-block\ :: |\.\.\ ipython\ :: @@ -200,7 +226,17 @@ repos: # Check for deprecated messages without sphinx directive |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) + + # {foo!r} instead of {repr(foo)} + |!r} + + # builtin filter function + |(?obj`, not ` obj` language: pygrep @@ -231,6 +267,58 @@ repos: files: ^pandas/tests/extension/base types: [python] exclude: ^pandas/tests/extension/base/base\.py + - id: unwanted-patterns-in-tests + name: Unwanted patterns in tests + language: pygrep + entry: | + (?x) + # pytest.xfail instead of pytest.mark.xfail + pytest\.xfail + + # imports from pandas._testing instead of `import pandas._testing as tm` + |from\ pandas\._testing\ import + |from\ pandas\ import\ _testing\ as\ tm + + # No direct imports from conftest + |conftest\ import + |import\ conftest + + # pandas.testing instead of tm + |pd\.testing\. + + # pd.api.types instead of from pandas.api.types import ... + |(pd|pandas)\.api\.types\. + + # np.testing, np.array_equal + |(numpy|np)(\.testing|\.array_equal) + + # unittest.mock (use pytest builtin monkeypatch fixture instead) + |(unittest(\.| import )mock|mock\.Mock\(\)|mock\.patch) + + # pytest raises without context + |\s\ pytest.raises + + # pytest.warns (use tm.assert_produces_warning instead) + |pytest\.warns + files: ^pandas/tests/ + types_or: [python, cython, rst] + - id: unwanted-patterns-in-ea-tests + name: Unwanted patterns in EA tests + language: pygrep + entry: | + (?x) + tm.assert_(series|frame)_equal + files: ^pandas/tests/extension/base/ + exclude: ^pandas/tests/extension/base/base\.py$ + types_or: [python, cython, rst] + - id: unwanted-patterns-in-cython + name: Unwanted patterns in Cython code + language: pygrep + entry: | + (?x) + # `obj` as opposed to ` obj` + [a-zA-Z0-9*]>[ ] + types: [cython] - id: pip-to-conda name: Generate pip dependency from conda language: python @@ -251,6 +339,38 @@ repos: language: python types: [rst] files: ^doc/source/(development|reference)/ + - id: unwanted-patterns-bare-pytest-raises + name: Check for use of bare pytest raises + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="bare_pytest_raises" + types: [python] + files: ^pandas/tests/ + exclude: ^pandas/tests/extension/ + - id: unwanted-patterns-private-function-across-module + name: Check for use of private functions across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" + types: [python] + exclude: ^(asv_bench|pandas/tests|doc)/ + - id: unwanted-patterns-private-import-across-module + name: Check for import of private attributes across modules + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" + types: [python] + exclude: | + (?x) + ^(asv_bench|pandas/tests|doc)/ + |scripts/validate_min_versions_in_sync\.py$ + - id: unwanted-patterns-strings-to-concatenate + name: Check for use of not concatenated strings + language: python + entry: python 
scripts/validate_unwanted_patterns.py --validation-type="strings_to_concatenate" + types_or: [python, cython] + - id: unwanted-patterns-strings-with-misplaced-whitespace + name: Check for strings with misplaced spaces + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" + types_or: [python, cython] - id: use-pd_array-in-core name: Import pandas.array as pd_array in core language: python diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index d3168bde0a783..97d91111e833a 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -70,7 +70,7 @@ class BaseIO: def remove(self, f): """Remove created files""" try: - os.remove(f) # noqa: PDF008 + os.remove(f) except OSError: # On Windows, attempting to remove a file that is in use # causes an exception to be raised diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index d65a1a39e8bc7..7e94763f3f293 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -292,7 +292,7 @@ class Groupby: ["sum", "median", "mean", "max", "min", "kurt", "sum"], [ ("rolling", {"window": 2}), - ("rolling", {"window": "30s", "on": "C"}), + ("rolling", {"window": "30s"}), ("expanding", {}), ], ) @@ -304,9 +304,10 @@ def setup(self, method, window_kwargs): { "A": [str(i) for i in range(N)] * 10, "B": list(range(N)) * 10, - "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10), } ) + if isinstance(kwargs.get("window", None), str): + df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10) self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) def time_method(self, method, window_kwargs): diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index dc86352082cca..a0dd52e9f17e4 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -382,4 +382,23 @@ def time_iter(self, dtype): pass +class ToNumpy: + def setup(self): + N = 1_000_000 + self.ser = Series( + np.random.randn( + N, + ) + ) + + def time_to_numpy(self): + self.ser.to_numpy() + + def time_to_numpy_double_copy(self): + self.ser.to_numpy(dtype="float64", copy=True) + + def time_to_numpy_copy(self): + self.ser.to_numpy(copy=True) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index d787571d9d112..79457cd503876 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -48,7 +48,7 @@ dependencies: - pyxlsb - s3fs>=2021.08.0 - scipy - - sqlalchemy + - sqlalchemy<1.4.46 - tabulate - tzdata>=2022a - xarray diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index 95ec98d72ebcc..6955baa282274 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -48,7 +48,7 @@ dependencies: - pyxlsb - s3fs>=2021.08.0 - scipy - - sqlalchemy + - sqlalchemy<1.4.46 - tabulate - xarray - xlrd diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index f7de8bbee7d8a..004ef93606457 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -48,7 +48,7 @@ dependencies: - pyxlsb - s3fs>=2021.08.0 - scipy - - sqlalchemy + - sqlalchemy<1.4.46 - tabulate - xarray - xlrd diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 821ec9c5d4234..ec7ffebde964f 100644 --- 
a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -48,7 +48,7 @@ dependencies: - pyxlsb - s3fs>=2021.08.0 - scipy - - sqlalchemy + - sqlalchemy<1.4.46 - tabulate - tzdata>=2022a - xarray diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index c94ce79ea2ff8..b4171710564bf 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -49,7 +49,7 @@ dependencies: - pyxlsb - s3fs>=2021.08.0 - scipy - - sqlalchemy + - sqlalchemy<1.4.46 - tabulate - xarray - xlrd diff --git a/ci/run_tests.sh b/ci/run_tests.sh index e6de5caf955fc..a48d6c1ad6580 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -30,6 +30,13 @@ if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi +if [[ "$ERROR_ON_WARNINGS" == "1" ]]; then + for pth in $(find pandas -name '*.py' -not -path "pandas/tests/*" | sed -e 's/\.py//g' -e 's/\/__init__//g' -e 's/\//./g'); + do + PYTEST_CMD="$PYTEST_CMD -W error:::$pth" + done +fi + echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py index 85d9ce4ad01e9..f6087e02a9330 100644 --- a/doc/scripts/eval_performance.py +++ b/doc/scripts/eval_performance.py @@ -6,8 +6,7 @@ from pandas import DataFrame setup_common = """from pandas import DataFrame -from numpy.random import randn -df = DataFrame(randn(%d, 3), columns=list('abc')) +df = DataFrame(np.random.randn(%d, 3), columns=list('abc')) %s""" setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index b05f026bbbb44..449b6de36cd24 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -43,7 +43,7 @@ Pre-commit ---------- Additionally, :ref:`Continuous Integration ` will run code formatting checks -like ``black``, ``flake8`` (including a `pandas-dev-flaker `_ plugin), +like ``black``, ``flake8``, ``isort``, and ``cpplint`` and more using `pre-commit hooks `_ Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, it is helpful to run the check yourself before submitting code. This diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 5b41de4e12e6f..aeaca7caea25d 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -60,6 +60,37 @@ is an :class:`ArrowDtype`. `Pyarrow `__ provides similar array and `data type `__ support as NumPy including first-class nullability support for all data types, immutability and more. +The table below shows the equivalent pyarrow-backed (``pa``), pandas extension, and numpy (``np``) types that are recognized by pandas. +Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. 
``pd.ArrowDtype(pa.bool_())`` + +=============================================== ========================== =================== +PyArrow type pandas extension type NumPy type +=============================================== ========================== =================== +:external+pyarrow:py:func:`pyarrow.bool_` :class:`BooleanDtype` ``np.bool_`` +:external+pyarrow:py:func:`pyarrow.int8` :class:`Int8Dtype` ``np.int8`` +:external+pyarrow:py:func:`pyarrow.int16` :class:`Int16Dtype` ``np.int16`` +:external+pyarrow:py:func:`pyarrow.int32` :class:`Int32Dtype` ``np.int32`` +:external+pyarrow:py:func:`pyarrow.int64` :class:`Int64Dtype` ``np.int64`` +:external+pyarrow:py:func:`pyarrow.uint8` :class:`UInt8Dtype` ``np.uint8`` +:external+pyarrow:py:func:`pyarrow.uint16` :class:`UInt16Dtype` ``np.uint16`` +:external+pyarrow:py:func:`pyarrow.uint32` :class:`UInt32Dtype` ``np.uint32`` +:external+pyarrow:py:func:`pyarrow.uint64` :class:`UInt64Dtype` ``np.uint64`` +:external+pyarrow:py:func:`pyarrow.float32` :class:`Float32Dtype` ``np.float32`` +:external+pyarrow:py:func:`pyarrow.float64` :class:`Float64Dtype` ``np.float64`` +:external+pyarrow:py:func:`pyarrow.time32` (none) (none) +:external+pyarrow:py:func:`pyarrow.time64` (none) (none) +:external+pyarrow:py:func:`pyarrow.timestamp` :class:`DatetimeTZDtype` ``np.datetime64`` +:external+pyarrow:py:func:`pyarrow.date32` (none) (none) +:external+pyarrow:py:func:`pyarrow.date64` (none) (none) +:external+pyarrow:py:func:`pyarrow.duration` (none) ``np.timedelta64`` +:external+pyarrow:py:func:`pyarrow.binary` (none) (none) +:external+pyarrow:py:func:`pyarrow.string` :class:`StringDtype` ``np.str_`` +:external+pyarrow:py:func:`pyarrow.decimal128` (none) (none) +:external+pyarrow:py:func:`pyarrow.list_` (none) (none) +:external+pyarrow:py:func:`pyarrow.map_` (none) (none) +:external+pyarrow:py:func:`pyarrow.dictionary` :class:`CategoricalDtype` (none) +=============================================== ========================== =================== + .. note:: For string types (``pyarrow.string()``, ``string[pyarrow]``), PyArrow support is still facilitated diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 677be7bf29479..dc21b9f35d272 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1255,6 +1255,21 @@ The bad line will be a list of strings that was split by the ``sep``: .. versionadded:: 1.4.0 +Note that the callable function will handle only a line with too many fields. +Bad lines caused by other errors will be silently skipped. + +For example: + +.. code-block:: ipython + + def bad_lines_func(line): + print(line) + + data = 'name,type\nname a,a is of type a\nname b,"b\" is of type b"' + data + pd.read_csv(data, on_bad_lines=bad_lines_func, engine="python") + +The line was not processed in this case, as a "bad line" here is caused by an escape character. You can also use the ``usecols`` parameter to eliminate extraneous column data that appear in some lines but not others: @@ -3833,7 +3848,7 @@ OpenDocument Spreadsheets The io methods for `Excel files`_ also support reading and writing OpenDocument spreadsheets using the `odfpy `__ module. The semantics and features for reading and writing OpenDocument spreadsheets match what can be done for `Excel files`_ using -``engine='odf'``. +``engine='odf'``. The optional dependency 'odfpy' needs to be installed. 
The :func:`~pandas.read_excel` method can read OpenDocument spreadsheets diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5895a06792ffb..9dbe450261e54 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -320,7 +320,7 @@ Null-values are no longer coerced to NaN-value in value_counts and mode ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, -``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This +``NaT`` and other null-values to a NaN-value for ``np.object_``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`). diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index a1c374db91f8b..b61547d1523cf 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -290,6 +290,52 @@ and attributes without holding entire tree in memory (:issue:`45442`). .. _`lxml's iterparse`: https://lxml.de/3.2/parsing.html#iterparse-and-iterwalk .. _`etree's iterparse`: https://docs.python.org/3/library/xml.etree.elementtree.html#xml.etree.ElementTree.iterparse +.. _whatsnew_150.enhancements.copy_on_write: + +Copy on Write +^^^^^^^^^^^^^ + +A new feature ``copy_on_write`` was added (:issue:`46958`). Copy on write ensures that +any DataFrame or Series derived from another in any way always behaves as a copy. +Copy on write disallows updating any other object than the object the method +was applied to. + +Copy on write can be enabled through: + +.. code-block:: python + + pd.set_option("mode.copy_on_write", True) + pd.options.mode.copy_on_write = True + +Alternatively, copy on write can be enabled locally through: + +.. code-block:: python + + with pd.option_context("mode.copy_on_write", True): + ... + +Without copy on write, the parent :class:`DataFrame` is updated when updating a child +:class:`DataFrame` that was derived from this :class:`DataFrame`. + +.. ipython:: python + + df = pd.DataFrame({"foo": [1, 2, 3], "bar": 1}) + view = df["foo"] + view.iloc[0] + df + +With copy on write enabled, df won't be updated anymore: + +.. ipython:: python + + with pd.option_context("mode.copy_on_write", True): + df = pd.DataFrame({"foo": [1, 2, 3], "bar": 1}) + view = df["foo"] + view.iloc[0] + df + +A more detailed explanation can be found `here `_. + .. _whatsnew_150.enhancements.other: Other enhancements diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7a87ea0b3872b..ddcf516076a49 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -39,9 +39,11 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_fwf` * :func:`read_excel` * :func:`read_html` +* :func:`read_xml` * :func:`read_sql` * :func:`read_sql_query` * :func:`read_sql_table` +* :func:`read_orc` Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. @@ -49,6 +51,7 @@ to select the nullable dtypes implementation. 
* :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``) * :func:`read_excel` * :func:`read_html` +* :func:`read_xml` * :func:`read_parquet` * :func:`read_orc` @@ -82,7 +85,7 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` ( Other enhancements ^^^^^^^^^^^^^^^^^^ - :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`) -- :meth:`.DataFrameGroupBy.quantile` and :meth:`.SeriesGroupBy.quantile` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) +- :meth:`.DataFrameGroupBy.quantile`, :meth:`.SeriesGroupBy.quantile` and :meth:`.DataFrameGroupBy.std` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`) - :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`) - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) @@ -103,6 +106,7 @@ Other enhancements - :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`) - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) +- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - .. --------------------------------------------------------------------------- @@ -461,7 +465,7 @@ to each element individually, e.g. :: Other API changes ^^^^^^^^^^^^^^^^^ -- The ``freq``, ``tz``, ``nanosecond``, and ``unit`` keywords in the :class:`Timestamp` constructor are now keyword-only (:issue:`45307`) +- The ``freq``, ``tz``, ``nanosecond``, and ``unit`` keywords in the :class:`Timestamp` constructor are now keyword-only (:issue:`45307`, :issue:`32526`) - Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`) - :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` of now raises ``ParserError`` instead of ``IndexError`` when using the c parser. - Default value of ``dtype`` in :func:`get_dummies` is changed to ``bool`` from ``uint8`` (:issue:`45848`) @@ -725,6 +729,7 @@ Removal of prior version deprecations/changes - Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`) - Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`) - Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`) +- Enforced deprecation of silently dropping nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops. 
This will now raise a :class:`.errors.DataError` (:issue:`42834`) - Changed behavior in setting values with ``df.loc[:, foo] = bar`` or ``df.iloc[:, foo] = bar``, these now always attempt to set values inplace before falling back to casting (:issue:`45333`) - Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`) - Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`) @@ -754,6 +759,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.putmask` (:issue:`49830`) - Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`) - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`) +- Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) - Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) @@ -768,6 +774,7 @@ Performance improvements - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.__setitem__` when key is a null slice (:issue:`50248`) +- Performance improvement in :class:`~arrays.ArrowExtensionArray` comparison methods when array contains NA (:issue:`50524`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) @@ -775,6 +782,7 @@ Performance improvements - Performance improvement when iterating over pyarrow and nullable dtypes (:issue:`49825`, :issue:`49851`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) +- Performance improvement in :meth:`Series.to_numpy` if ``copy=True`` by avoiding copying twice (:issue:`24345`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) - Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). 
Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) @@ -799,6 +807,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :func:`pandas.infer_freq`, raising ``TypeError`` when inferred on :class:`RangeIndex` (:issue:`47084`) +- Bug in :func:`to_datetime` incorrectly raising ``OverflowError`` with string arguments corresponding to large integers (:issue:`50533`) - Bug in :func:`to_datetime` was raising on invalid offsets with ``errors='coerce'`` and ``infer_datetime_format=True`` (:issue:`48633`) - Bug in :class:`DatetimeIndex` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``dtype`` or data (:issue:`48659`) - Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`) @@ -818,7 +827,7 @@ Datetimelike - Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`) - Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`) - Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`) -- +- Bug in :func:`Timestamp.utctimetuple` raising a ``TypeError`` (:issue:`32174`) Timedelta ^^^^^^^^^ @@ -849,6 +858,7 @@ Conversion - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) +- Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - @@ -871,11 +881,14 @@ Indexing - Bug in :meth:`DataFrame.loc` coercing dtypes when setting values with a list indexer (:issue:`49159`) - Bug in :meth:`Series.loc` raising error for out of bounds end of slice indexer (:issue:`50161`) - Bug in :meth:`DataFrame.loc` raising ``ValueError`` with ``bool`` indexer and :class:`MultiIndex` (:issue:`47687`) +- Bug in :meth:`DataFrame.loc` raising ``IndexError`` when setting values for a pyarrow-backed column with a non-scalar indexer (:issue:`50085`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when right hand side is :class:`DataFrame` with :class:`MultiIndex` columns (:issue:`49121`) - Bug in :meth:`DataFrame.reindex` casting dtype to ``object`` when :class:`DataFrame` has single extension array column when re-indexing ``columns`` and ``index`` (:issue:`48190`) - Bug in :meth:`DataFrame.iloc` raising ``IndexError`` when indexer is a :class:`Series` with numeric extension array dtype (:issue:`49521`) - Bug in :func:`~DataFrame.describe` when formatting percentiles in the resulting index showed more decimals than needed (:issue:`46362`) - Bug in :meth:`DataFrame.compare` does not recognize differences when comparing ``NA`` with value in nullable dtypes (:issue:`48939`) +- Bug in :meth:`DataFrame.isetitem` coercing extension array dtypes in :class:`DataFrame` to 
object (:issue:`49922`) +- Bug in :class:`BusinessHour` would cause creation of :class:`DatetimeIndex` to fail when no opening hour was included in the index (:issue:`49835`) - Copy on write @@ -929,6 +942,7 @@ Period Plotting ^^^^^^^^ +- Bug in :meth:`DataFrame.plot.hist`, not dropping elements of ``weights`` corresponding to ``NaN`` values in ``data`` (:issue:`48884`) - ``ax.set_xlim`` was sometimes raising ``UserWarning`` which users couldn't address due to ``set_xlim`` not accepting parsing arguments - the converter now uses :func:`Timestamp` instead (:issue:`49148`) - @@ -948,6 +962,9 @@ Groupby/resample/rolling - Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`) - Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`) - Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`) +- Bug in :meth:`.DataFrameGrouBy.value_counts` would raise when used with a :class:`.TimeGrouper` (:issue:`50486`) +- Bug in :meth:`Resampler.size` caused a wide :class:`DataFrame` to be returned instead of a :class:`Series` with :class:`MultiIndex` (:issue:`46826`) +- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"idxmin"`` and ``"idxmax"`` arguments (:issue:`45986`) - Reshaping @@ -976,6 +993,7 @@ ExtensionArray - Bug in :meth:`Series.round` for pyarrow-backed dtypes raising ``AttributeError`` (:issue:`50437`) - Bug when concatenating an empty DataFrame with an ExtensionDtype to another DataFrame with the same ExtensionDtype, the resulting dtype turned into object (:issue:`48510`) - Bug in :meth:`array.PandasArray.to_numpy` raising with ``NA`` value when ``na_value`` is specified (:issue:`40638`) +- Bug in :meth:`api.types.is_numeric_dtype` where a custom :class:`ExtensionDtype` would not return ``True`` if ``_is_numeric`` returned ``True`` (:issue:`50563`) Styler ^^^^^^ diff --git a/environment.yml b/environment.yml index b6b8f7d6af1ba..96753f0f1c9b3 100644 --- a/environment.yml +++ b/environment.yml @@ -51,7 +51,7 @@ dependencies: - pyxlsb - s3fs>=2021.08.0 - scipy - - sqlalchemy + - sqlalchemy<1.4.46 - tabulate - tzdata>=2022a - xarray @@ -90,7 +90,6 @@ dependencies: - gitdb - natsort # DataFrame.sort_values doctest - numpydoc - - pandas-dev-flaker=0.5.0 - pydata-sphinx-theme<0.11 - pytest-cython # doctest - sphinx diff --git a/pandas/__init__.py b/pandas/__init__.py index 951cb38656d0b..048d20f0de72f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -135,7 +135,7 @@ ) from pandas import api, arrays, errors, io, plotting, tseries -from pandas import testing # noqa:PDF015 +from pandas import testing from pandas.util._print_versions import show_versions from pandas.io.api import ( diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7fcba58772ac4..77876d0c55337 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -647,40 +647,37 @@ def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None val = values[j, i] -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 
0 -D -""" - - @cython.boundscheck(False) @cython.wraparound(False) def backfill( ndarray[numeric_object_t] old, ndarray[numeric_object_t] new, limit=None -) -> ndarray: - # -> ndarray[intp_t, ndim=1] +) -> ndarray: # -> ndarray[intp_t, ndim=1] + """ + Backfilling logic for generating fill vector + + Diagram of what's going on + + Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 + A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + B B 1 1 + . 2 1 + . 2 1 + . 2 1 + C C 2 1 + . 0 + . 0 + D + """ cdef: Py_ssize_t i, j, nleft, nright ndarray[intp_t, ndim=1] indexer diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 9bc02e90ebb9e..2439082bf7413 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -240,3 +240,6 @@ def get_reverse_indexer( ) -> npt.NDArray[np.intp]: ... def is_bool_list(obj: list) -> bool: ... def dtypes_all_equal(types: list[DtypeObj]) -> bool: ... +def array_equal_fast( + left: np.ndarray, right: np.ndarray # np.ndarray[np.int64, ndim=1] +) -> bool: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc7b876cb5de8..89e02ac0fa86d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -50,6 +50,7 @@ from numpy cimport ( complex128_t, flatiter, float64_t, + int32_t, int64_t, intp_t, ndarray, @@ -642,6 +643,34 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool: return True +ctypedef fused int6432_t: + int64_t + int32_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def array_equal_fast( + ndarray[int6432_t, ndim=1] left, ndarray[int6432_t, ndim=1] right, +) -> bool: + """ + Perform an element by element comparison on 1-d integer arrays, meant for indexer + comparisons + """ + cdef: + Py_ssize_t i, n = left.size + + if left.size != right.size: + return False + + for i in range(n): + + if left[i] != right[i]: + return False + + return True + + ctypedef fused ndarr_object: ndarray[object, ndim=1] ndarray[object, ndim=2] @@ -1482,7 +1511,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return val if values.descr.type_num != NPY_OBJECT: - # i.e. values.dtype != np.object + # i.e. values.dtype != np.object_ # This should not be reached values = values.astype(object) diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 478e7eaee90c1..9154e836b3477 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -292,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr, result[i] = 1 elif val in false_vals: result[i] = 0 - elif is_nan(val): + elif is_nan(val) or val is None: mask[i] = 1 result[i] = 0 # Value here doesn't matter, will be replaced w/ nan has_na = True diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index de81c611c9ee9..492f45af09e80 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -120,3 +120,9 @@ cdef int64_t convert_reso( NPY_DATETIMEUNIT to_reso, bint round_ok, ) except? 
-1 + +cdef extern from "src/datetime/np_datetime_strings.h": + ctypedef enum FormatRequirement: + PARTIAL_MATCH + EXACT_MATCH + INFER_FORMAT diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 9db3f7cb4648e..b1e4022527437 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -53,7 +53,8 @@ cdef extern from "src/datetime/np_datetime_strings.h": npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, - const char *format, int format_len, int exact) + const char *format, int format_len, + FormatRequirement exact) # ---------------------------------------------------------------------- @@ -286,17 +287,20 @@ cdef int string_to_dts( const char* buf Py_ssize_t format_length const char* format_buf + FormatRequirement format_requirement buf = get_c_string_buf_and_size(val, &length) if format is None: format_buf = b"" format_length = 0 - exact = False + format_requirement = INFER_FORMAT else: format_buf = get_c_string_buf_and_size(format, &format_length) + format_requirement = exact return parse_iso_8601_datetime(buf, length, want_exc, dts, out_bestunit, out_local, out_tzoffset, - format_buf, format_length, exact) + format_buf, format_length, + format_requirement) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0bc9751694e9f..b59d17321d8bf 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1847,15 +1847,20 @@ cdef class BusinessHour(BusinessMixin): earliest_start = self.start[0] latest_start = self.start[-1] + if self.n == 0: + is_same_sign = sign > 0 + else: + is_same_sign = self.n * sign >= 0 + if not self.next_bday.is_on_offset(other): # today is not business day other = other + sign * self.next_bday - if self.n * sign >= 0: + if is_same_sign: hour, minute = earliest_start.hour, earliest_start.minute else: hour, minute = latest_start.hour, latest_start.minute else: - if self.n * sign >= 0: + if is_same_sign: if latest_start < other.time(): # current time is after latest starting time in today other = other + sign * self.next_bday diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index aa95febfc9721..9e05640723929 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -53,6 +53,7 @@ from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, c_nat_strings as nat_strings, ) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, npy_datetimestruct, @@ -298,6 +299,12 @@ def parse_datetime_string( # following may be raised from dateutil # TypeError: 'NoneType' object is not iterable raise ValueError(f'Given date string "{date_string}" not likely a datetime') + except OverflowError as err: + # with e.g. "08335394550" dateutil raises when trying to pass + # year=8335394550 to datetime.replace + raise OutOfBoundsDatetime( + f'Parsing "{date_string}" to datetime overflows' + ) from err return dt @@ -1005,7 +1012,8 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst): ) if (day_index < month_index) and not dayfirst: warnings.warn( - f"Parsing dates in {format} format when dayfirst=False was specified. " + f"Parsing dates in {format} format when dayfirst=False (the default) " + "was specified. 
" "Pass `dayfirst=True` or specify a format to silence this warning.", UserWarning, stacklevel=find_stack_level(), diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 7bb94012fad0c..f1f03e6467eac 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -67,42 +67,54 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. */ +typedef enum { + COMPARISON_SUCCESS, + COMPLETED_PARTIAL_MATCH, + COMPARISON_ERROR +} DatetimePartParseResult; // This function will advance the pointer on format // and decrement characters_remaining by n on success -// On failure will return -1 without incrementing -static int compare_format(const char **format, int *characters_remaining, - const char *compare_to, int n, const int exact) { +// On failure will return COMPARISON_ERROR without incrementing +// If `format_requirement` is PARTIAL_MATCH, and the `format` string has +// been exhausted, then return COMPLETED_PARTIAL_MATCH. +static DatetimePartParseResult compare_format( + const char **format, + int *characters_remaining, + const char *compare_to, + int n, + const FormatRequirement format_requirement +) { + if (format_requirement == INFER_FORMAT) { + return COMPARISON_SUCCESS; + } + if (*characters_remaining < 0) { + return COMPARISON_ERROR; + } + if (format_requirement == PARTIAL_MATCH && *characters_remaining == 0) { + return COMPLETED_PARTIAL_MATCH; + } if (*characters_remaining < n) { - if (exact) { - // TODO(pandas-dev): in the future we should set a PyErr here - // to be very clear about what went wrong - return -1; - } else if (*characters_remaining) { - // TODO(pandas-dev): same return value in this function as - // above branch, but stub out a future where - // we have a better error message - return -1; - } else { - return 0; - } + // TODO(pandas-dev): PyErr to differentiate what went wrong + return COMPARISON_ERROR; } else { if (strncmp(*format, compare_to, n)) { // TODO(pandas-dev): PyErr to differentiate what went wrong - return -1; + return COMPARISON_ERROR; } else { *format += n; *characters_remaining -= n; - return 0; + return COMPARISON_SUCCESS; } } - return 0; + return COMPARISON_SUCCESS; } int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, int *out_tzoffset, - const char* format, int format_len, int exact) { + const char* format, int format_len, + FormatRequirement format_requirement) { if (len < 0 || format_len < 0) goto parse_error; int year_leap = 0; @@ -110,6 +122,7 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, const char *substr; int sublen; NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC; + DatetimePartParseResult comparison; /* If year-month-day are separated by a valid separator, * months/days without leading zeroes will be parsed @@ -139,8 +152,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (compare_format(&format, &format_len, " ", 1, exact)) { + comparison = compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } } @@ -155,8 +171,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ - 
if (compare_format(&format, &format_len, "%Y", 2, exact)) { + comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } out->year = 0; @@ -202,8 +221,12 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ++substr; --sublen; - if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { + comparison = compare_format(&format, &format_len, &ymd_sep, 1, + format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { @@ -212,8 +235,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ - if (compare_format(&format, &format_len, "%m", 2, exact)) { + comparison = compare_format(&format, &format_len, "%m", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* First digit required */ out->month = (*substr - '0'); @@ -258,14 +284,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; - if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { + comparison = compare_format(&format, &format_len, &ymd_sep, 1, + format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } } /* PARSE THE DAY */ - if (compare_format(&format, &format_len, "%d", 2, exact)) { + comparison = compare_format(&format, &format_len, "%d", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* First digit required */ if (!isdigit(*substr)) { @@ -306,15 +339,21 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } - if (compare_format(&format, &format_len, substr, 1, exact)) { - goto parse_error; - } + comparison = compare_format(&format, &format_len, substr, 1, format_requirement); + if (comparison == COMPARISON_ERROR) { + goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; + } ++substr; --sublen; /* PARSE THE HOURS */ - if (compare_format(&format, &format_len, "%H", 2, exact)) { + comparison = compare_format(&format, &format_len, "%H", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* First digit required */ if (!isdigit(*substr)) { @@ -359,8 +398,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } - if (compare_format(&format, &format_len, ":", 1, exact)) { + comparison = compare_format(&format, &format_len, ":", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { @@ -370,8 +412,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ - if (compare_format(&format, &format_len, "%M", 2, exact)) { + comparison = compare_format(&format, &format_len, "%M", 2, format_requirement); + if (comparison == COMPARISON_ERROR) 
{ goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* First digit required */ out->min = (*substr - '0'); @@ -405,8 +450,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* If we make it through this condition block, then the next * character is a digit. */ if (has_hms_sep && *substr == ':') { - if (compare_format(&format, &format_len, ":", 1, exact)) { + comparison = compare_format(&format, &format_len, ":", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } ++substr; --sublen; @@ -420,8 +468,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ - if (compare_format(&format, &format_len, "%S", 2, exact)) { + comparison = compare_format(&format, &format_len, "%S", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* First digit required */ out->sec = (*substr - '0'); @@ -448,8 +499,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; - if (compare_format(&format, &format_len, ".", 1, exact)) { + comparison = compare_format(&format, &format_len, ".", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } } else { bestunit = NPY_FR_s; @@ -457,8 +511,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MICROSECONDS (0 to 6 digits) */ - if (compare_format(&format, &format_len, "%f", 2, exact)) { + comparison = compare_format(&format, &format_len, "%f", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } numdigits = 0; for (i = 0; i < 6; ++i) { @@ -524,8 +581,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if (compare_format(&format, &format_len, " ", 1, exact)) { + comparison = compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } } @@ -539,8 +599,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, /* UTC specifier */ if (*substr == 'Z') { - if (compare_format(&format, &format_len, "%z", 2, exact)) { + comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { @@ -561,8 +624,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, --sublen; } } else if (*substr == '-' || *substr == '+') { - if (compare_format(&format, &format_len, "%z", 2, exact)) { + comparison = compare_format(&format, &format_len, "%z", 2, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -647,8 +713,11 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; - if 
(compare_format(&format, &format_len, " ", 1, exact)) { + comparison = compare_format(&format, &format_len, " ", 1, format_requirement); + if (comparison == COMPARISON_ERROR) { goto parse_error; + } else if (comparison == COMPLETED_PARTIAL_MATCH) { + goto finish; } } diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 734f7daceba05..a635192d70809 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -26,6 +26,21 @@ This file implements string parsing and creation for NumPy datetime. #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API +/* 'format_requirement' can be one of three values: + * * PARTIAL_MATCH : Only require a partial match with 'format'. + * For example, if the string is '2020-01-01 05:00:00' and + * 'format' is '%Y-%m-%d', then parse '2020-01-01'; + * * EXACT_MATCH : require an exact match with 'format'. If the + * string is '2020-01-01', then the only format which will + * be able to parse it without error is '%Y-%m-%d'; + * * INFER_FORMAT: parse without comparing 'format' (i.e. infer it). + */ +typedef enum { + PARTIAL_MATCH, + EXACT_MATCH, + INFER_FORMAT +} FormatRequirement; + /* * Parses (almost) standard ISO 8601 date strings. The differences are: * @@ -61,7 +76,7 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, int *out_tzoffset, const char* format, int format_len, - int exact); + FormatRequirement format_requirement); /* * Provides a string length to use for converting datetime diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 27e99706137b6..69878625295d6 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,4 +1,18 @@ """Strptime-related classes and functions. + +TimeRE, _calc_julian_from_U_or_W are vendored +from the standard library, see +https://github.com/python/cpython/blob/main/Lib/_strptime.py +The original module-level docstring follows. + +Strptime-related classes and functions. +CLASSES: + LocaleTime -- Discovers and stores locale-specific time information + TimeRE -- Creates regexes for pattern matching a string of text containing + time information +FUNCTIONS: + _getlang -- Figure out what language is being used for the locale + strptime -- Calculates the time struct represented by the passed-in string """ from datetime import timezone @@ -10,10 +24,16 @@ from cpython.datetime cimport ( timedelta, tzinfo, ) +from _strptime import ( + TimeRE as _TimeRE, + _getlang, +) +from _strptime import LocaleTime # no-cython-lint import_datetime() from _thread import allocate_lock as _thread_allocate_lock +import re import numpy as np import pytz @@ -50,6 +70,7 @@ from pandas._libs.util cimport ( is_float_object, is_integer_object, ) + from pandas._libs.tslibs.timestamps import Timestamp cnp.import_array() @@ -60,15 +81,23 @@ cdef bint format_is_iso(f: str): Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different but must be consistent. Leading 0s in dates and times are optional. """ + iso_regex = re.compile( + r""" + ^ # start of string + %Y # Year + (?:([-/ \\.]?)%m # month with or without separators + (?: \1%d # day with same separator as for year-month + (?:[ T]%H # hour with separator + (?:\:%M # minute with separator + (?:\:%S # second with separator + (?:%z|\.%f(?:%z)? # timezone or fractional second + )?)?)?)?)?)? 
# optional + $ # end of string + """, + re.VERBOSE, + ) excluded_formats = ["%Y%m"] - - for date_sep in [" ", "/", "\\", "-", ".", ""]: - for time_sep in [" ", "T"]: - for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: - iso_fmt = f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" - if iso_fmt.startswith(f) and f not in excluded_formats: - return True - return False + return re.match(iso_regex, f) is not None and f not in excluded_formats def _test_format_is_iso(f: str) -> bool: @@ -487,29 +516,6 @@ def array_strptime( return result, result_timezone.base -""" -TimeRE, _calc_julian_from_U_or_W are vendored -from the standard library, see -https://github.com/python/cpython/blob/main/Lib/_strptime.py -The original module-level docstring follows. - -Strptime-related classes and functions. -CLASSES: - LocaleTime -- Discovers and stores locale-specific time information - TimeRE -- Creates regexes for pattern matching a string of text containing - time information -FUNCTIONS: - _getlang -- Figure out what language is being used for the locale - strptime -- Calculates the time struct represented by the passed-in string -""" - -from _strptime import ( - TimeRE as _TimeRE, - _getlang, -) -from _strptime import LocaleTime # no-cython-lint - - class TimeRE(_TimeRE): """ Handle conversion from format directives to regexes. diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 0cef0ad128aee..b57f2ce5bd953 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1525,16 +1525,11 @@ class Timestamp(_Timestamp): elif is_integer_object(year): # User passed positional arguments: # Timestamp(year, month, day[, hour[, minute[, second[, - # microsecond[, nanosecond[, tzinfo]]]]]]) + # microsecond[, tzinfo]]]]]) ts_input = datetime(ts_input, year, month, day or 0, hour or 0, minute or 0, second or 0, fold=fold or 0) unit = None - if nanosecond is None: - # nanosecond was not passed as a keyword, but may have been - # passed positionally see test_constructor_nanosecond - nanosecond = microsecond - if getattr(ts_input, "tzinfo", None) is not None and tz is not None: raise ValueError("Cannot pass a datetime or Timestamp with tzinfo with " "the tz parameter. 
Use tz_convert instead.") diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a54050fdf3cf2..547286bd40b64 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -240,6 +240,8 @@ timedelta-like} str stamp Localizer info = Localizer(tz, creso=creso) int64_t pph = periods_per_day(creso) // 24 + int64_t pps = periods_per_second(creso) + npy_datetimestruct dts # Vectorized version of DstTzInfo.localize if info.use_utc: @@ -388,14 +390,26 @@ timedelta-like} new_local = val - remaining_mins - 1 if is_zi: - raise NotImplementedError( - "nonexistent shifting is not implemented with ZoneInfo tzinfos" - ) + # use the same construction as in _get_utc_bounds_zoneinfo + pandas_datetime_to_datetimestruct(new_local, creso, &dts) + extra = (dts.ps // 1000) * (pps // 1_000_000_000) + + dt = datetime_new(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, None) - delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans) + if shift_forward or shift_delta > 0: + dt = dt.replace(tzinfo=tz, fold=1) + else: + dt = dt.replace(tzinfo=tz, fold=0) + dt = dt.astimezone(utc_stdlib) + dt = dt.replace(tzinfo=None) + result[i] = pydatetime_to_dt64(dt, &dts, creso) + extra + + else: + delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans) - delta_idx = delta_idx - delta_idx_offset - result[i] = new_local - info.deltas[delta_idx] + delta_idx = delta_idx - delta_idx_offset + result[i] = new_local - info.deltas[delta_idx] elif fill_nonexist: result[i] = NPY_NAT else: diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 43020ae471f10..eb2905751a9b4 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -886,7 +886,7 @@ def external_error_raised(expected_exception: type[Exception]) -> ContextManager """ import pytest - return pytest.raises(expected_exception, match=None) # noqa: PDF010 + return pytest.raises(expected_exception, match=None) cython_table = pd.core.common._cython_table.items() diff --git a/pandas/_typing.py b/pandas/_typing.py index 1ba5be8b5b0ed..8d3044a978291 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -44,6 +44,10 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas import Interval + from pandas.arrays import ( + DatetimeArray, + TimedeltaArray, + ) from pandas.core.arrays.base import ExtensionArray from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame @@ -88,6 +92,7 @@ ArrayLike = Union["ExtensionArray", np.ndarray] AnyArrayLike = Union[ArrayLike, "Index", "Series"] +TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"] # scalars diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 085a2a80ca8ec..b59b9632913e4 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,7 +14,6 @@ import sys from pandas._typing import F -import pandas.compat._compressors from pandas.compat._constants import ( IS64, PY39, @@ -22,6 +21,7 @@ PY311, PYPY, ) +import pandas.compat.compressors from pandas.compat.numpy import ( is_numpy_dev, np_version_under1p21, @@ -131,7 +131,7 @@ def is_ci_environment() -> bool: return os.environ.get("PANDAS_CI", "0") == "1" -def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: +def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]: """ Importing the `LZMAFile` class from the `lzma` module. 
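A minimal usage sketch of the helper above after the rename (the pickle file name is hypothetical; assumes Python was built with ``lzma``)::

    from pandas.compat import get_lzma_file

    # Resolves to pandas.compat.compressors.LZMAFile after this patch;
    # raises RuntimeError when the lzma module is unavailable.
    LZMAFile = get_lzma_file()
    with LZMAFile("frame.pkl.xz", "rb") as fh:  # hypothetical file
        payload = fh.read()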
@@ -145,13 +145,13 @@ def get_lzma_file() -> type[pandas.compat._compressors.LZMAFile]: RuntimeError If the `lzma` module was not imported correctly, or didn't exist. """ - if not pandas.compat._compressors.has_lzma: + if not pandas.compat.compressors.has_lzma: raise RuntimeError( "lzma module not available. " "A Python re-install with the proper dependencies, " "might be required to solve this issue." ) - return pandas.compat._compressors.LZMAFile + return pandas.compat.compressors.LZMAFile __all__ = [ diff --git a/pandas/compat/_compressors.py b/pandas/compat/compressors.py similarity index 100% rename from pandas/compat/_compressors.py rename to pandas/compat/compressors.py diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0076b3d2b64c3..de85ed67e7e8c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Any, + Literal, TypeVar, cast, ) @@ -116,6 +117,11 @@ def floordiv_compat( } if TYPE_CHECKING: + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas import Series ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") @@ -406,8 +412,14 @@ def _cmp_method(self, other, op): f"{op.__name__} not implemented for {type(other)}" ) - result = result.to_numpy() - return BooleanArray._from_sequence(result) + if result.null_count > 0: + # GH50524: avoid conversion to object for better perf + values = pc.fill_null(result, False).to_numpy() + mask = result.is_null().to_numpy() + else: + values = result.to_numpy() + mask = np.zeros(len(values), dtype=np.bool_) + return BooleanArray(values, mask) def _evaluate_op_method(self, other, op, arrow_funcs): pc_func = arrow_funcs[op.__name__] @@ -687,6 +699,23 @@ def round( """ return type(self)(pc.round(self._data, ndigits=decimals)) + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) + if isinstance(value, ExtensionArray): + value = value.astype(object) + # Base class searchsorted would cast to object, which is *much* slower. + return self.to_numpy().searchsorted(value, side=side, sorter=sorter) + def take( self, indices: TakeIndexer, @@ -977,7 +1006,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs): return self.dtype.na_value return result.as_py() - def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: + def __setitem__(self, key, value) -> None: """Set one or more values inplace. 
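As a quick illustration of the ``ArrowExtensionArray.searchsorted`` guard added above, a minimal sketch (assumes pyarrow is installed; values are illustrative)::

    import pandas as pd

    arr = pd.array([1, None, 3], dtype="int64[pyarrow]")
    try:
        arr.searchsorted(2)
    except ValueError:
        # NAs leave the sort order undefined, so the guard refuses up front
        pass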
         Parameters
@@ -998,6 +1027,10 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         -------
         None
         """
+        # GH50085: unwrap 1D indexers
+        if isinstance(key, tuple) and len(key) == 1:
+            key = key[0]
+
         key = check_array_indexer(self, key)
 
         value = self._maybe_convert_setitem_value(value)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 6954c97007d23..422b9effeface 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -348,7 +348,7 @@ def __getitem__(
         """
         raise AbstractMethodError(self)
 
-    def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
+    def __setitem__(self, key, value) -> None:
         """
         Set one or more values inplace.
 
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index adf6522f76a1a..d7d28eed16f8b 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -190,10 +190,9 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
 
     Assumes that __new__/__init__ defines:
         _ndarray
-        _freq
 
-    and that the inheriting class has methods:
-        _generate_range
+    and that inheriting subclass implements:
+        freq
     """
 
     # _infer_matches -> which infer_dtype strings are close enough to our own
@@ -201,6 +200,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
     _is_recognized_dtype: Callable[[DtypeObj], bool]
     _recognized_scalars: tuple[type, ...]
     _ndarray: np.ndarray
+    freq: BaseOffset | None
 
     @cache_readonly
     def _can_hold_na(self) -> bool:
@@ -407,7 +407,4 @@ def _get_getitem_freq(self, key) -> BaseOffset | None:
-    # error: Argument 1 of "__setitem__" is incompatible with supertype
-    # "ExtensionArray"; supertype defines the argument type as "Union[int,
-    # ndarray]"
-    def __setitem__(  # type: ignore[override]
+    def __setitem__(
         self,
         key: int | Sequence[int] | Sequence[bool] | slice,
         value: NaTType | Any | Sequence[Any],
@@ -896,24 +896,6 @@ def _maybe_mask_results(
 
     # ------------------------------------------------------------------
     # Frequency Properties/Methods
 
-    @property
-    def freq(self):
-        """
-        Return the frequency object if it is set, otherwise None.
- """ - return self._freq - - @freq.setter - def freq(self, value) -> None: - if value is not None: - value = to_offset(value) - self._validate_frequency(self, value) - - if self.ndim > 1: - raise ValueError("Cannot set freq with ndim > 1") - - self._freq = value - @property def freqstr(self) -> str | None: """ @@ -955,51 +937,6 @@ def resolution(self) -> str: # error: Item "None" of "Optional[Any]" has no attribute "attrname" return self._resolution_obj.attrname # type: ignore[union-attr] - @classmethod - def _validate_frequency(cls, index, freq, **kwargs): - """ - Validate that a frequency is compatible with the values of a given - Datetime Array/Index or Timedelta Array/Index - - Parameters - ---------- - index : DatetimeIndex or TimedeltaIndex - The index on which to determine if the given frequency is valid - freq : DateOffset - The frequency to validate - """ - # TODO: this is not applicable to PeriodArray, move to correct Mixin - inferred = index.inferred_freq - if index.size == 0 or inferred == freq.freqstr: - return None - - try: - on_freq = cls._generate_range( - start=index[0], end=None, periods=len(index), freq=freq, **kwargs - ) - if not np.array_equal(index.asi8, on_freq.asi8): - raise ValueError - except ValueError as e: - if "non-fixed" in str(e): - # non-fixed frequencies are not meaningful for timedelta64; - # we retain that error message - raise e - # GH#11587 the main way this is reached is if the `np.array_equal` - # check above is False. This can also be reached if index[0] - # is `NaT`, in which case the call to `cls._generate_range` will - # raise a ValueError, which we re-raise with a more targeted - # message. - raise ValueError( - f"Inferred frequency {inferred} from passed values " - f"does not conform to passed frequency {freq.freqstr}" - ) from e - - @classmethod - def _generate_range( - cls: type[DatetimeLikeArrayT], start, end, periods, freq, *args, **kwargs - ) -> DatetimeLikeArrayT: - raise AbstractMethodError(cls) - # monotonicity/uniqueness properties are called via frequencies.infer_freq, # see GH#23789 @@ -1953,6 +1890,68 @@ def __init__( def _validate_dtype(cls, values, dtype): raise AbstractMethodError(cls) + @property + def freq(self): + """ + Return the frequency object if it is set, otherwise None. + """ + return self._freq + + @freq.setter + def freq(self, value) -> None: + if value is not None: + value = to_offset(value) + self._validate_frequency(self, value) + + if self.ndim > 1: + raise ValueError("Cannot set freq with ndim > 1") + + self._freq = value + + @classmethod + def _validate_frequency(cls, index, freq, **kwargs): + """ + Validate that a frequency is compatible with the values of a given + Datetime Array/Index or Timedelta Array/Index + + Parameters + ---------- + index : DatetimeIndex or TimedeltaIndex + The index on which to determine if the given frequency is valid + freq : DateOffset + The frequency to validate + """ + inferred = index.inferred_freq + if index.size == 0 or inferred == freq.freqstr: + return None + + try: + on_freq = cls._generate_range( + start=index[0], end=None, periods=len(index), freq=freq, **kwargs + ) + if not np.array_equal(index.asi8, on_freq.asi8): + raise ValueError + except ValueError as err: + if "non-fixed" in str(err): + # non-fixed frequencies are not meaningful for timedelta64; + # we retain that error message + raise err + # GH#11587 the main way this is reached is if the `np.array_equal` + # check above is False. 
This can also be reached if index[0] + # is `NaT`, in which case the call to `cls._generate_range` will + # raise a ValueError, which we re-raise with a more targeted + # message. + raise ValueError( + f"Inferred frequency {inferred} from passed values " + f"does not conform to passed frequency {freq.freqstr}" + ) from err + + @classmethod + def _generate_range( + cls: type[DatetimeLikeArrayT], start, end, periods, freq, *args, **kwargs + ) -> DatetimeLikeArrayT: + raise AbstractMethodError(cls) + # -------------------------------------------------------------- @cache_readonly diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 608b38765621b..01584a66f424b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -50,10 +50,7 @@ TimeNonexistent, npt, ) -from pandas.errors import ( - OutOfBoundsDatetime, - PerformanceWarning, -) +from pandas.errors import PerformanceWarning from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_inclusive @@ -2154,18 +2151,14 @@ def objects_to_datetime64ns( flags = data.flags order: Literal["F", "C"] = "F" if flags.f_contiguous else "C" - try: - result, tz_parsed = tslib.array_to_datetime( - data.ravel("K"), - errors=errors, - utc=utc, - dayfirst=dayfirst, - yearfirst=yearfirst, - ) - result = result.reshape(data.shape, order=order) - except OverflowError as err: - # Exception is raised when a part of date is greater than 32 bit signed int - raise OutOfBoundsDatetime("Out of bounds nanosecond timestamp") from err + result, tz_parsed = tslib.array_to_datetime( + data.ravel("K"), + errors=errors, + utc=utc, + dayfirst=dayfirst, + yearfirst=yearfirst, + ) + result = result.reshape(data.shape, order=order) if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f7107a1f7c83c..2f13f199f9744 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -39,6 +39,7 @@ ScalarIndexer, SequenceIndexer, SortKind, + TimeArrayLike, npt, ) from pandas.compat.numpy import function as nv @@ -82,6 +83,8 @@ ExtensionArray, _extension_array_shared_docs, ) +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.timedeltas import TimedeltaArray import pandas.core.common as com from pandas.core.construction import ( array as pd_array, @@ -102,6 +105,7 @@ IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") +IntervalSideT = Union[TimeArrayLike, np.ndarray] IntervalOrNA = Union[Interval, float] _interval_shared_docs: dict[str, str] = {} @@ -123,8 +127,8 @@ Parameters ---------- data : array-like (1-dimensional) - Array-like containing Interval objects from which to build the - %(klass)s. + Array-like (ndarray, :class:`DateTimeArray`, :class:`TimeDeltaArray`) containing + Interval objects from which to build the %(klass)s. closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. 
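For example, both plain ndarrays and datetime-backed arrays are accepted as interval sides (a small sketch against this branch)::

    import pandas as pd

    ints = pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3], closed="right")
    dts = pd.arrays.IntervalArray.from_arrays(
        pd.date_range("2023-01-01", periods=2),  # DatetimeArray-backed left side
        pd.date_range("2023-01-02", periods=2),  # right side, same dtype
    )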
@@ -213,8 +217,8 @@ def ndim(self) -> Literal[1]: return 1 # To make mypy recognize the fields - _left: np.ndarray - _right: np.ndarray + _left: IntervalSideT + _right: IntervalSideT _dtype: IntervalDtype # --------------------------------------------------------------------- @@ -232,9 +236,10 @@ def __new__( data = extract_array(data, extract_numpy=True) if isinstance(data, cls): - left = data._left - right = data._right + left: IntervalSideT = data._left + right: IntervalSideT = data._right closed = closed or data.closed + dtype = IntervalDtype(left.dtype, closed=closed) else: # don't allow scalars @@ -255,37 +260,57 @@ def __new__( right = lib.maybe_convert_objects(right) closed = closed or infer_closed + left, right, dtype = cls._ensure_simple_new_inputs( + left, + right, + closed=closed, + copy=copy, + dtype=dtype, + ) + + if verify_integrity: + cls._validate(left, right, dtype=dtype) + return cls._simple_new( left, right, - closed, - copy=copy, dtype=dtype, - verify_integrity=verify_integrity, ) @classmethod def _simple_new( cls: type[IntervalArrayT], + left: IntervalSideT, + right: IntervalSideT, + dtype: IntervalDtype, + ) -> IntervalArrayT: + result = IntervalMixin.__new__(cls) + result._left = left + result._right = right + result._dtype = dtype + + return result + + @classmethod + def _ensure_simple_new_inputs( + cls, left, right, closed: IntervalClosedType | None = None, copy: bool = False, dtype: Dtype | None = None, - verify_integrity: bool = True, - ) -> IntervalArrayT: - result = IntervalMixin.__new__(cls) + ) -> tuple[IntervalSideT, IntervalSideT, IntervalDtype]: + """Ensure correctness of input parameters for cls._simple_new.""" + from pandas.core.indexes.base import ensure_index + + left = ensure_index(left, copy=copy) + right = ensure_index(right, copy=copy) if closed is None and isinstance(dtype, IntervalDtype): closed = dtype.closed closed = closed or "right" - from pandas.core.indexes.base import ensure_index - - left = ensure_index(left, copy=copy) - right = ensure_index(right, copy=copy) - if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) @@ -346,13 +371,8 @@ def _simple_new( right = right.copy() dtype = IntervalDtype(left.dtype, closed=closed) - result._dtype = dtype - result._left = left - result._right = right - if verify_integrity: - result._validate() - return result + return left, right, dtype @classmethod def _from_sequence( @@ -512,9 +532,16 @@ def from_arrays( left = _maybe_convert_platform_interval(left) right = _maybe_convert_platform_interval(right) - return cls._simple_new( - left, right, closed, copy=copy, dtype=dtype, verify_integrity=True + left, right, dtype = cls._ensure_simple_new_inputs( + left, + right, + closed=closed, + copy=copy, + dtype=dtype, ) + cls._validate(left, right, dtype=dtype) + + return cls._simple_new(left, right, dtype=dtype) _interval_shared_docs["from_tuples"] = textwrap.dedent( """ @@ -599,32 +626,33 @@ def from_tuples( return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) - def _validate(self): + @classmethod + def _validate(cls, left, right, dtype: IntervalDtype) -> None: """ Verify that the IntervalArray is valid. 
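The validation, now a classmethod, still raises the familiar errors; a quick sketch (the message is the one used below)::

    import pandas as pd

    try:
        pd.arrays.IntervalArray.from_arrays([2], [1])  # inverted bounds
    except ValueError as err:
        assert "left side of interval must be <= right side" in str(err)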
Checks that - * closed is valid + * dtype is correct * left and right match lengths * left and right have the same missing values * left is always below right """ - if self.closed not in VALID_CLOSED: - msg = f"invalid option for 'closed': {self.closed}" + if not isinstance(dtype, IntervalDtype): + msg = f"invalid dtype: {dtype}" raise ValueError(msg) - if len(self._left) != len(self._right): + if len(left) != len(right): msg = "left and right must have the same length" raise ValueError(msg) - left_mask = notna(self._left) - right_mask = notna(self._right) + left_mask = notna(left) + right_mask = notna(right) if not (left_mask == right_mask).all(): msg = ( "missing values must be missing in the same " "location both left and right sides" ) raise ValueError(msg) - if not (self._left[left_mask] <= self._right[left_mask]).all(): + if not (left[left_mask] <= right[left_mask]).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) @@ -639,7 +667,11 @@ def _shallow_copy(self: IntervalArrayT, left, right) -> IntervalArrayT: right : Index Values to be used for the right-side of the intervals. """ - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) + dtype = IntervalDtype(left.dtype, closed=self.closed) + left, right, dtype = self._ensure_simple_new_inputs(left, right, dtype=dtype) + self._validate(left, right, dtype=dtype) + + return self._simple_new(left, right, dtype=dtype) # --------------------------------------------------------------------- # Descriptive @@ -988,7 +1020,10 @@ def _concat_same_type( left = np.concatenate([interval.left for interval in to_concat]) right = np.concatenate([interval.right for interval in to_concat]) - return cls._simple_new(left, right, closed=closed, copy=False) + + left, right, dtype = cls._ensure_simple_new_inputs(left, right, closed=closed) + + return cls._simple_new(left, right, dtype=dtype) def copy(self: IntervalArrayT) -> IntervalArrayT: """ @@ -1000,9 +1035,8 @@ def copy(self: IntervalArrayT) -> IntervalArrayT: """ left = self._left.copy() right = self._right.copy() - closed = self.closed - # TODO: Could skip verify_integrity here. 
- return type(self).from_arrays(left, right, closed=closed) + dtype = self.dtype + return self._simple_new(left, right, dtype=dtype) def isna(self) -> np.ndarray: return isna(self._left) @@ -1402,9 +1436,9 @@ def set_closed(self: IntervalArrayT, closed: IntervalClosedType) -> IntervalArra msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) - return type(self)._simple_new( - left=self._left, right=self._right, closed=closed, verify_integrity=False - ) + left, right = self._left, self._right + dtype = IntervalDtype(left.dtype, closed=closed) + return self._simple_new(left, right, dtype=dtype) _interval_shared_docs[ "is_non_overlapping_monotonic" @@ -1546,9 +1580,11 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: if isinstance(self._left, np.ndarray): np.putmask(self._left, mask, value_left) + assert isinstance(self._right, np.ndarray) np.putmask(self._right, mask, value_right) else: self._left._putmask(mask, value_left) + assert not isinstance(self._right, np.ndarray) self._right._putmask(mask, value_right) def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT: @@ -1576,9 +1612,11 @@ def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT: def delete(self: IntervalArrayT, loc) -> IntervalArrayT: if isinstance(self._left, np.ndarray): new_left = np.delete(self._left, loc) + assert isinstance(self._right, np.ndarray) new_right = np.delete(self._right, loc) else: new_left = self._left.delete(loc) + assert not isinstance(self._right, np.ndarray) new_right = self._right.delete(loc) return self._shallow_copy(left=new_left, right=new_right) @@ -1679,7 +1717,7 @@ def isin(self, values) -> npt.NDArray[np.bool_]: return isin(self.astype(object), values.astype(object)) @property - def _combined(self) -> ArrayLike: + def _combined(self) -> IntervalSideT: left = self.left._values.reshape(-1, 1) right = self.right._values.reshape(-1, 1) if needs_i8_conversion(left.dtype): @@ -1696,15 +1734,12 @@ def _from_combined(self, combined: np.ndarray) -> IntervalArray: dtype = self._left.dtype if needs_i8_conversion(dtype): - # error: "Type[ndarray[Any, Any]]" has no attribute "_from_sequence" - new_left = type(self._left)._from_sequence( # type: ignore[attr-defined] - nc[:, 0], dtype=dtype - ) - # error: "Type[ndarray[Any, Any]]" has no attribute "_from_sequence" - new_right = type(self._right)._from_sequence( # type: ignore[attr-defined] - nc[:, 1], dtype=dtype - ) + assert isinstance(self._left, (DatetimeArray, TimedeltaArray)) + new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype) + assert isinstance(self._right, (DatetimeArray, TimedeltaArray)) + new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype) else: + assert isinstance(dtype, np.dtype) new_left = nc[:, 0].view(dtype) new_right = nc[:, 1].view(dtype) return self._shallow_copy(left=new_left, right=new_right) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 859bb53b6489a..e6682b0dea814 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -353,8 +353,8 @@ def _check_compatible_with(self, other) -> None: def dtype(self) -> PeriodDtype: return self._dtype - # error: Read-only property cannot override read-write property - @property # type: ignore[misc] + # error: Cannot override writeable attribute with read-only property + @property # type: ignore[override] def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. 
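A small sketch of the user-facing split this preserves: the datetime/timedelta ``freq`` stays settable, while the period ``freq`` stays read-only (outputs shown as comments)::

    import pandas as pd

    dti = pd.date_range("2023-01-01", periods=4, freq="D")
    dti.freq = None                  # clearing the cached frequency is allowed

    pi = pd.period_range("2023-01", periods=3, freq="M")
    print(pi.freq)                   # <MonthEnd>; read-only for periods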
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e5fb3fc3ff836..9b26db07fc28f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np @@ -54,6 +57,11 @@ if TYPE_CHECKING: import pyarrow + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas import Series @@ -492,6 +500,20 @@ def memory_usage(self, deep: bool = False) -> int: return result + lib.memory_usage_of_objects(self._ndarray) return result + @doc(ExtensionArray.searchsorted) + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + if self._hasna: + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) + return super().searchsorted(value=value, side=side, sorter=sorter) + def _cmp_method(self, other, op): from pandas.arrays import BooleanArray diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97262b1f4bb21..fb081d0e63c96 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,8 +1,10 @@ from __future__ import annotations -from collections.abc import Callable # noqa: PDF001 import re -from typing import Union +from typing import ( + Callable, + Union, +) import numpy as np diff --git a/pandas/core/base.py b/pandas/core/base.py index e5e0ac4e121ae..23121b7075fe1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -531,12 +531,19 @@ def to_numpy( f"to_numpy() got an unexpected keyword argument '{bad_keys}'" ) - result = np.asarray(self._values, dtype=dtype) - # TODO(GH-24345): Avoid potential double copy - if copy or na_value is not lib.no_default: - result = result.copy() - if na_value is not lib.no_default: - result[np.asanyarray(self.isna())] = na_value + if na_value is not lib.no_default: + values = self._values.copy() + values[np.asanyarray(self.isna())] = na_value + else: + values = self._values + + result = np.asarray(values, dtype=dtype) + + if copy and na_value is lib.no_default: + if np.shares_memory(self._values[:2], result[:2]): + # Take slices to improve performance of check + result = result.copy() + return result @final diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4735731e8d6d9..aae815bb68e05 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1200,6 +1200,8 @@ def is_numeric_dtype(arr_or_dtype) -> bool: """ return _is_dtype_type( arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_) + ) or _is_dtype( + arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ._is_numeric ) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 955f65585963d..8cd1a2543e23a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -95,7 +95,6 @@ ) from pandas.core.indexes.category import CategoricalIndex from pandas.core.series import Series -from pandas.core.shared_docs import _shared_docs from pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -1848,17 +1847,82 @@ def nunique(self, dropna: bool = True) -> DataFrame: return results - @doc( - _shared_docs["idxmax"], - numeric_only_default="False", - ) def idxmax( self, - axis: Axis = 0, + axis: 
Axis | None = None,
         skipna: bool = True,
         numeric_only: bool = False,
     ) -> DataFrame:
-        axis = DataFrame._get_axis_number(axis)
+        """
+        Return index of first occurrence of maximum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default None
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
+            column-wise. If axis is not provided, the grouper's axis is used.
+
+            .. versionchanged:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of maxima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmax : Return index of the maximum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmax``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
+        ...                    'co2_emissions': [37.2, 19.66, 1712]},
+        ...                   index=['Pork', 'Wheat Products', 'Beef'])
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51         37.20
+        Wheat Products       103.11         19.66
+        Beef                  55.48       1712.00
+
+        By default, it returns the index for the maximum value in each column.
+
+        >>> df.idxmax()
+        consumption     Wheat Products
+        co2_emissions             Beef
+        dtype: object
+
+        To return the index for the maximum value in each row, use ``axis="columns"``.
+
+        >>> df.idxmax(axis="columns")
+        Pork              co2_emissions
+        Wheat Products      consumption
+        Beef              co2_emissions
+        dtype: object
+        """
+        if axis is None:
+            axis = self.axis
 
         def func(df):
             res = df._reduce(
@@ -1879,17 +1943,82 @@ def func(df):
             )
             return result
 
-    @doc(
-        _shared_docs["idxmin"],
-        numeric_only_default="False",
-    )
     def idxmin(
         self,
-        axis: Axis = 0,
+        axis: Axis | None = None,
         skipna: bool = True,
         numeric_only: bool = False,
     ) -> DataFrame:
-        axis = DataFrame._get_axis_number(axis)
+        """
+        Return index of first occurrence of minimum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default None
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
+            column-wise. If axis is not provided, the grouper's axis is used.
+
+            .. versionchanged:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of minima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmin : Return index of the minimum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmin``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
+        ...                    'co2_emissions': [37.2, 19.66, 1712]},
+        ...                   index=['Pork', 'Wheat Products', 'Beef'])
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51         37.20
+        Wheat Products       103.11         19.66
+        Beef                  55.48       1712.00
+
+        By default, it returns the index for the minimum value in each column.
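A groupby-flavored sketch of the new ``axis=None`` default (data and labels are illustrative)::

    import pandas as pd

    df = pd.DataFrame({"grp": ["a", "a", "b"], "val": [1, 3, 2]},
                      index=["r1", "r2", "r3"])
    print(df.groupby("grp")["val"].idxmax())   # axis falls back to the grouper's axis
    # grp
    # a    r2
    # b    r3
    # Name: val, dtype: object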
+ + >>> df.idxmin() + consumption Pork + co2_emissions Wheat Products + dtype: object + + To return the index for the minimum value in each row, use ``axis="columns"``. + + >>> df.idxmin(axis="columns") + Pork consumption + Wheat Products co2_emissions + Beef consumption + dtype: object + """ + if axis is None: + axis = self.axis def func(df): res = df._reduce( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b823a7a51943e..d1d361c4e8bee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1806,8 +1806,6 @@ def result_to_bool( libgroupby.group_any_all, numeric_only=False, cython_dtype=np.dtype(np.int8), - needs_mask=True, - needs_nullable=True, pre_processing=objs_to_bool, post_processing=result_to_bool, val_test=val_test, @@ -2084,13 +2082,24 @@ def std( f"{type(self).__name__}.std called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) + + def _postprocessing( + vals, inference, nullable: bool = False, mask=None + ) -> ArrayLike: + if nullable: + if mask.ndim == 2: + mask = mask[:, 0] + return FloatingArray(np.sqrt(vals), mask.view(np.bool_)) + return np.sqrt(vals) + result = self._get_cythonized_result( libgroupby.group_var, cython_dtype=np.dtype(np.float64), numeric_only=numeric_only, needs_counts=True, - post_processing=lambda vals, inference: np.sqrt(vals), + post_processing=_postprocessing, ddof=ddof, + how="std", ) return result @@ -3174,6 +3183,9 @@ def ngroup(self, ascending: bool = True): would be seen when iterating over the groupby object, not the order they are first observed. + Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN` + and will be skipped from the count. + Parameters ---------- ascending : bool, default True @@ -3190,38 +3202,38 @@ def ngroup(self, ascending: bool = True): Examples -------- - >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]}) >>> df - A - 0 a - 1 a - 2 a - 3 b - 4 b - 5 a - >>> df.groupby('A').ngroup() - 0 0 - 1 0 - 2 0 - 3 1 - 4 1 - 5 0 - dtype: int64 - >>> df.groupby('A').ngroup(ascending=False) + color + 0 red + 1 None + 2 red + 3 blue + 4 blue + 5 red + >>> df.groupby("color").ngroup() + 0 1.0 + 1 NaN + 2 1.0 + 3 0.0 + 4 0.0 + 5 1.0 + dtype: float64 + >>> df.groupby("color", dropna=False).ngroup() 0 1 - 1 1 + 1 2 2 1 3 0 4 0 5 1 dtype: int64 - >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() - 0 0 + >>> df.groupby("color", dropna=False).ngroup(ascending=False) + 0 1 1 0 2 1 - 3 3 + 3 2 4 2 - 5 0 + 5 1 dtype: int64 """ with self._group_selection_context(): @@ -3495,10 +3507,9 @@ def _get_cythonized_result( cython_dtype: np.dtype, numeric_only: bool = False, needs_counts: bool = False, - needs_nullable: bool = False, - needs_mask: bool = False, pre_processing=None, post_processing=None, + how: str = "any_all", **kwargs, ): """ @@ -3513,12 +3524,6 @@ def _get_cythonized_result( Whether only numeric datatypes should be computed needs_counts : bool, default False Whether the counts should be a part of the Cython call - needs_mask : bool, default False - Whether boolean mask needs to be part of the Cython call - signature - needs_nullable : bool, default False - Whether a bool specifying if the input is nullable is part - of the Cython call signature pre_processing : function, default None Function to be applied to `values` prior to passing to Cython. 
Function should return a tuple where the first element is the @@ -3533,6 +3538,8 @@ def _get_cythonized_result( second argument, i.e. the signature should be (ndarray, Type). If `needs_nullable=True`, a third argument should be `nullable`, to allow for processing specific to nullable values. + how : str, default any_all + Determines if any/all cython interface or std interface is used. **kwargs : dict Extra arguments to be passed back to Cython funcs @@ -3576,16 +3583,20 @@ def blk_func(values: ArrayLike) -> ArrayLike: vals = vals.reshape((-1, 1)) func = partial(func, values=vals) - if needs_mask: + if how != "std" or isinstance(values, BaseMaskedArray): mask = isna(values).view(np.uint8) if mask.ndim == 1: mask = mask.reshape(-1, 1) func = partial(func, mask=mask) - if needs_nullable: + if how != "std": is_nullable = isinstance(values, BaseMaskedArray) func = partial(func, nullable=is_nullable) + else: + result_mask = np.zeros(result.shape, dtype=np.bool_) + func = partial(func, result_mask=result_mask) + func(**kwargs) # Call func to modify indexer values in place if values.ndim == 1: @@ -3593,9 +3604,10 @@ def blk_func(values: ArrayLike) -> ArrayLike: result = result[:, 0] if post_processing: - pp_kwargs = {} - if needs_nullable: - pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) + pp_kwargs: dict[str, bool | np.ndarray] = {} + pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) + if how == "std": + pp_kwargs["mask"] = result_mask result = post_processing(result, inferences, **pp_kwargs) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e323877a512b0..66ad1b3ea7196 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -425,14 +425,22 @@ class Grouping: If we are a Categorical, use the observed values in_axis : if the Grouping is a column in self.obj and hence among Groupby.exclusions list + dropna : bool, default True + Whether to drop NA groups. + uniques : Array-like, optional + When specified, will be used for unique values. Enables including empty groups + in the result for a BinGrouper. Must not contain duplicates. 
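The ``uniques`` hook exists so a ``BinGrouper`` can surface empty bins, e.g. in resampling (a sketch; values are illustrative)::

    import pandas as pd

    s = pd.Series([1.0, 2.0],
                  index=pd.to_datetime(["2023-01-01", "2023-01-03"]))
    print(s.resample("D").size())   # the empty 2023-01-02 bin is included with count 0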
-    Returns
-    -------
-    **Attributes**:
-    * indices : dict of {group -> index_list}
-    * codes : ndarray, group codes
-    * group_index : unique groups
-    * groups : dict of {group -> label_list}
+    Attributes
+    ----------
+    indices : dict
+        Mapping of {group -> index_list}
+    codes : ndarray
+        Group codes
+    group_index : Index or None
+        Unique groups
+    groups : dict
+        Mapping of {group -> label_list}
     """
 
     _codes: npt.NDArray[np.signedinteger] | None = None
@@ -452,6 +460,7 @@ def __init__(
         observed: bool = False,
         in_axis: bool = False,
         dropna: bool = True,
+        uniques: ArrayLike | None = None,
     ) -> None:
         self.level = level
         self._orig_grouper = grouper
@@ -464,6 +473,7 @@ def __init__(
         self._observed = observed
         self.in_axis = in_axis
         self._dropna = dropna
+        self._uniques = uniques
 
         self._passed_categorical = False
 
@@ -653,6 +663,7 @@ def group_index(self) -> Index:
 
     @cache_readonly
     def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
+        uniques: ArrayLike
         if self._passed_categorical:
             # we make a CategoricalIndex out of the cat grouper
             # preserving the categories / ordered attributes;
@@ -697,11 +708,13 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
         elif isinstance(self.grouping_vector, ops.BaseGrouper):
             # we have a list of groupers
             codes = self.grouping_vector.codes_info
-            # error: Incompatible types in assignment (expression has type "Union
-            # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
-            uniques = (
-                self.grouping_vector.result_index._values  # type: ignore[assignment]
-            )
+            uniques = self.grouping_vector.result_index._values
+        elif self._uniques is not None:
+            # GH#50486 Code grouping_vector using _uniques; allows
+            # including uniques that are not present in grouping_vector.
+            cat = Categorical(self.grouping_vector, categories=self._uniques)
+            codes = cat.codes
+            uniques = self._uniques
         else:
             # GH35667, replace dropna=False with use_na_sentinel=False
             # error: Incompatible types in assignment (expression has type "Union[
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c20fe34a178f5..ea902800cf7e0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -1214,7 +1214,11 @@ def names(self) -> list[Hashable]:
     @property
     def groupings(self) -> list[grouper.Grouping]:
         lev = self.binlabels
-        ping = grouper.Grouping(lev, lev, in_axis=False, level=None)
+        codes = self.group_info[0]
+        labels = lev.take(codes)
+        ping = grouper.Grouping(
+            labels, labels, in_axis=False, level=None, uniques=lev._values
+        )
         return [ping]
 
     def _aggregate_series_fast(self, obj: Series, func: Callable) -> NoReturn:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 3dc6aed56fa24..775d137523d2b 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -601,7 +601,9 @@ def _dtype_to_subclass(cls, dtype: DtypeObj):
     # See each method's docstring.
 
     @classmethod
-    def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT:
+    def _simple_new(
+        cls: type[_IndexT], values: ArrayLike, name: Hashable = None
+    ) -> _IndexT:
         """
         We require that we have a dtype compat for the values. If we are passed
         a non-dtype compat, then coerce using the constructor.
@@ -1961,7 +1963,7 @@ def droplevel(self, level: IndexLabel = 0):
         Return index with requested level(s) removed.
 
         If resulting index has only 1 level left, the result will be
-        of Index type, not MultiIndex.
+        of Index type, not MultiIndex. The original index is not modified in place.
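For example (a small sketch of the behavior documented above)::

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
    flat = mi.droplevel("y")    # new Index(['a', 'b'], name='x')
    assert mi.nlevels == 2      # the original MultiIndex is untouched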
Parameters ---------- diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1119b6e3b83ad..fde000f84e581 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -82,7 +82,7 @@ DatetimeLikeArrayMixin, cache=True, ) -@inherit_names(["mean", "freq", "freqstr"], DatetimeLikeArrayMixin) +@inherit_names(["mean", "freqstr"], DatetimeLikeArrayMixin) class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. @@ -90,10 +90,18 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): _can_hold_strings = False _data: DatetimeArray | TimedeltaArray | PeriodArray - freq: BaseOffset | None freqstr: str | None _resolution_obj: Resolution + @property + def freq(self) -> BaseOffset | None: + return self._data.freq + + @freq.setter + def freq(self, value) -> None: + # error: Property "freq" defined in "PeriodArray" is read-only [misc] + self._data.freq = value # type: ignore[misc] + @property def asi8(self) -> npt.NDArray[np.int64]: return self._data.asi8 diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 48cf6000d100d..3d149eccc746b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3503,12 +3503,8 @@ def equal_levels(self, other: MultiIndex) -> bool: def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) - if ( - any(-1 in code for code in self.codes) - and any(-1 in code for code in other.codes) - or other.has_duplicates - ): - # This is only necessary if both sides have nans or other has dups, + if other.has_duplicates: + # This is only necessary if other has dupes, # otherwise difference is faster result = super()._union(other, sort) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 1937cd4254790..a9b35b99e4b51 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -167,8 +167,13 @@ def from_range( cls._validate_dtype(dtype) return cls._simple_new(data, name=name) + # error: Argument 1 of "_simple_new" is incompatible with supertype "Index"; + # supertype defines the argument type as + # "Union[ExtensionArray, ndarray[Any, Any]]" [override] @classmethod - def _simple_new(cls, values: range, name: Hashable = None) -> RangeIndex: + def _simple_new( # type: ignore[override] + cls, values: range, name: Hashable = None + ) -> RangeIndex: result = object.__new__(cls) assert isinstance(values, range) diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 9139cb41e3af7..0de9b130f0aab 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -7,8 +7,10 @@ from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg if TYPE_CHECKING: - import pandas as pd - from pandas import Index + from pandas import ( + DataFrame, + Index, + ) class PandasDataFrameXchg(DataFrameXchg): @@ -21,7 +23,7 @@ class PandasDataFrameXchg(DataFrameXchg): """ def __init__( - self, df: pd.DataFrame, nan_as_null: bool = False, allow_copy: bool = True + self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True ) -> None: """ Constructor - an instance of this (private) class is returned from diff --git a/pandas/core/describe.py b/pandas/core/methods/describe.py similarity index 100% rename from pandas/core/describe.py rename to pandas/core/methods/describe.py diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 
4af4be20a056e..d6cba824767b5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -989,6 +989,12 @@ def var( @doc(GroupBy.size) def size(self): result = self._downsample("size") + + # If the result is a non-empty DataFrame we stack to get a Series + # GH 46826 + if isinstance(result, ABCDataFrame) and not result.empty: + result = result.stack() + if not len(self.ax): from pandas import Series diff --git a/pandas/core/series.py b/pandas/core/series.py index 873ebd16ac80b..b5d73373f061e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4070,7 +4070,9 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: dtype: object""" ), ) - def swaplevel(self, i: Level = -2, j: Level = -1, copy: bool = True) -> Series: + def swaplevel( + self, i: Level = -2, j: Level = -1, copy: bool | None = None + ) -> Series: """ Swap levels i and j in a :class:`MultiIndex`. @@ -4090,10 +4092,9 @@ def swaplevel(self, i: Level = -2, j: Level = -1, copy: bool = True) -> Series: {examples} """ assert isinstance(self.index, MultiIndex) - new_index = self.index.swaplevel(i, j) - return self._constructor(self._values, index=new_index, copy=copy).__finalize__( - self, method="swaplevel" - ) + result = self.copy(deep=copy) + result.index = self.index.swaplevel(i, j) + return result def reorder_levels(self, order: Sequence[Level]) -> Series: """ @@ -4113,7 +4114,7 @@ def reorder_levels(self, order: Sequence[Level]) -> Series: if not isinstance(self.index, MultiIndex): # pragma: no cover raise Exception("Can only reorder levels on a hierarchical axis.") - result = self.copy() + result = self.copy(deep=None) assert isinstance(result.index, MultiIndex) result.index = result.index.reorder_levels(order) return result diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 147fa622fdedc..486fab62d93e7 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -798,8 +798,8 @@ Consider a dataset containing food consumption in Argentina. >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... 'co2_emissions': [37.2, 19.66, 1712]}}, + ... index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions @@ -865,8 +865,8 @@ Consider a dataset containing food consumption in Argentina. >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) + ... 'co2_emissions': [37.2, 19.66, 1712]}}, + ... 
index=['Pork', 'Wheat Products', 'Beef']) >>> df consumption co2_emissions diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index b5618207ab9d8..c96e5a1abcf86 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -1,10 +1,10 @@ from __future__ import annotations import abc -from collections.abc import Callable # noqa: PDF001 import re from typing import ( TYPE_CHECKING, + Callable, Literal, ) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 2d77cd0da816f..508ac122d67af 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -1,12 +1,12 @@ from __future__ import annotations -from collections.abc import Callable # noqa: PDF001 import functools import re import sys import textwrap from typing import ( TYPE_CHECKING, + Callable, Literal, ) import unicodedata diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index e7be2db293527..27328809e23d8 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -139,7 +139,7 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str return guessed_format warnings.warn( "Could not infer format, so each element will be parsed " - "individually by `dateutil`. To ensure parsing is " + "individually, falling back to `dateutil`. To ensure parsing is " "consistent and as-expected, please specify a format.", UserWarning, stacklevel=find_stack_level(), diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index b1ff53e9d1a44..6e188531a0502 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -24,10 +24,10 @@ def create_section_header(header: str) -> str: template_see_also = dedent( """ - pandas.Series.{window_method} : Calling {window_method} with Series data. - pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames. - pandas.Series.{agg_method} : Aggregating {agg_method} for Series. - pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n + Series.{window_method} : Calling {window_method} with Series data. + DataFrame.{window_method} : Calling {window_method} with DataFrames. + Series.{agg_method} : Aggregating {agg_method} for Series. 
+    DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n
     """
 ).replace("\n", "", 1)

diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py
index a6c32133803d4..c0a7b2b7cc361 100644
--- a/pandas/core/window/ewm.py
+++ b/pandas/core/window/ewm.py
@@ -26,7 +26,7 @@
 )
 from pandas.core.dtypes.missing import isna

-from pandas.core import common  # noqa: PDF018
+from pandas.core import common
 from pandas.core.indexers.objects import (
     BaseIndexer,
     ExponentialMovingWindowIndexer,
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 989b82f45339f..ef0524e48f9e2 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -18,7 +18,6 @@
     Sized,
     cast,
 )
-import warnings

 import numpy as np

@@ -37,7 +36,6 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import DataError
 from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level

 from pandas.core.dtypes.common import (
     ensure_float64,
@@ -473,10 +471,6 @@ def _apply_blockwise(
             obj = notna(obj).astype(int)
             obj._mgr = obj._mgr.consolidate()

-        def hfunc(values: ArrayLike) -> ArrayLike:
-            values = self._prep_values(values)
-            return homogeneous_func(values)
-
         if self.axis == 1:
             obj = obj.T

@@ -484,13 +478,16 @@ def hfunc(values: ArrayLike) -> ArrayLike:
         res_values = []
         for i, arr in enumerate(obj._iter_column_arrays()):
             # GH#42736 operate column-wise instead of block-wise
+            # As of 2.0, nuisance columns raise a DataError instead of being dropped
             try:
-                res = hfunc(arr)
-            except (TypeError, NotImplementedError):
-                pass
-            else:
-                res_values.append(res)
-                taker.append(i)
+                arr = self._prep_values(arr)
+            except (TypeError, NotImplementedError) as err:
+                raise DataError(
+                    f"Cannot aggregate non-numeric type: {arr.dtype}"
+                ) from err
+            res = homogeneous_func(arr)
+            res_values.append(res)
+            taker.append(i)

         index = self._slice_axis_for_step(
             obj.index, res_values[0] if len(res_values) > 0 else None
@@ -505,18 +502,6 @@ def hfunc(values: ArrayLike) -> ArrayLike:
         if self.axis == 1:
             df = df.T

-        if 0 != len(res_values) != len(obj.columns):
-            # GH#42738 ignore_failures dropped nuisance columns
-            dropped = obj.columns.difference(obj.columns.take(taker))
-            warnings.warn(
-                "Dropping of nuisance columns in rolling operations "
-                "is deprecated; in a future version this will raise TypeError. "
-                "Select only valid columns before calling the operation. 
" - f"Dropped columns were {dropped}", - FutureWarning, - stacklevel=find_stack_level(), - ) - return self._resolve_output(df, obj) def _apply_tablewise( diff --git a/pandas/io/_util.py b/pandas/io/_util.py new file mode 100644 index 0000000000000..d2a001f0cf925 --- /dev/null +++ b/pandas/io/_util.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from pandas.compat._optional import import_optional_dependency + +import pandas as pd + + +def _arrow_dtype_mapping() -> dict: + pa = import_optional_dependency("pyarrow") + return { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + pa.uint8(): pd.UInt8Dtype(), + pa.uint16(): pd.UInt16Dtype(), + pa.uint32(): pd.UInt32Dtype(), + pa.uint64(): pd.UInt64Dtype(), + pa.bool_(): pd.BooleanDtype(), + pa.string(): pd.StringDtype(), + pa.float32(): pd.Float32Dtype(), + pa.float64(): pd.Float64Dtype(), + } diff --git a/pandas/io/common.py b/pandas/io/common.py index 4dae46c8f39f6..6deaf40f00c69 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -55,8 +55,8 @@ WriteBuffer, ) from pandas.compat import get_lzma_file -from pandas.compat._compressors import BZ2File as _BZ2File from pandas.compat._optional import import_optional_dependency +from pandas.compat.compressors import BZ2File as _BZ2File from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level @@ -478,7 +478,7 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_extension_to_compression = { +extension_to_compression = { ".tar": "tar", ".tar.gz": "tar", ".tar.bz2": "tar", @@ -489,7 +489,7 @@ def file_path_to_url(path: str) -> str: ".xz": "xz", ".zst": "zstd", } -_supported_compressions = set(_extension_to_compression.values()) +_supported_compressions = set(extension_to_compression.values()) def get_compression_method( @@ -565,7 +565,7 @@ def infer_compression( return None # Infer compression from the filename/URL extension - for extension, compression in _extension_to_compression.items(): + for extension, compression in extension_to_compression.items(): if filepath_or_buffer.lower().endswith(extension): return compression return None diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index a12e62223c1f1..6cc343703d00c 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -259,10 +259,14 @@ def enable_data_resource_formatter(enable: bool) -> None: if mimetype not in formatters: # define tableschema formatter from IPython.core.formatters import BaseFormatter + from traitlets import ObjectName class TableSchemaFormatter(BaseFormatter): - print_method = "_repr_data_resource_" - _return_type = (dict,) + print_method = ObjectName("_repr_data_resource_") + # Incompatible types in assignment (expression has type + # "Tuple[Type[Dict[Any, Any]]]", base class "BaseFormatter" + # defined the type as "Type[str]") + _return_type = (dict,) # type: ignore[assignment] # register it: formatters[mimetype] = TableSchemaFormatter() diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index f2780d5fa6832..88974f3ab4afa 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -57,7 +57,7 @@ from pandas.io.common import ( IOHandles, - _extension_to_compression, + extension_to_compression, file_exists, get_handle, is_fsspec_url, @@ -854,7 +854,7 @@ def _get_data_from_filepath(self, filepath_or_buffer): elif ( isinstance(filepath_or_buffer, str) and filepath_or_buffer.lower().endswith( - 
(".json",) + tuple(f".json{c}" for c in _extension_to_compression) + (".json",) + tuple(f".json{c}" for c in extension_to_compression) ) and not file_exists(filepath_or_buffer) ): diff --git a/pandas/io/orc.py b/pandas/io/orc.py index cfa02de9bbcb3..169cb5d16da8d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -91,18 +91,20 @@ def read_orc( pa_table = orc_file.read(columns=columns, **kwargs) if use_nullable_dtypes: dtype_backend = get_option("mode.dtype_backend") - if dtype_backend != "pyarrow": - raise NotImplementedError( - f"mode.dtype_backend set to {dtype_backend} is not implemented." + if dtype_backend == "pyarrow": + df = DataFrame( + { + col_name: ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } ) - df = DataFrame( - { - col_name: ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + else: + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() + df = pa_table.to_pandas(types_mapper=mapping.get) return df else: return pa_table.to_pandas() diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 568747685a36e..67e00dde5498b 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -225,24 +225,13 @@ def read( dtype_backend = get_option("mode.dtype_backend") to_pandas_kwargs = {} if use_nullable_dtypes: - import pandas as pd if dtype_backend == "pandas": - mapping = { - self.api.int8(): pd.Int8Dtype(), - self.api.int16(): pd.Int16Dtype(), - self.api.int32(): pd.Int32Dtype(), - self.api.int64(): pd.Int64Dtype(), - self.api.uint8(): pd.UInt8Dtype(), - self.api.uint16(): pd.UInt16Dtype(), - self.api.uint32(): pd.UInt32Dtype(), - self.api.uint64(): pd.UInt64Dtype(), - self.api.bool_(): pd.BooleanDtype(), - self.api.string(): pd.StringDtype(), - self.api.float32(): pd.Float32Dtype(), - self.api.float64(): pd.Float64Dtype(), - } + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping() to_pandas_kwargs["types_mapper"] = mapping.get + manager = get_option("mode.data_manager") if manager == "array": to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] diff --git a/pandas/io/xml.py b/pandas/io/xml.py index 4f61455826286..6ffa3356cc9de 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -46,10 +46,7 @@ if TYPE_CHECKING: from xml.etree.ElementTree import Element - from lxml.etree import ( - _Element, - _XSLTResultTree, - ) + from lxml import etree from pandas import DataFrame @@ -417,7 +414,7 @@ def _validate_names(self) -> None: def _parse_doc( self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] - ) -> Element | _Element: + ) -> Element | etree._Element: """ Build tree from path_or_buffer. @@ -625,7 +622,7 @@ def _validate_names(self) -> None: def _parse_doc( self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str] - ) -> _Element: + ) -> etree._Element: from lxml.etree import ( XMLParser, fromstring, @@ -656,7 +653,7 @@ def _parse_doc( return document - def _transform_doc(self) -> _XSLTResultTree: + def _transform_doc(self) -> etree._XSLTResultTree: """ Transform original tree using stylesheet. 
@@ -774,6 +771,7 @@ def _parse( iterparse: dict[str, list[str]] | None, compression: CompressionOptions, storage_options: StorageOptions, + use_nullable_dtypes: bool = False, **kwargs, ) -> DataFrame: """ @@ -843,6 +841,7 @@ def _parse( dtype=dtype, converters=converters, parse_dates=parse_dates, + use_nullable_dtypes=use_nullable_dtypes, **kwargs, ) @@ -869,6 +868,7 @@ def read_xml( iterparse: dict[str, list[str]] | None = None, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, + use_nullable_dtypes: bool = False, ) -> DataFrame: r""" Read XML document into a ``DataFrame`` object. @@ -980,6 +980,19 @@ def read_xml( {storage_options} + use_nullable_dtypes : bool = False + Whether or not to use nullable dtypes as default when reading data. If + set to True, nullable dtypes are used for all dtypes that have a nullable + implementation, even if no nulls are present. + + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.dtype_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + + .. versionadded:: 2.0 + Returns ------- df @@ -1113,4 +1126,5 @@ def read_xml( iterparse=iterparse, compression=compression, storage_options=storage_options, + use_nullable_dtypes=use_nullable_dtypes, ) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 1add485e03760..956390f739481 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -146,14 +146,23 @@ def _make_plot(self) -> None: kwds["label"] = self.columns kwds.pop("color") - y = reformat_hist_y_given_by(y, self.by) - # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array, # and each sub-array (10,) will be called in each iteration. 
If users only # provide 1D array, we assume the same weights is used for all iterations weights = kwds.get("weights", None) - if weights is not None and np.ndim(weights) != 1: - kwds["weights"] = weights[:, i] + if weights is not None: + if np.ndim(weights) != 1 and np.shape(weights)[-1] != 1: + try: + weights = weights[:, i] + except IndexError as err: + raise ValueError( + "weights must have the same shape as data, " + "or be a single column" + ) from err + weights = weights[~isna(y)] + kwds["weights"] = weights + + y = reformat_hist_y_given_by(y, self.by) artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 995b1668046d2..e448e1bce9146 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -250,7 +250,7 @@ class TestTesting(Base): ] def test_testing(self): - from pandas import testing # noqa: PDF015 + from pandas import testing self.check(testing, self.funcs) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 4fb63d3c4b97b..538110396e063 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1756,12 +1756,18 @@ def test_td64arr_floordiv_td64arr_with_nat( # columns without missing values expected[[0, 1]] = expected[[0, 1]].astype("int64") - result = left // right + with tm.maybe_produces_warning( + RuntimeWarning, box is pd.array, check_stacklevel=False + ): + result = left // right tm.assert_equal(result, expected) # case that goes through __rfloordiv__ with arraylike - result = np.asarray(left) // right + with tm.maybe_produces_warning( + RuntimeWarning, box is pd.array, check_stacklevel=False + ): + result = np.asarray(left) // right tm.assert_equal(result, expected) @pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning") diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 8e9112b531fad..b484dc39cf23b 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -67,7 +67,7 @@ def test_ufuncs_unary(ufunc): def test_ufunc_numeric(): - # np.sqrt on np.bool returns float16, which we upcast to Float32 + # np.sqrt on np.bool_ returns float16, which we upcast to Float32 # bc we do not have Float16 arr = pd.array([True, False, None], dtype="boolean") diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 878f1d8089d33..7deb5e50464d5 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -3,8 +3,10 @@ from pandas import ( DataFrame, + Index, MultiIndex, Series, + date_range, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -113,6 +115,53 @@ def test_rename_columns_modify_parent(using_copy_on_write): tm.assert_frame_equal(df2, df2_orig) +def test_pipe(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + df_orig = df.copy() + + def testfunc(df): + return df + + df2 = df.pipe(testfunc) + + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + expected = DataFrame({"a": [0, 2, 3], "b": 1.5}) + tm.assert_frame_equal(df, expected) + + assert 
np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + +def test_pipe_modify_df(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": 1.5}) + df_orig = df.copy() + + def testfunc(df): + df.iloc[0, 0] = 100 + return df + + df2 = df.pipe(testfunc) + + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + expected = DataFrame({"a": [100, 2, 3], "b": 1.5}) + tm.assert_frame_equal(df, expected) + + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + + def test_reindex_columns(using_copy_on_write): # Case: reindexing the column returns a new dataframe # + afterwards modifying the result @@ -172,6 +221,27 @@ def test_select_dtypes(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def test_pop(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + view_original = df[:] + result = df.pop("a") + + assert np.shares_memory(result.values, get_array(view_original, "a")) + assert np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) + + if using_copy_on_write: + result.iloc[0] = 0 + assert not np.shares_memory(result.values, get_array(view_original, "a")) + df.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b")) + tm.assert_frame_equal(view_original, df_orig) + else: + expected = DataFrame({"a": [1, 2, 3], "b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(view_original, expected) + + @pytest.mark.parametrize( "func", [ @@ -369,6 +439,30 @@ def test_head_tail(method, using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize( + "kwargs", + [ + {"before": "a", "after": "b", "axis": 1}, + {"before": 0, "after": 1, "axis": 0}, + ], +) +def test_truncate(using_copy_on_write, kwargs): + df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}) + df_orig = df.copy() + df2 = df.truncate(**kwargs) + df2._mgr._verify_integrity() + + if using_copy_on_write: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize("method", ["assign", "drop_duplicates"]) def test_assign_drop_duplicates(using_copy_on_write, method): df = DataFrame({"a": [1, 2, 3]}) @@ -405,6 +499,23 @@ def test_reindex_like(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def test_sort_index(using_copy_on_write): + # GH 49473 + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + ser2 = ser.sort_index() + + if using_copy_on_write: + assert np.shares_memory(ser.values, ser2.values) + else: + assert not np.shares_memory(ser.values, ser2.values) + + # mutating ser triggers a copy-on-write for the column / block + ser2.iloc[0] = 0 + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + def test_reorder_levels(using_copy_on_write): index = MultiIndex.from_tuples( [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] @@ -424,6 +535,43 @@ def test_reorder_levels(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def 
test_series_reorder_levels(using_copy_on_write): + index = MultiIndex.from_tuples( + [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"] + ) + ser = Series([1, 2, 3, 4], index=index) + ser_orig = ser.copy() + ser2 = ser.reorder_levels(order=["two", "one"]) + + if using_copy_on_write: + assert np.shares_memory(ser2.values, ser.values) + else: + assert not np.shares_memory(ser2.values, ser.values) + + ser2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + +@pytest.mark.parametrize("obj", [Series([1, 2, 3]), DataFrame({"a": [1, 2, 3]})]) +def test_swaplevel(using_copy_on_write, obj): + index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"]) + obj.index = index + obj_orig = obj.copy() + obj2 = obj.swaplevel() + + if using_copy_on_write: + assert np.shares_memory(obj2.values, obj.values) + else: + assert not np.shares_memory(obj2.values, obj.values) + + obj2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(obj2.values, obj.values) + tm.assert_equal(obj, obj_orig) + + def test_frame_set_axis(using_copy_on_write): # GH 49473 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -456,3 +604,99 @@ def test_series_set_axis(using_copy_on_write): ser2.iloc[0] = 0 assert not np.shares_memory(ser2, ser) tm.assert_series_equal(ser, ser_orig) + + +def test_set_flags(using_copy_on_write): + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + ser2 = ser.set_flags(allows_duplicate_labels=False) + + assert np.shares_memory(ser, ser2) + + # mutating ser triggers a copy-on-write for the column / block + ser2.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(ser2, ser) + tm.assert_series_equal(ser, ser_orig) + else: + assert np.shares_memory(ser2, ser) + expected = Series([0, 2, 3]) + tm.assert_series_equal(ser, expected) + + +@pytest.mark.parametrize("copy_kwargs", [{"copy": True}, {}]) +@pytest.mark.parametrize("kwargs", [{"mapper": "test"}, {"index": "test"}]) +def test_rename_axis(using_copy_on_write, kwargs, copy_kwargs): + df = DataFrame({"a": [1, 2, 3, 4]}, index=Index([1, 2, 3, 4], name="a")) + df_orig = df.copy() + df2 = df.rename_axis(**kwargs, **copy_kwargs) + + if using_copy_on_write and not copy_kwargs: + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "func, tz", [("tz_convert", "Europe/Berlin"), ("tz_localize", None)] +) +def test_tz_convert_localize(using_copy_on_write, func, tz): + # GH 49473 + ser = Series( + [1, 2], index=date_range(start="2014-08-01 09:00", freq="H", periods=2, tz=tz) + ) + ser_orig = ser.copy() + ser2 = getattr(ser, func)("US/Central") + + if using_copy_on_write: + assert np.shares_memory(ser.values, ser2.values) + else: + assert not np.shares_memory(ser.values, ser2.values) + + # mutating ser triggers a copy-on-write for the column / block + ser2.iloc[0] = 0 + assert not np.shares_memory(ser2.values, ser.values) + tm.assert_series_equal(ser, ser_orig) + + +def test_droplevel(using_copy_on_write): + # GH 49473 + index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"]) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, index=index) + df_orig = df.copy() + df2 = df.droplevel(0) + + if 
using_copy_on_write: + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + else: + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 0] = 0 + + assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + tm.assert_frame_equal(df, df_orig) + + +def test_squeeze(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + series = df.squeeze() + + # Should share memory regardless of CoW since squeeze is just an iloc + assert np.shares_memory(series.values, get_array(df, "a")) + + # mutating squeezed df triggers a copy-on-write for that column/block + series.iloc[0] = 0 + if using_copy_on_write: + assert not np.shares_memory(series.values, get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + else: + # Without CoW the original will be modified + assert np.shares_memory(series.values, get_array(df, "a")) + assert df.loc[0, "a"] == 0 diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index c08514900af7c..ce900ff649eec 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -556,6 +556,24 @@ def test_is_numeric_dtype(): assert com.is_numeric_dtype(pd.Series([1, 2])) assert com.is_numeric_dtype(pd.Index([1, 2.0])) + class MyNumericDType(ExtensionDtype): + @property + def type(self): + return str + + @property + def name(self): + raise NotImplementedError + + @classmethod + def construct_array_type(cls): + raise NotImplementedError + + def _is_numeric(self) -> bool: + return True + + assert com.is_numeric_dtype(MyNumericDType()) + def test_is_float_dtype(): assert not com.is_float_dtype(str) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index f52abb9349578..73445a96f4a03 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -418,3 +418,11 @@ def test_setitem_invalid(self, data, invalid_scalar): with pytest.raises((ValueError, TypeError), match=msg): data[:] = invalid_scalar + + def test_setitem_2d_values(self, data): + # GH50085 + original = data.copy() + df = pd.DataFrame({"a": data, "b": data}) + df.loc[[0, 1], :] = df.loc[[1, 0], :].values + assert (df.loc[0, :] == original[1]).all() + assert (df.loc[1, :] == original[0]).all() diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c1785591f41a9..8bb82bf644680 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -37,7 +37,10 @@ import pandas as pd import pandas._testing as tm -from pandas.api.types import is_bool_dtype +from pandas.api.types import ( + is_bool_dtype, + is_numeric_dtype, +) from pandas.tests.extension import base pa = pytest.importorskip("pyarrow", minversion="1.0.1") @@ -550,16 +553,6 @@ def test_groupby_extension_apply( ): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) - def test_in_numeric_groupby(self, data_for_grouping, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - reason="ArrowExtensionArray doesn't support .sum() yet.", - ) - ) - super().test_in_numeric_groupby(data_for_grouping) - @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype @@ -1446,6 
+1439,19 @@ def test_is_bool_dtype():
     tm.assert_series_equal(result, expected)


+def test_is_numeric_dtype(data):
+    # GH 50563
+    pa_type = data.dtype.pyarrow_dtype
+    if (
+        pa.types.is_floating(pa_type)
+        or pa.types.is_integer(pa_type)
+        or pa.types.is_decimal(pa_type)
+    ):
+        assert is_numeric_dtype(data)
+    else:
+        assert not is_numeric_dtype(data)
+
+
 def test_pickle_roundtrip(data):
     # GH 42600
     expected = pd.Series(data)
@@ -1553,3 +1559,20 @@ def test_round():
     result = ser.round(-1)
     expected = pd.Series([120.0, pd.NA, 60.0], dtype=dtype)
     tm.assert_series_equal(result, expected)
+
+
+def test_searchsorted_with_na_raises(data_for_sorting, as_series):
+    # GH50447
+    b, c, a = data_for_sorting
+    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+    arr[-1] = pd.NA
+
+    if as_series:
+        arr = pd.Series(arr)
+
+    msg = (
+        "searchsorted requires array to be sorted, "
+        "which is impossible with NAs present."
+    )
+    with pytest.raises(ValueError, match=msg):
+        arr.searchsorted(b)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index de7967a8578b5..3e865947aa968 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -420,3 +420,20 @@ def arrow_not_supported(self, data, request):
                 reason="2D support not implemented for ArrowStringArray"
             )
             request.node.add_marker(mark)
+
+
+def test_searchsorted_with_na_raises(data_for_sorting, as_series):
+    # GH50447
+    b, c, a = data_for_sorting
+    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+    arr[-1] = pd.NA
+
+    if as_series:
+        arr = pd.Series(arr)
+
+    msg = (
+        "searchsorted requires array to be sorted, "
+        "which is impossible with NAs present."
+    )
+    with pytest.raises(ValueError, match=msg):
+        arr.searchsorted(b)
diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py
index e8a49ab868425..23458b096a140 100644
--- a/pandas/tests/frame/indexing/test_mask.py
+++ b/pandas/tests/frame/indexing/test_mask.py
@@ -7,6 +7,7 @@
 from pandas import (
     NA,
     DataFrame,
+    Float64Dtype,
     Series,
     StringDtype,
     Timedelta,
@@ -130,3 +131,13 @@ def test_mask_where_dtype_timedelta():
         [np.nan, np.nan, np.nan, Timedelta("3 day"), Timedelta("4 day")]
     )
     tm.assert_frame_equal(df.where(df > Timedelta(2, unit="d")), expected)
+
+
+def test_mask_return_dtype():
+    # GH#50488
+    ser = Series([0.0, 1.0, 2.0, 3.0], dtype=Float64Dtype())
+    cond = ~ser.isna()
+    other = Series([True, False, True, False])
+    expected = Series([1.0, 0.0, 1.0, 0.0], dtype=ser.dtype)
+    result = ser.mask(cond, other)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
index f65f3a311b403..e37c881472b65 100644
--- a/pandas/tests/frame/indexing/test_where.py
+++ b/pandas/tests/frame/indexing/test_where.py
@@ -550,7 +550,8 @@ def test_where_axis_multiple_dtypes(self):

         # DataFrame vs DataFrame
         d1 = df.copy().drop(1, axis=0)
-        expected = df.copy()
+        # Explicit cast to avoid implicit cast when setting value to np.nan
+        expected = df.copy().astype("float")
         expected.loc[1, :] = np.nan

         result = df.where(mask, d1)
@@ -669,7 +670,8 @@ def test_where_categorical_filtering(self):
         df["b"] = df["b"].astype("category")

         result = df.where(df["a"] > 0)
-        expected = df.copy()
+        # Explicitly cast to 'float' to avoid implicit cast when setting np.nan
+        expected = df.copy().astype({"a": "float"})
         expected.loc[0, :] = np.nan

         tm.assert_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_asfreq.py 
b/pandas/tests/frame/methods/test_asfreq.py index 07eacb5e89e3a..5b3e1614e1ada 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -159,7 +159,8 @@ def test_asfreq_fillvalue(self): # setup rng = date_range("1/1/2016", periods=10, freq="2S") - ts = Series(np.arange(len(rng)), index=rng) + # Explicit cast to 'float' to avoid implicit cast when setting None + ts = Series(np.arange(len(rng)), index=rng, dtype="float") df = DataFrame({"one": ts}) # insert pre-existing missing value diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 0b27fe591f794..a08f8bf5f502e 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -29,7 +29,8 @@ def date_range_frame(): class TestFrameAsof: def test_basic(self, date_range_frame): - df = date_range_frame + # Explicitly cast to float to avoid implicit cast when setting np.nan + df = date_range_frame.astype({"A": "float"}) N = 50 df.loc[df.index[15:30], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") @@ -50,7 +51,8 @@ def test_basic(self, date_range_frame): def test_subset(self, date_range_frame): N = 10 - df = date_range_frame.iloc[:N].copy() + # explicitly cast to float to avoid implicit upcast when setting to np.nan + df = date_range_frame.iloc[:N].copy().astype({"A": "float"}) df.loc[df.index[4:8], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") @@ -163,7 +165,7 @@ def test_time_zone_aware_index(self, stamp, expected): def test_is_copy(self, date_range_frame): # GH-27357, GH-30784: ensure the result of asof is an actual copy and # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings - df = date_range_frame + df = date_range_frame.astype({"A": "float"}) N = 50 df.loc[df.index[15:30], "A"] = np.nan dates = date_range("1/1/1990", periods=N * 3, freq="25s") diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index ad6122501bc19..f7da28a43590d 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -398,11 +398,7 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): pa_version_under7p0 and nullable_string_dtype == "string[pyarrow]", ): df2.set_index(["a", "b"], inplace=True) - with tm.maybe_produces_warning( - PerformanceWarning, - pa_version_under7p0 and nullable_string_dtype == "string[pyarrow]", - ): - result = df.combine_first(df2) + result = df.combine_first(df2) with tm.maybe_produces_warning( PerformanceWarning, pa_version_under7p0 and nullable_string_dtype == "string[pyarrow]", diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index dddd6c6d2eaf2..beec3e965d542 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -36,7 +36,8 @@ def test_equals(self): df1["start"] = date_range("2000-1-1", periods=10, freq="T") df1["end"] = date_range("2000-1-1", periods=10, freq="D") df1["diff"] = df1["end"] - df1["start"] - df1["bool"] = np.arange(10) % 3 == 0 + # Explicitly cast to object, to avoid implicit cast when setting np.nan + df1["bool"] = (np.arange(10) % 3 == 0).astype(object) df1.loc[::2] = np.nan df2 = df1.copy() assert df1["text"].equals(df2["text"]) diff --git a/pandas/tests/frame/methods/test_isetitem.py b/pandas/tests/frame/methods/test_isetitem.py new file mode 100644 index 
0000000000000..59328aafefefb --- /dev/null +++ b/pandas/tests/frame/methods/test_isetitem.py @@ -0,0 +1,37 @@ +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm + + +class TestDataFrameSetItem: + def test_isetitem_ea_df(self): + # GH#49922 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + rhs = DataFrame([[11, 12], [13, 14]], dtype="Int64") + + df.isetitem([0, 1], rhs) + expected = DataFrame( + { + 0: Series([11, 13], dtype="Int64"), + 1: Series([12, 14], dtype="Int64"), + 2: [3, 6], + } + ) + tm.assert_frame_equal(df, expected) + + def test_isetitem_ea_df_scalar_indexer(self): + # GH#49922 + df = DataFrame([[1, 2, 3], [4, 5, 6]]) + rhs = DataFrame([[11], [13]], dtype="Int64") + + df.isetitem(2, rhs) + expected = DataFrame( + { + 0: [1, 4], + 1: [2, 5], + 2: Series([11, 13], dtype="Int64"), + } + ) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index bfee3edc085d8..21f0664707ebe 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -66,12 +66,6 @@ def test_truncate(self, datetime_frame, frame_or_series): before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq ) - def test_truncate_copy(self, datetime_frame): - index = datetime_frame.index - truncated = datetime_frame.truncate(index[5], index[10]) - truncated.values[:] = 5.0 - assert not (datetime_frame.values[5:11] == 5).any() - def test_truncate_nonsortedindex(self, frame_or_series): # GH#17935 diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index e81837898c927..159dab04e7da6 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -448,7 +448,8 @@ def test_date_index_query(self): def test_date_index_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 - df = DataFrame(np.random.randn(n, 3)) + # Cast to object to avoid implicit cast when setting entry to pd.NaT below + df = DataFrame(np.random.randn(n, 3)).astype({0: object}) df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT @@ -808,7 +809,8 @@ def test_date_index_query(self): def test_date_index_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 - df = DataFrame(np.random.randn(n, 3)) + # Cast to object to avoid implicit cast when setting entry to pd.NaT below + df = DataFrame(np.random.randn(n, 3)).astype({0: object}) df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index a3cd3e4afdda1..2e0aa5fd0cf40 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -449,10 +449,14 @@ def test_var_std(self, datetime_frame): def test_numeric_only_flag(self, meth): # GH 9201 df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) + # Cast to object to avoid implicit cast when setting entry to "100" below + df1 = df1.astype({"foo": object}) # set one entry to a number in str format df1.loc[0, "foo"] = "100" df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) + # Cast to object to avoid implicit cast when setting entry to "a" below + df2 = df2.astype({"foo": object}) # set one entry to a non-number str df2.loc[0, "foo"] = "a" diff --git a/pandas/tests/frame/test_stack_unstack.py 
b/pandas/tests/frame/test_stack_unstack.py
index 1b6b158cc61f5..2be2a052401ed 100644
--- a/pandas/tests/frame/test_stack_unstack.py
+++ b/pandas/tests/frame/test_stack_unstack.py
@@ -857,6 +857,8 @@ def cast(val):
     def test_unstack_nan_index2(self):
         # GH7403
         df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
+        # Explicit cast to avoid implicit cast when setting to np.NaN
+        df = df.astype({"B": "float"})
         df.iloc[3, 1] = np.NaN

         left = df.set_index(["A", "B"]).unstack(0)
@@ -874,6 +876,8 @@ def test_unstack_nan_index2(self):
         tm.assert_frame_equal(left, right)

         df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
+        # Explicit cast to avoid implicit cast when setting to np.NaN
+        df = df.astype({"B": "float"})
         df.iloc[2, 1] = np.NaN

         left = df.set_index(["A", "B"]).unstack(0)
@@ -886,6 +890,8 @@ def test_unstack_nan_index2(self):
         tm.assert_frame_equal(left, right)

         df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
+        # Explicit cast to avoid implicit cast when setting to np.NaN
+        df = df.astype({"B": "float"})
         df.iloc[3, 1] = np.NaN

         left = df.set_index(["A", "B"]).unstack(0)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 40ae37cfaba2d..461ae05aedb82 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -210,6 +210,21 @@ def test_aggregate_str_func(tsframe, groupbyfunc):
     tm.assert_frame_equal(result, expected)


+def test_std_masked_dtype(any_numeric_ea_dtype):
+    # GH#35516
+    df = DataFrame(
+        {
+            "a": [2, 1, 1, 1, 2, 2, 1],
+            "b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype=any_numeric_ea_dtype),
+        }
+    )
+    result = df.groupby("a").std()
+    expected = DataFrame(
+        {"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64"
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
     gb = df.groupby(level=0)
     if reduction_func in ("idxmax", "idxmin"):
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 8255fbab40dce..56aa121cd48c2 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -4,9 +4,11 @@
 from pandas import (
     CategoricalIndex,
     DataFrame,
+    Grouper,
     Index,
     MultiIndex,
     Series,
+    to_datetime,
 )
 import pandas._testing as tm

@@ -781,3 +783,39 @@ def test_subset_duplicate_columns():
         ),
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("utc", [True, False])
+def test_value_counts_time_grouper(utc):
+    # GH#50486
+    df = DataFrame(
+        {
+            "Timestamp": [
+                1565083561,
+                1565083561 + 86400,
+                1565083561 + 86500,
+                1565083561 + 86400 * 2,
+                1565083561 + 86400 * 3,
+                1565083561 + 86500 * 3,
+                1565083561 + 86400 * 4,
+            ],
+            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
+        }
+    ).drop([3])
+
+    df["Datetime"] = to_datetime(
+        df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s"
+    )
+    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
+    result = gb.value_counts()
+    dates = to_datetime(
+        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
+    )
+    timestamps = df["Timestamp"].unique()
+    index = MultiIndex(
+        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
+        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
+        names=["Datetime", "Timestamp", "Food"],
+    )
+    expected = Series(1, index=index)
+    tm.assert_series_equal(result, expected)
diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b2fc60b76fdf6..a59c2853fa50b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2828,3 +2828,13 @@ def test_groupby_index_name_in_index_content(val_in, index, val_out): result = series.to_frame().groupby("blah").sum() expected = expected.to_frame() tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("n", [1, 10, 32, 100, 1000]) +def test_sum_of_booleans(n): + # GH 50347 + df = DataFrame({"groupby_col": 1, "bool": [True] * n}) + df["bool"] = df["bool"].eq(True) + result = df.groupby("groupby_col").sum() + expected = DataFrame({"bool": [n]}, index=Index([1], name="groupby_col")) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 4a707d8875db3..f16cf4dd27016 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -103,6 +103,8 @@ def test_groupby_with_timegrouper(self): "20130901", "20131205", freq="5D", name="Date", inclusive="left" ), ) + # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" + expected = expected.astype({"Buyer": object}) expected.iloc[0, 0] = "CarlCarlCarl" expected.iloc[6, 0] = "CarlCarl" expected.iloc[18, 0] = "Joe" diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 577a72d3f5090..1c4e83abc55f7 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -56,6 +56,8 @@ def seed_df(seed_nans, n, m): ) if seed_nans: + # Explicitly cast to float to avoid implicit cast when setting nan + frame["3rd"] = frame["3rd"].astype("float") frame.loc[1::11, "1st"] = np.nan frame.loc[3::17, "2nd"] = np.nan frame.loc[7::19, "3rd"] = np.nan @@ -114,7 +116,8 @@ def rebuild_index(df): tm.assert_series_equal(left.sort_index(), right.sort_index()) -def test_series_groupby_value_counts_with_grouper(): +@pytest.mark.parametrize("utc", [True, False]) +def test_series_groupby_value_counts_with_grouper(utc): # GH28479 df = DataFrame( { @@ -131,7 +134,9 @@ def test_series_groupby_value_counts_with_grouper(): } ).drop([3]) - df["Datetime"] = to_datetime(df["Timestamp"].apply(lambda t: str(t)), unit="s") + df["Datetime"] = to_datetime( + df["Timestamp"].apply(lambda t: str(t)), utc=utc, unit="s" + ) dfg = df.groupby(Grouper(freq="1D", key="Datetime")) # have to sort on index because of unstable sort on values xref GH9212 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 4c6f172b00a58..8f1d52c2ea03d 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -185,8 +185,6 @@ def test_transform_axis_1_reducer(request, reduction_func): # GH#45715 if reduction_func in ( "corrwith", - "idxmax", - "idxmin", "ngroup", "nth", ): diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index f962a552d9009..e1ada9f10c261 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -880,15 +880,8 @@ def test_constructor_with_ambiguous_keyword_arg(self): result = date_range(end=end, periods=2, ambiguous=False) tm.assert_index_equal(result, expected) - def test_constructor_with_nonexistent_keyword_arg(self, warsaw, request): + def 
test_constructor_with_nonexistent_keyword_arg(self, warsaw): # GH 35297 - if type(warsaw).__name__ == "ZoneInfo": - mark = pytest.mark.xfail( - reason="nonexistent-shift not yet implemented for ZoneInfo", - raises=NotImplementedError, - ) - request.node.add_marker(mark) - timezone = warsaw # nonexistent keyword in start diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index f5f58e7e818d9..8e507212976ec 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -151,57 +151,35 @@ def test_rename_multiindex_with_duplicates(self): expected = DataFrame(index=mi2) tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "data_result, data_expected", - [ - ( - [ - [(81.0, np.nan), (np.nan, np.nan)], - [(np.nan, np.nan), (82.0, np.nan)], - [1, 2], - [1, 2], - ], - [ - [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])], - [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])], - [1, np.nan, 2], - [np.nan, 2, 1], - ], - ), - ( - [ - [(81.0, np.nan), (np.nan, np.nan)], - [(np.nan, np.nan), (81.0, np.nan)], - [1, 2], - [1, 2], - ], - [ - [[81.0, np.nan], Series([np.nan, np.nan])], - [[81.0, np.nan], Series([np.nan, np.nan])], - [1, 2], - [2, 1], - ], - ), - ], - ) - def test_subtracting_two_series_with_unordered_index_and_all_nan_index( - self, data_result, data_expected - ): + def test_series_align_multiindex_with_nan_overlap_only(self): + # GH 38439 + mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) + mi2 = MultiIndex.from_arrays([[np.nan, 82.0], [np.nan, np.nan]]) + ser1 = Series([1, 2], index=mi1) + ser2 = Series([1, 2], index=mi2) + result1, result2 = ser1.align(ser2) + + mi = MultiIndex.from_arrays([[81.0, 82.0, np.nan], [np.nan, np.nan, np.nan]]) + expected1 = Series([1.0, np.nan, 2.0], index=mi) + expected2 = Series([np.nan, 2.0, 1.0], index=mi) + + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) + + def test_series_align_multiindex_with_nan(self): # GH 38439 - # TODO: Refactor. 
This is impossible to understand GH#49443 - a_index_result = MultiIndex.from_tuples(data_result[0]) - b_index_result = MultiIndex.from_tuples(data_result[1]) - a_series_result = Series(data_result[2], index=a_index_result) - b_series_result = Series(data_result[3], index=b_index_result) - result = a_series_result.align(b_series_result) - - a_index_expected = MultiIndex.from_arrays(data_expected[0]) - b_index_expected = MultiIndex.from_arrays(data_expected[1]) - a_series_expected = Series(data_expected[2], index=a_index_expected) - b_series_expected = Series(data_expected[3], index=b_index_expected) - - tm.assert_series_equal(result[0], a_series_expected) - tm.assert_series_equal(result[1], b_series_expected) + mi1 = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) + mi2 = MultiIndex.from_arrays([[np.nan, 81.0], [np.nan, np.nan]]) + ser1 = Series([1, 2], index=mi1) + ser2 = Series([1, 2], index=mi2) + result1, result2 = ser1.align(ser2) + + mi = MultiIndex.from_arrays([[81.0, np.nan], [np.nan, np.nan]]) + expected1 = Series([1, 2], index=mi) + expected2 = Series([2, 1], index=mi) + + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) def test_nunique_smoke(self): # GH 34019 diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 32ab0336aa93f..46fb614d96633 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -707,13 +707,13 @@ def test_applymap_subset_multiindex(self, slice_): and isinstance(slice_[-1][-1], list) and "C" in slice_[-1][-1] ): - ctx = pytest.raises(KeyError, match="C") # noqa: PDF010 + ctx = pytest.raises(KeyError, match="C") elif ( isinstance(slice_[0], tuple) and isinstance(slice_[0][1], list) and 3 in slice_[0][1] ): - ctx = pytest.raises(KeyError, match="3") # noqa: PDF010 + ctx = pytest.raises(KeyError, match="3") else: ctx = contextlib.nullcontext() diff --git a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py index cf521aafdc241..d9232a6bddf61 100644 --- a/pandas/tests/io/json/test_json_table_schema_ext_dtype.py +++ b/pandas/tests/io/json/test_json_table_schema_ext_dtype.py @@ -48,7 +48,7 @@ def test_build_table_schema(self): "fields": [ {"name": "index", "type": "integer"}, {"name": "A", "type": "any", "extDtype": "DateDtype"}, - {"name": "B", "type": "any", "extDtype": "decimal"}, + {"name": "B", "type": "number", "extDtype": "decimal"}, {"name": "C", "type": "any", "extDtype": "string"}, {"name": "D", "type": "integer", "extDtype": "Int64"}, ], @@ -82,10 +82,10 @@ def test_as_json_table_type_ext_date_dtype(self): ], ) def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data): - assert as_json_table_type(decimal_data.dtype) == "any" + assert as_json_table_type(decimal_data.dtype) == "number" def test_as_json_table_type_ext_decimal_dtype(self): - assert as_json_table_type(DecimalDtype()) == "any" + assert as_json_table_type(DecimalDtype()) == "number" @pytest.mark.parametrize( "string_data", @@ -180,7 +180,7 @@ def test_build_decimal_series(self, dc): fields = [ {"name": "id", "type": "integer"}, - {"name": "a", "type": "any", "extDtype": "decimal"}, + {"name": "a", "type": "number", "extDtype": "decimal"}, ] schema = {"fields": fields, "primaryKey": ["id"]} @@ -257,7 +257,7 @@ def test_to_json(self, df): fields = [ OrderedDict({"name": "idx", "type": "integer"}), OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}), - 
OrderedDict({"name": "B", "type": "any", "extDtype": "decimal"}), + OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}), OrderedDict({"name": "C", "type": "any", "extDtype": "string"}), OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}), ] diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index ee9314c8779dd..6c05f5defe4fb 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -2052,7 +2052,7 @@ def test_parse_dot_separated_dates(all_parsers): name="a", ) warn = UserWarning - msg = "when dayfirst=False was specified" + msg = r"when dayfirst=False \(the default\) was specified" result = parser.read_csv_check_warnings( warn, msg, StringIO(data), parse_dates=True, index_col=0 ) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 53be06cd491ef..5c7c4f9ce0b75 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -349,7 +349,7 @@ def test_index_types(setup_path): _check_roundtrip(ser, func, path=setup_path) -def test_timeseries_preepoch(setup_path): +def test_timeseries_preepoch(setup_path, request): dr = bdate_range("1/1/1940", "1/1/1960") ts = Series(np.random.randn(len(dr)), index=dr) @@ -357,9 +357,10 @@ def test_timeseries_preepoch(setup_path): _check_roundtrip(ts, tm.assert_series_equal, path=setup_path) except OverflowError: if is_platform_windows(): - pytest.xfail("known failure on some windows platforms") - else: - raise + request.node.add_marker( + pytest.mark.xfail("known failure on some windows platforms") + ) + raise @pytest.mark.parametrize( diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 1263d61b55cd5..2664c7df59223 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -911,7 +911,7 @@ def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs): os.close(fd) except (OSError, ValueError): pass - os.remove(new_f) # noqa: PDF008 + os.remove(new_f) # new table df = tm.makeDataFrame() diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f74d268690a5b..2172a4bf839fb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -317,7 +317,7 @@ def test_read_expands_user_home_dir( ), ], ) - @pytest.mark.filterwarnings( # pytables np.object usage + @pytest.mark.filterwarnings( # pytables np.object_ usage "ignore:`np.object` is a deprecated alias:DeprecationWarning" ) def test_read_fspath_all(self, reader, module, path, datapath): @@ -372,7 +372,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.filterwarnings( # pytables np.object usage + @pytest.mark.filterwarnings( # pytables np.object_ usage "ignore:`np.object` is a deprecated alias:DeprecationWarning" ) def test_write_fspath_hdf5(self): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 782753177f245..fc15ff3488ce9 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -19,7 +19,7 @@ import pandas.io.common as icom _compression_to_extension = { - value: key for key, value in icom._extension_to_compression.items() + value: key for key, value in icom.extension_to_compression.items() } diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 
87f648bb5acd6..d5c03dcc85a0d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import read_orc import pandas._testing as tm +from pandas.core.arrays import StringArray pytest.importorskip("pyarrow.orc") @@ -305,16 +306,6 @@ def test_orc_writer_dtypes_not_supported(df_not_supported): df_not_supported.to_orc() -def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): - input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") - with pytest.raises( - NotImplementedError, - match="mode.dtype_backend set to pandas is not implemented.", - ): - with pd.option_context("mode.dtype_backend", "pandas"): - read_orc(input_file, use_nullable_dtypes=True) - - @td.skip_if_no("pyarrow", min_version="7.0.0") def test_orc_use_nullable_dtypes_pyarrow_backend(): df = pd.DataFrame( @@ -336,13 +327,60 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): ], } ) + bytes_data = df.copy().to_orc() with pd.option_context("mode.dtype_backend", "pyarrow"): result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + expected = pd.DataFrame( { col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True)) for col in df.columns } ) + + tm.assert_frame_equal(result, expected) + + +@td.skip_if_no("pyarrow", min_version="7.0.0") +def test_orc_use_nullable_dtypes_pandas_backend(): + # GH#50503 + df = pd.DataFrame( + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "int": list(range(1, 4)), + "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), + "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "bool_with_na": [True, False, None], + } + ) + + bytes_data = df.copy().to_orc() + with pd.option_context("mode.dtype_backend", "pandas"): + result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + + expected = pd.DataFrame( + { + "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)), + "string_with_nan": StringArray( + np.array(["a", pd.NA, "c"], dtype=np.object_) + ), + "string_with_none": StringArray( + np.array(["a", pd.NA, "c"], dtype=np.object_) + ), + "int": pd.Series([1, 2, 3], dtype="Int64"), + "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"), + "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"), + "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"), + "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"), + "bool": pd.Series([True, False, True], dtype="boolean"), + "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3dafe6fe61b35..07a028a19d7f9 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -37,8 +37,8 @@ get_lzma_file, is_platform_little_endian, ) -from pandas.compat._compressors import flatten_buffer from pandas.compat._optional import import_optional_dependency +from pandas.compat.compressors import flatten_buffer import pandas.util._test_decorators as td import pandas as pd @@ -255,7 +255,7 @@ def get_random_path(): class TestCompression: - _extension_to_compression = icom._extension_to_compression + _extension_to_compression = icom.extension_to_compression def compress_file(self, src_path, dest_path, compression): if compression is None: diff --git a/pandas/tests/io/xml/test_xml.py 
b/pandas/tests/io/xml/test_xml.py
index aeaf2d3b7edbf..d65b9b8af4365 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -21,8 +21,17 @@
 )
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame
+import pandas as pd
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+)
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 from pandas.io.common import get_handle
 from pandas.io.xml import read_xml
@@ -1702,3 +1711,74 @@ def test_s3_parser_consistency():
     )
 
     tm.assert_frame_equal(df_lxml, df_etree)
+
+
+@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend):
+    # GH#50500
+    if string_storage == "pyarrow" or dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+    data = """<?xml version='1.0' encoding='utf-8'?>
+<data>
+  <row>
+    <a>x</a>
+    <b>1</b>
+    <c>4.0</c>
+    <d>x</d>
+    <e>2</e>
+    <f>4.0</f>
+    <g></g>
+    <h>True</h>
+    <i>False</i>
+  </row>
+  <row>
+    <a>y</a>
+    <b>2</b>
+    <c>5.0</c>
+    <d></d>
+    <e></e>
+    <f></f>
+    <g></g>
+    <h>False</h>
+    <i></i>
+  </row>
+</data>"""
+
+    if string_storage == "python":
+        string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
+        string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
+
+    else:
+        string_array = ArrowStringArray(pa.array(["x", "y"]))
+        string_array_na = ArrowStringArray(pa.array(["x", None]))
+
+    with pd.option_context("mode.string_storage", string_storage):
+        with pd.option_context("mode.dtype_backend", dtype_backend):
+            result = read_xml(data, parser=parser, use_nullable_dtypes=True)
+
+    expected = DataFrame(
+        {
+            "a": string_array,
+            "b": Series([1, 2], dtype="Int64"),
+            "c": Series([4.0, 5.0], dtype="Float64"),
+            "d": string_array_na,
+            "e": Series([2, NA], dtype="Int64"),
+            "f": Series([4.0, NA], dtype="Float64"),
+            "g": Series([NA, NA], dtype="Int64"),
+            "h": Series([True, False], dtype="boolean"),
+            "i": Series([False, NA], dtype="boolean"),
+        }
+    )
+
+    if dtype_backend == "pyarrow":
+        from pandas.arrays import ArrowExtensionArray
+
+        expected = DataFrame(
+            {
+                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                for col in expected.columns
+            }
+        )
+        expected["g"] = ArrowExtensionArray(pa.array([None, None]))
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py
index fd7c47d47112f..e352250dc748d 100644
--- a/pandas/tests/libs/test_lib.py
+++ b/pandas/tests/libs/test_lib.py
@@ -243,6 +243,27 @@ def test_get_reverse_indexer(self):
         expected = np.array([4, 2, 3, 6, 7], dtype=np.intp)
         tm.assert_numpy_array_equal(result, expected)
 
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_array_equal_fast(self, dtype):
+        # GH#50592
+        left = np.arange(1, 100, dtype=dtype)
+        right = np.arange(1, 100, dtype=dtype)
+        assert lib.array_equal_fast(left, right)
+
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_array_equal_fast_not_equal(self, dtype):
+        # GH#50592
+        left = np.array([1, 2], dtype=dtype)
+        right = np.array([2, 2], dtype=dtype)
+        assert not lib.array_equal_fast(left, right)
+
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_array_equal_fast_not_equal_shape(self, dtype):
+        # GH#50592
+        left = np.array([1, 2, 3], dtype=dtype)
+        right = np.array([2, 2], dtype=dtype)
+        assert not lib.array_equal_fast(left, right)
+
 
 def test_cache_readonly_preserve_docstrings():
     # GH18197
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
index 9d90f2e405803..797aae7eaba3a 100644
--- a/pandas/tests/plotting/test_hist_method.py
+++ b/pandas/tests/plotting/test_hist_method.py
@@ 
-560,6 +560,36 @@ def test_hist_secondary_legend(self): assert ax.get_yaxis().get_visible() tm.close() + @td.skip_if_no_mpl + def test_hist_with_nans_and_weights(self): + # GH 48884 + df = DataFrame( + [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]], + columns=list("abc"), + ) + weights = np.array([0.25, 0.3, 0.45]) + no_nan_df = DataFrame([[0.4, 0.2, 0.3], [0.7, 0.8, 0.9]], columns=list("abc")) + no_nan_weights = np.array([[0.3, 0.25, 0.25], [0.45, 0.45, 0.45]]) + + from matplotlib.patches import Rectangle + + _, ax0 = self.plt.subplots() + df.plot.hist(ax=ax0, weights=weights) + rects = [x for x in ax0.get_children() if isinstance(x, Rectangle)] + heights = [rect.get_height() for rect in rects] + _, ax1 = self.plt.subplots() + no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights) + no_nan_rects = [x for x in ax1.get_children() if isinstance(x, Rectangle)] + no_nan_heights = [rect.get_height() for rect in no_nan_rects] + assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights)) + + idxerror_weights = np.array([[0.3, 0.25], [0.45, 0.45]]) + + msg = "weights must have the same shape as data, or be a single column" + with pytest.raises(ValueError, match=msg): + _, ax2 = self.plt.subplots() + no_nan_df.plot.hist(ax=ax2, weights=idxerror_weights) + @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 0432cf397067d..a521e24aa6022 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -515,3 +515,25 @@ def test_resample_empty_Dataframe(keys): expected.index.name = keys[0] tm.assert_frame_equal(result, expected) + + +def test_groupby_resample_size_all_index_same(): + # GH 46826 + df = DataFrame( + {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)}, + index=date_range("31/12/2000 18:00", freq="H", periods=12), + ) + result = df.groupby("A").resample("D").size() + expected = Series( + 3, + index=pd.MultiIndex.from_tuples( + [ + (1, Timestamp("2000-12-31")), + (1, Timestamp("2001-01-01")), + (2, Timestamp("2000-12-31")), + (2, Timestamp("2001-01-01")), + ], + names=["A", None], + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 604429e7c8d78..4143e52bbb7ed 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -415,12 +415,13 @@ def test_constructor_fromordinal(self): nanosecond=1, tz="UTC", ), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, None), - Timestamp(2000, 1, 2, 3, 4, 5, 6, 1, pytz.UTC), + Timestamp(2000, 1, 2, 3, 4, 5, 6, None, nanosecond=1), + Timestamp(2000, 1, 2, 3, 4, 5, 6, tz=pytz.UTC, nanosecond=1), ], ) def test_constructor_nanosecond(self, result): # GH 18898 + # As of 2.0 (GH 49416), nanosecond should not be accepted positionally expected = Timestamp(datetime(2000, 1, 2, 3, 4, 5, 6), tz=result.tz) expected = expected + Timedelta(nanoseconds=1) assert result == expected diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index de41ea9021453..70f9f7c924844 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -7,6 +7,7 @@ timezone, ) import locale +import time import unicodedata from dateutil.tz import tzutc @@ -1095,3 +1096,11 @@ def test_delimited_date(): 
result = Timestamp("13-01-2000") expected = Timestamp(2000, 1, 13) assert result == expected + + +def test_utctimetuple(): + # GH 32174 + ts = Timestamp("2000-01-01", tz="UTC") + result = ts.utctimetuple() + expected = time.struct_time((2000, 1, 1, 0, 0, 0, 5, 1, 0)) + assert result == expected diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 59afe22e40f7a..18ad275083022 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -16,7 +16,8 @@ def test_replace_explicit_none(self): expected = pd.Series([0, 0, None], dtype=object) tm.assert_series_equal(result, expected) - df = pd.DataFrame(np.zeros((3, 3))) + # Cast column 2 to object to avoid implicit cast when setting entry to "" + df = pd.DataFrame(np.zeros((3, 3))).astype({2: object}) df.iloc[2, 2] = "" result = df.replace("", None) expected = pd.DataFrame( diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py new file mode 100644 index 0000000000000..487489e8c0b0c --- /dev/null +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -0,0 +1,17 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", ["int64", "float64"]) +def test_to_numpy_na_value(dtype): + # GH#48951 + ser = Series([1, 2, NA, 4]) + result = ser.to_numpy(dtype=dtype, na_value=0) + expected = np.array([1, 2, 0, 4], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/methods/test_tz_localize.py b/pandas/tests/series/methods/test_tz_localize.py index b71e7ed5500c3..6b096a7fcf3eb 100644 --- a/pandas/tests/series/methods/test_tz_localize.py +++ b/pandas/tests/series/methods/test_tz_localize.py @@ -64,6 +64,7 @@ def test_series_tz_localize_matching_index(self): "method, exp", [ ["shift_forward", "2015-03-29 03:00:00"], + ["shift_backward", "2015-03-29 01:59:59.999999999"], ["NaT", NaT], ["raise", None], ["foo", "invalid"], @@ -99,15 +100,6 @@ def test_tz_localize_nonexistent(self, warsaw, method, exp): with pytest.raises(ValueError, match=msg): df.tz_localize(tz, nonexistent=method) - elif method == "shift_forward" and type(tz).__name__ == "ZoneInfo": - msg = "nonexistent shifting is not implemented with ZoneInfo tzinfos" - with pytest.raises(NotImplementedError, match=msg): - ser.tz_localize(tz, nonexistent=method) - with pytest.raises(NotImplementedError, match=msg): - df.tz_localize(tz, nonexistent=method) - with pytest.raises(NotImplementedError, match=msg): - dti.tz_localize(tz, nonexistent=method) - else: result = ser.tz_localize(tz, nonexistent=method) expected = Series(1, index=DatetimeIndex([exp] * n, tz=tz)) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 389e32f7f193f..d6e862ed11d36 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -366,6 +366,37 @@ def test_to_datetime_with_non_exact(self, cache): ) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "format, expected", + [ + ("%Y-%m-%d", Timestamp(2000, 1, 3)), + ("%Y-%d-%m", Timestamp(2000, 3, 1)), + ("%Y-%m-%d %H", Timestamp(2000, 1, 3, 12)), + ("%Y-%d-%m %H", Timestamp(2000, 3, 1, 12)), + ("%Y-%m-%d %H:%M", Timestamp(2000, 1, 3, 12, 34)), + ("%Y-%d-%m %H:%M", Timestamp(2000, 3, 1, 12, 34)), + ("%Y-%m-%d %H:%M:%S", Timestamp(2000, 1, 3, 12, 34, 56)), + ("%Y-%d-%m %H:%M:%S", Timestamp(2000, 3, 1, 12, 
34, 56)),
+            ("%Y-%m-%d %H:%M:%S.%f", Timestamp(2000, 1, 3, 12, 34, 56, 123456)),
+            ("%Y-%d-%m %H:%M:%S.%f", Timestamp(2000, 3, 1, 12, 34, 56, 123456)),
+            (
+                "%Y-%m-%d %H:%M:%S.%f%z",
+                Timestamp(2000, 1, 3, 12, 34, 56, 123456, tz="UTC+01:00"),
+            ),
+            (
+                "%Y-%d-%m %H:%M:%S.%f%z",
+                Timestamp(2000, 3, 1, 12, 34, 56, 123456, tz="UTC+01:00"),
+            ),
+        ],
+    )
+    def test_non_exact_doesnt_parse_whole_string(self, cache, format, expected):
+        # https://github.com/pandas-dev/pandas/issues/50412
+        # the formats alternate between ISO8601 and non-ISO8601 to check both paths
+        result = to_datetime(
+            "2000-01-03 12:34:56.123456+01:00", format=format, exact=False
+        )
+        assert result == expected
+
     @pytest.mark.parametrize(
         "arg",
         [
@@ -511,6 +542,29 @@ def test_to_datetime_parse_timezone_keeps_name(self):
 
 
 class TestToDatetime:
+    @pytest.mark.filterwarnings("ignore:Could not infer format")
+    def test_to_datetime_overflow(self):
+        # we should get an OutOfBoundsDatetime, NOT OverflowError
+        # TODO: Timestamp raises ValueError("could not convert string to Timestamp")
+        #  can we make these more consistent?
+        arg = "08335394550"
+        msg = 'Parsing "08335394550" to datetime overflows, at position 0'
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(arg)
+
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime([arg])
+
+        res = to_datetime(arg, errors="coerce")
+        assert res is NaT
+        res = to_datetime([arg], errors="coerce")
+        tm.assert_index_equal(res, Index([NaT]))
+
+        res = to_datetime(arg, errors="ignore")
+        assert isinstance(res, str) and res == arg
+        res = to_datetime([arg], errors="ignore")
+        tm.assert_index_equal(res, Index([arg], dtype=object))
+
     def test_to_datetime_mixed_datetime_and_string(self):
         # GH#47018 adapted old doctest with new behavior
         d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1)))
diff --git a/pandas/tests/tseries/offsets/test_business_hour.py b/pandas/tests/tseries/offsets/test_business_hour.py
index 79e7a5ff67010..319cc053d5d7d 100644
--- a/pandas/tests/tseries/offsets/test_business_hour.py
+++ b/pandas/tests/tseries/offsets/test_business_hour.py
@@ -241,6 +241,12 @@ def test_sub(self, dt, offset2, _offset):
 
         assert dt - offset2 == dt + _offset(-3)
 
+    def test_multiply_by_zero(self, dt, offset1, offset2):
+        assert dt - 0 * offset1 == dt
+        assert dt + 0 * offset1 == dt
+        assert dt - 0 * offset2 == dt
+        assert dt + 0 * offset2 == dt
+
     def testRollback1(
         self,
         dt,
@@ -972,6 +978,12 @@ def test_datetimeindex(self):
         for idx in [idx1, idx2, idx3]:
             tm.assert_index_equal(idx, expected)
 
+    def test_short_datetimeindex_creation(self):
+        # gh-49835
+        idx4 = date_range(start="2014-07-01 10:00", freq="BH", periods=1)
+        expected4 = DatetimeIndex(["2014-07-01 10:00"], freq="BH")
+        tm.assert_index_equal(idx4, expected4)
+
     def test_bday_ignores_timedeltas(self):
         idx = date_range("2010/02/01", "2010/02/10", freq="12H")
         t1 = idx + BDay(offset=Timedelta(3, unit="H"))
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 558c802fd70f6..a1a14674400c6 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -262,7 +262,8 @@ def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt):
 def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning):
     # see gh-11142
     msg = (
-        f"Parsing dates in {fmt} format when dayfirst=False was specified. "
+        rf"Parsing dates in {fmt} format when dayfirst=False \(the default\) "
+        "was specified. 
" "Pass `dayfirst=True` or specify a format to silence this warning." ) with tm.assert_produces_warning(warning, match=msg): diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index bc723b8ed36b8..6180d4a5f8e17 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pandas.errors import SpecificationError +from pandas.errors import ( + DataError, + SpecificationError, +) from pandas import ( DataFrame, @@ -66,18 +69,12 @@ def tests_skip_nuisance(step): tm.assert_frame_equal(result, expected) -def test_skip_sum_object_raises(step): +def test_sum_object_str_raises(step): df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"}) r = df.rolling(window=3, step=step) - msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)" - with tm.assert_produces_warning(FutureWarning, match=msg): - # GH#42738 - result = r.sum() - expected = DataFrame( - {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]}, - columns=list("AB"), - )[::step] - tm.assert_frame_equal(result, expected) + with pytest.raises(DataError, match="Cannot aggregate non-numeric type: object"): + # GH#42738, enforced in 2.0 + r.sum() def test_agg(step): diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 52011a2d5f760..b975a28273337 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -165,7 +165,7 @@ def test_dataframe_dtypes(method, expected_data, dtypes, min_periods, step): rolled = df.rolling(2, min_periods=min_periods, step=step) if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count": - msg = "No numeric types to aggregate" + msg = "Cannot aggregate non-numeric type" with pytest.raises(DataError, match=msg): getattr(rolled, method)() else: diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index f88c20f2f78c6..205a02dcb051b 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -98,11 +98,9 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): halflife = halflife_with_times data = np.arange(10.0) data[::2] = np.nan - df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) - with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): - # GH#42738 - result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() - expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() + df = DataFrame({"A": data}) + result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() + expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 3da14bce6facd..41b2ee70d7987 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -1125,13 +1125,6 @@ def test_methods(self, method, expected_data): ) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - expected = df.groupby("A", group_keys=True).apply( - lambda x: getattr(x.ewm(com=1.0), method)() - ) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( "method, expected_data", [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]], @@ -1160,13 +1153,9 @@ def test_pairwise_methods(self, method, expected_data): def 
test_times(self, times_frame): # GH 40951 halflife = "23 days" - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - result = ( - times_frame.groupby("A") - .ewm(halflife=halflife, times=times_frame["C"]) - .mean() - ) + # GH#42738 + times = times_frame.pop("C") + result = times_frame.groupby("A").ewm(halflife=halflife, times=times).mean() expected = DataFrame( { "B": [ @@ -1200,29 +1189,13 @@ def test_times(self, times_frame): ) tm.assert_frame_equal(result, expected) - def test_times_vs_apply(self, times_frame): - # GH 40951 - halflife = "23 days" - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - result = ( - times_frame.groupby("A") - .ewm(halflife=halflife, times=times_frame["C"]) - .mean() - ) - expected = times_frame.groupby("A", group_keys=True).apply( - lambda x: x.ewm(halflife=halflife, times=x["C"]).mean() - ) - tm.assert_frame_equal(result, expected) - def test_times_array(self, times_frame): # GH 40951 halflife = "23 days" + times = times_frame.pop("C") gb = times_frame.groupby("A") - with tm.assert_produces_warning(FutureWarning, match="nuisance"): - # GH#42738 - result = gb.ewm(halflife=halflife, times=times_frame["C"]).mean() - expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean() + result = gb.ewm(halflife=halflife, times=times).mean() + expected = gb.ewm(halflife=halflife, times=times.values).mean() tm.assert_frame_equal(result, expected) def test_dont_mutate_obj_after_slicing(self): diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 1c78a186e9d37..cca0ab3a0a9bb 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -253,22 +253,19 @@ def test_invalid_engine_kwargs(self, grouper, method): def test_cython_vs_numba( self, grouper, method, nogil, parallel, nopython, ignore_na, adjust ): + df = DataFrame({"B": range(4)}) if grouper == "None": grouper = lambda x: x - warn = FutureWarning else: + df["A"] = ["a", "b", "a", "b"] grouper = lambda x: x.groupby("A") - warn = None if method == "sum": adjust = True - df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - with tm.assert_produces_warning(warn, match="nuisance"): - # GH#42738 - result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs) - expected = getattr(ewm, method)(engine="cython") + result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs) + expected = getattr(ewm, method)(engine="cython") tm.assert_frame_equal(result, expected) @@ -276,12 +273,12 @@ def test_cython_vs_numba( def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): # GH 40951 + df = DataFrame({"B": [0, 0, 1, 1, 2, 2]}) if grouper == "None": grouper = lambda x: x - warn = FutureWarning else: grouper = lambda x: x.groupby("A") - warn = None + df["A"] = ["a", "b", "a", "b", "b", "a"] halflife = "23 days" times = to_datetime( @@ -294,17 +291,14 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_ "2020-01-03", ] ) - df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]}) ewm = grouper(df).ewm( halflife=halflife, adjust=True, ignore_na=ignore_na, times=times ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} - with tm.assert_produces_warning(warn, match="nuisance"): - # GH#42738 - result = ewm.mean(engine="numba", 
engine_kwargs=engine_kwargs) - expected = ewm.mean(engine="cython") + result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) + expected = ewm.mean(engine="cython") tm.assert_frame_equal(result, expected) diff --git a/requirements-dev.txt b/requirements-dev.txt index 4f2a80d932fd0..975783a83d1f6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -40,7 +40,7 @@ python-snappy pyxlsb s3fs>=2021.08.0 scipy -sqlalchemy +sqlalchemy<1.4.46 tabulate tzdata>=2022.1 xarray @@ -65,7 +65,6 @@ gitpython gitdb natsort numpydoc -pandas-dev-flaker==0.5.0 pydata-sphinx-theme<0.11 pytest-cython sphinx diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py new file mode 100644 index 0000000000000..3c21821e794a9 --- /dev/null +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -0,0 +1,142 @@ +""" +Check that test suite file doesn't use the pandas namespace inconsistently. + +We check for cases of ``Series`` and ``pd.Series`` appearing in the same file +(likewise for other pandas objects). + +This is meant to be run as a pre-commit hook - to run it manually, you can do: + + pre-commit run inconsistent-namespace-usage --all-files + +To automatically fixup a given file, you can pass `--replace`, e.g. + + python scripts/check_for_inconsistent_pandas_namespace.py test_me.py --replace + +though note that you may need to manually fixup some imports and that you will also +need the additional dependency `tokenize-rt` (which is left out from the pre-commit +hook so that it uses the same virtualenv as the other local ones). + +The general structure is similar to that of some plugins from +https://github.com/asottile/pyupgrade . +""" + +import argparse +import ast +import sys +from typing import ( + MutableMapping, + NamedTuple, + Optional, + Sequence, + Set, +) + +ERROR_MESSAGE = ( + "{path}:{lineno}:{col_offset}: " + "Found both '{prefix}.{name}' and '{name}' in {path}" +) + + +class OffsetWithNamespace(NamedTuple): + lineno: int + col_offset: int + namespace: str + + +class Visitor(ast.NodeVisitor): + def __init__(self) -> None: + self.pandas_namespace: MutableMapping[OffsetWithNamespace, str] = {} + self.imported_from_pandas: Set[str] = set() + + def visit_Attribute(self, node: ast.Attribute) -> None: + if isinstance(node.value, ast.Name) and node.value.id in {"pandas", "pd"}: + offset_with_namespace = OffsetWithNamespace( + node.lineno, node.col_offset, node.value.id + ) + self.pandas_namespace[offset_with_namespace] = node.attr + self.generic_visit(node) + + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + if node.module is not None and "pandas" in node.module: + self.imported_from_pandas.update(name.name for name in node.names) + self.generic_visit(node) + + +def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str: + from tokenize_rt import ( + reversed_enumerate, + src_to_tokens, + tokens_to_src, + ) + + tokens = src_to_tokens(content) + for n, i in reversed_enumerate(tokens): + offset_with_namespace = OffsetWithNamespace(i.offset[0], i.offset[1], i.src) + if ( + offset_with_namespace in visitor.pandas_namespace + and visitor.pandas_namespace[offset_with_namespace] + in visitor.imported_from_pandas + ): + # Replace `pd` + tokens[n] = i._replace(src="") + # Replace `.` + tokens[n + 1] = tokens[n + 1]._replace(src="") + + new_src: str = tokens_to_src(tokens) + return new_src + + +def check_for_inconsistent_pandas_namespace( + content: str, path: str, *, replace: bool +) -> 
Optional[str]: + tree = ast.parse(content) + + visitor = Visitor() + visitor.visit(tree) + + inconsistencies = visitor.imported_from_pandas.intersection( + visitor.pandas_namespace.values() + ) + + if not inconsistencies: + # No inconsistent namespace usage, nothing to replace. + return None + + if not replace: + inconsistency = inconsistencies.pop() + lineno, col_offset, prefix = next( + key for key, val in visitor.pandas_namespace.items() if val == inconsistency + ) + msg = ERROR_MESSAGE.format( + lineno=lineno, + col_offset=col_offset, + prefix=prefix, + name=inconsistency, + path=path, + ) + sys.stdout.write(msg) + sys.exit(1) + + return replace_inconsistent_pandas_namespace(visitor, content) + + +def main(argv: Optional[Sequence[str]] = None) -> None: + parser = argparse.ArgumentParser() + parser.add_argument("paths", nargs="*") + parser.add_argument("--replace", action="store_true") + args = parser.parse_args(argv) + + for path in args.paths: + with open(path, encoding="utf-8") as fd: + content = fd.read() + new_content = check_for_inconsistent_pandas_namespace( + content, path, replace=args.replace + ) + if not args.replace or new_content is None: + continue + with open(path, "w", encoding="utf-8") as fd: + fd.write(new_content) + + +if __name__ == "__main__": + main() diff --git a/scripts/sync_flake8_versions.py b/scripts/sync_flake8_versions.py index 8852634c5d796..0d513d5937dbe 100644 --- a/scripts/sync_flake8_versions.py +++ b/scripts/sync_flake8_versions.py @@ -1,5 +1,5 @@ """ -Check that the flake8 (and pandas-dev-flaker) pins are the same in: +Check that the flake8 pins are the same in: - environment.yml - .pre-commit-config.yaml, in the flake8 hook @@ -103,17 +103,13 @@ def get_revisions( precommit_config: YamlMapping, environment: YamlMapping ) -> tuple[Revisions, Revisions]: flake8_revisions = Revisions(name="flake8") - pandas_dev_flaker_revisions = Revisions(name="pandas-dev-flaker") repos = precommit_config["repos"] flake8_repo, flake8_hook = _get_repo_hook(repos, "flake8") flake8_revisions.pre_commit = Revision("flake8", "==", flake8_repo["rev"]) flake8_additional_dependencies = [] for dep in _process_dependencies(flake8_hook.get("additional_dependencies", [])): - if dep.name == "pandas-dev-flaker": - pandas_dev_flaker_revisions.pre_commit = dep - else: - flake8_additional_dependencies.append(dep) + flake8_additional_dependencies.append(dep) environment_dependencies = environment["dependencies"] environment_additional_dependencies = [] @@ -121,8 +117,6 @@ def get_revisions( if dep.name == "flake8": flake8_revisions.environment = dep environment_additional_dependencies.append(dep) - elif dep.name == "pandas-dev-flaker": - pandas_dev_flaker_revisions.environment = dep else: environment_additional_dependencies.append(dep) @@ -131,8 +125,7 @@ def get_revisions( environment_additional_dependencies, ) - for revisions in flake8_revisions, pandas_dev_flaker_revisions: - _validate_revisions(revisions) + _validate_revisions(flake8_revisions) if __name__ == "__main__": diff --git a/scripts/tests/test_inconsistent_namespace_check.py b/scripts/tests/test_inconsistent_namespace_check.py new file mode 100644 index 0000000000000..eb995158d8cb4 --- /dev/null +++ b/scripts/tests/test_inconsistent_namespace_check.py @@ -0,0 +1,61 @@ +import pytest + +from ..check_for_inconsistent_pandas_namespace import ( + check_for_inconsistent_pandas_namespace, +) + +BAD_FILE_0 = ( + "from pandas import Categorical\n" + "cat_0 = Categorical()\n" + "cat_1 = pd.Categorical()" +) +BAD_FILE_1 = ( + "from 
pandas import Categorical\n" + "cat_0 = pd.Categorical()\n" + "cat_1 = Categorical()" +) +BAD_FILE_2 = ( + "from pandas import Categorical\n" + "cat_0 = pandas.Categorical()\n" + "cat_1 = Categorical()" +) +GOOD_FILE_0 = ( + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = Categorical()" +) +GOOD_FILE_1 = "cat_0 = pd.Categorical()\ncat_1 = pd.Categorical()" +GOOD_FILE_2 = "from array import array\nimport pandas as pd\narr = pd.array([])" +PATH = "t.py" + + +@pytest.mark.parametrize( + "content, expected", + [ + (BAD_FILE_0, "t.py:3:8: Found both 'pd.Categorical' and 'Categorical' in t.py"), + (BAD_FILE_1, "t.py:2:8: Found both 'pd.Categorical' and 'Categorical' in t.py"), + ( + BAD_FILE_2, + "t.py:2:8: Found both 'pandas.Categorical' and 'Categorical' in t.py", + ), + ], +) +def test_inconsistent_usage(content, expected, capsys): + with pytest.raises(SystemExit): + check_for_inconsistent_pandas_namespace(content, PATH, replace=False) + result, _ = capsys.readouterr() + assert result == expected + + +@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1, GOOD_FILE_2]) +@pytest.mark.parametrize("replace", [True, False]) +def test_consistent_usage(content, replace): + # should not raise + check_for_inconsistent_pandas_namespace(content, PATH, replace=replace) + + +@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1, BAD_FILE_2]) +def test_inconsistent_usage_with_replace(content): + result = check_for_inconsistent_pandas_namespace(content, PATH, replace=True) + expected = ( + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = Categorical()" + ) + assert result == expected diff --git a/scripts/tests/test_sync_flake8_versions.py b/scripts/tests/test_sync_flake8_versions.py index 743ece34e0b56..2349a4f5d8d1c 100644 --- a/scripts/tests/test_sync_flake8_versions.py +++ b/scripts/tests/test_sync_flake8_versions.py @@ -87,7 +87,6 @@ def test_get_revisions_no_failure(capsys): { "id": "flake8", "additional_dependencies": [ - "pandas-dev-flaker==0.4.0", "flake8-bugs==1.1.1", ], } @@ -101,7 +100,6 @@ def test_get_revisions_no_failure(capsys): "id": "yesqa", "additional_dependencies": [ "flake8==0.1.1", - "pandas-dev-flaker==0.4.0", "flake8-bugs==1.1.1", ], } @@ -116,7 +114,6 @@ def test_get_revisions_no_failure(capsys): { "pip": [ "git+https://github.com/pydata/pydata-sphinx-theme.git@master", - "pandas-dev-flaker==0.4.0", ] }, ] diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py new file mode 100644 index 0000000000000..ef93fd1d21981 --- /dev/null +++ b/scripts/tests/test_validate_unwanted_patterns.py @@ -0,0 +1,419 @@ +import io + +import pytest + +from .. import validate_unwanted_patterns + + +class TestBarePytestRaises: + @pytest.mark.parametrize( + "data", + [ + ( + """ + with pytest.raises(ValueError, match="foo"): + pass + """ + ), + ( + """ + # with pytest.raises(ValueError, match="foo"): + # pass + """ + ), + ( + """ + # with pytest.raises(ValueError): + # pass + """ + ), + ( + """ + with pytest.raises( + ValueError, + match="foo" + ): + pass + """ + ), + ], + ) + def test_pytest_raises(self, data): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + with pytest.raises(ValueError): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." 
+ ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError, match="foo"): + with pytest.raises(ValueError): + pass + pass + """ + ), + [ + ( + 2, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises(ValueError): + with pytest.raises(ValueError, match="foo"): + pass + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ( + ( + """ + with pytest.raises( + ValueError, + # match = "foo" + ): + pass + """ + ), + [ + ( + 1, + ( + "Bare pytests raise have been found. " + "Please pass in the argument 'match' " + "as well the exception." + ), + ), + ], + ), + ], + ) + def test_pytest_raises_raises(self, data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.bare_pytest_raises(fd)) + assert result == expected + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + 'msg = ("bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ) + ], + ), + ( + 'msg = ("foo " "bar " "baz")', + [ + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ( + 1, + ( + "String unnecessarily split in two by black. " + "Please merge them manually." + ), + ), + ], + ), + ], +) +def test_strings_to_concatenate(data, expected): + fd = io.StringIO(data.strip()) + result = list(validate_unwanted_patterns.strings_to_concatenate(fd)) + assert result == expected + + +class TestStringsWithWrongPlacedWhitespace: + @pytest.mark.parametrize( + "data", + [ + ( + """ + msg = ( + "foo\n" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + " bar" + "baz" + ) + """ + ), + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + ( + """ + msg = ( + "foo" + rf" bar" + ) + """ + ), + ], + ) + def test_strings_with_wrong_placed_whitespace(self, data): + fd = io.StringIO(data.strip()) + result = list( + validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd) + ) + assert result == [] + + @pytest.mark.parametrize( + "data, expected", + [ + ( + ( + """ + msg = ( + "foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + " bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + f"foo" + f" bar" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ) + ], + ), + ( + ( + """ + msg = ( + "foo" + rf" bar" + " baz" + ) + """ + ), + [ + ( + 3, + ( + "String has a space at the beginning instead " + "of the end of the previous string." + ), + ), + ( + 4, + ( + "String has a space at the beginning instead " + "of the end of the previous string." 
+                        ),
+                    ),
+                ],
+            ),
+            (
+                (
+                    """
+    msg = (
+        "foo"
+        " bar"
+        rf" baz"
+    )
+    """
+                ),
+                [
+                    (
+                        3,
+                        (
+                            "String has a space at the beginning instead "
+                            "of the end of the previous string."
+                        ),
+                    ),
+                    (
+                        4,
+                        (
+                            "String has a space at the beginning instead "
+                            "of the end of the previous string."
+                        ),
+                    ),
+                ],
+            ),
+            (
+                (
+                    """
+    msg = (
+        "foo"
+        rf" bar"
+        rf" baz"
+    )
+    """
+                ),
+                [
+                    (
+                        3,
+                        (
+                            "String has a space at the beginning instead "
+                            "of the end of the previous string."
+                        ),
+                    ),
+                    (
+                        4,
+                        (
+                            "String has a space at the beginning instead "
+                            "of the end of the previous string."
+                        ),
+                    ),
+                ],
+            ),
+        ],
+    )
+    def test_strings_with_wrong_placed_whitespace_raises(self, data, expected):
+        fd = io.StringIO(data.strip())
+        result = list(
+            validate_unwanted_patterns.strings_with_wrong_placed_whitespace(fd)
+        )
+        assert result == expected
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
new file mode 100755
index 0000000000000..5bc2a915a1c0e
--- /dev/null
+++ b/scripts/validate_unwanted_patterns.py
@@ -0,0 +1,488 @@
+#!/usr/bin/env python3
+"""
+Unwanted patterns test cases.
+
+The reason this file exists despite the fact we already have
+`ci/code_checks.sh`,
+(see https://github.com/pandas-dev/pandas/blob/master/ci/code_checks.sh)
+
+is that some of the test cases are more complex/impossible to validate via regex.
+So this file is somewhat an extension to `ci/code_checks.sh`
+"""
+
+import argparse
+import ast
+import sys
+import token
+import tokenize
+from typing import (
+    IO,
+    Callable,
+    Iterable,
+    List,
+    Set,
+    Tuple,
+)
+
+PRIVATE_IMPORTS_TO_IGNORE: Set[str] = {
+    "_extension_array_shared_docs",
+    "_index_shared_docs",
+    "_interval_shared_docs",
+    "_merge_doc",
+    "_shared_docs",
+    "_apply_docs",
+    "_new_Index",
+    "_new_PeriodIndex",
+    "_doc_template",
+    "_agg_template",
+    "_pipe_template",
+    "__main__",
+    "_transform_template",
+    "_flex_comp_doc_FRAME",
+    "_op_descriptions",
+    "_IntegerDtype",
+    "_use_inf_as_na",
+    "_get_plot_backend",
+    "_matplotlib",
+    "_arrow_utils",
+    "_registry",
+    "_get_offset",  # TODO: remove after get_offset deprecation enforced
+    "_test_parse_iso8601",
+    "_json_normalize",  # TODO: remove after deprecation is enforced
+    "_testing",
+    "_test_decorators",
+    "__version__",  # check np.__version__ in compat.numpy.function
+    "_arrow_dtype_mapping",
+}
+
+
+def _get_literal_string_prefix_len(token_string: str) -> int:
+    """
+    Getting the length of the literal string prefix.
+
+    Parameters
+    ----------
+    token_string : str
+        String to check.
+
+    Returns
+    -------
+    int
+        Length of the literal string prefix.
+
+    Examples
+    --------
+    >>> example_string = "'Hello world'"
+    >>> _get_literal_string_prefix_len(example_string)
+    0
+    >>> example_string = "r'Hello world'"
+    >>> _get_literal_string_prefix_len(example_string)
+    1
+    """
+    try:
+        return min(
+            token_string.find(quote)
+            for quote in (r"'", r'"')
+            if token_string.find(quote) >= 0
+        )
+    except ValueError:
+        return 0
+
+
+def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
+    """
+    Test Case for bare pytest raises.
+
+    For example, this is wrong:
+
+    >>> with pytest.raises(ValueError):
+    ...     # Some code that raises ValueError
+
+    And this is what we want instead:
+
+    >>> with pytest.raises(ValueError, match="foo"):
+    ...     # Some code that raises ValueError
+
+    Parameters
+    ----------
+    file_obj : IO
+        File-like object containing the Python code to validate.
+ + Yields + ------ + line_number : int + Line number of unconcatenated string. + msg : str + Explanation of the error. + + Notes + ----- + GH #23922 + """ + contents = file_obj.read() + tree = ast.parse(contents) + + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + + try: + if not (node.func.value.id == "pytest" and node.func.attr == "raises"): + continue + except AttributeError: + continue + + if not node.keywords: + yield ( + node.lineno, + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + else: + # Means that there are arguments that are being passed in, + # now we validate that `match` is one of the passed in arguments + if not any(keyword.arg == "match" for keyword in node.keywords): + yield ( + node.lineno, + "Bare pytests raise have been found. " + "Please pass in the argument 'match' as well the exception.", + ) + + +PRIVATE_FUNCTIONS_ALLOWED = {"sys._getframe"} # no known alternative + + +def private_function_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Checking that a private function is not used across modules. + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + Yields + ------ + line_number : int + Line number of the private function that is used across modules. + msg : str + Explanation of the error. + """ + contents = file_obj.read() + tree = ast.parse(contents) + + imported_modules: Set[str] = set() + + for node in ast.walk(tree): + if isinstance(node, (ast.Import, ast.ImportFrom)): + for module in node.names: + module_fqdn = module.name if module.asname is None else module.asname + imported_modules.add(module_fqdn) + + if not isinstance(node, ast.Call): + continue + + try: + module_name = node.func.value.id + function_name = node.func.attr + except AttributeError: + continue + + # Exception section # + + # (Debatable) Class case + if module_name[0].isupper(): + continue + # (Debatable) Dunder methods case + elif function_name.startswith("__") and function_name.endswith("__"): + continue + elif module_name + "." + function_name in PRIVATE_FUNCTIONS_ALLOWED: + continue + + if module_name in imported_modules and function_name.startswith("_"): + yield (node.lineno, f"Private function '{module_name}.{function_name}'") + + +def private_import_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Checking that a private function is not imported across modules. + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + Yields + ------ + line_number : int + Line number of import statement, that imports the private function. + msg : str + Explanation of the error. + """ + contents = file_obj.read() + tree = ast.parse(contents) + + for node in ast.walk(tree): + if not isinstance(node, (ast.Import, ast.ImportFrom)): + continue + + for module in node.names: + module_name = module.name.split(".")[-1] + if module_name in PRIVATE_IMPORTS_TO_IGNORE: + continue + + if module_name.startswith("_"): + yield (node.lineno, f"Import of internal function {repr(module_name)}") + + +def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + This test case is necessary after 'Black' (https://github.com/psf/black), + is formatting strings over multiple lines. + + For example, when this: + + >>> foo = ( + ... "bar " + ... "baz" + ... 
)
+
+    Is becoming this:
+
+    >>> foo = ("bar " "baz")
+
+    'Black' is not considering this as an
+    issue (see https://github.com/psf/black/issues/1051),
+    so we are checking it here instead.
+
+    Parameters
+    ----------
+    file_obj : IO
+        File-like object containing the Python code to validate.
+
+    Yields
+    ------
+    line_number : int
+        Line number of unconcatenated string.
+    msg : str
+        Explanation of the error.
+
+    Notes
+    -----
+    GH #30454
+    """
+    tokens: List = list(tokenize.generate_tokens(file_obj.readline))
+
+    for current_token, next_token in zip(tokens, tokens[1:]):
+        if current_token.type == next_token.type == token.STRING:
+            yield (
+                current_token.start[0],
+                (
+                    "String unnecessarily split in two by black. "
+                    "Please merge them manually."
+                ),
+            )
+
+
+def strings_with_wrong_placed_whitespace(
+    file_obj: IO[str],
+) -> Iterable[Tuple[int, str]]:
+    """
+    Test case for leading spaces in concatenated strings.
+
+    For example:
+
+    >>> rule = (
+    ...     "We want the space at the end of the line, "
+    ...     "not at the beginning"
+    ... )
+
+    Instead of:
+
+    >>> rule = (
+    ...     "We want the space at the end of the line,"
+    ...     " not at the beginning"
+    ... )
+
+    Parameters
+    ----------
+    file_obj : IO
+        File-like object containing the Python code to validate.
+
+    Yields
+    ------
+    line_number : int
+        Line number of unconcatenated string.
+    msg : str
+        Explanation of the error.
+    """
+
+    def has_wrong_whitespace(first_line: str, second_line: str) -> bool:
+        """
+        Checking if the two lines are matching the unwanted pattern.
+
+        Parameters
+        ----------
+        first_line : str
+            First line to check.
+        second_line : str
+            Second line to check.
+
+        Returns
+        -------
+        bool
+            True if the two received strings match an unwanted pattern.
+
+        Notes
+        -----
+        The unwanted pattern that we are trying to catch is if the spaces in
+        a string that is concatenated over multiple lines are placed at the
+        end of each string, unless this string is ending with a
+        newline character (\n).
+
+        For example, this is bad:
+
+        >>> rule = (
+        ...     "We want the space at the end of the line,"
+        ...     " not at the beginning"
+        ... )
+
+        And what we want is:
+
+        >>> rule = (
+        ...     "We want the space at the end of the line, "
+        ...     "not at the beginning"
+        ... )
+
+        And if the string is ending with a new line character (\n) we
+        do not want any trailing whitespaces after it.
+
+        For example, this is bad:
+
+        >>> rule = (
+        ...     "We want the space at the beginning of "
+        ...     "the line if the previous line is ending with a \n "
+        ...     "not at the end, like always"
+        ... )
+
+        And what we do want is:
+
+        >>> rule = (
+        ...     "We want the space at the beginning of "
+        ...     "the line if the previous line is ending with a \n"
+        ...     " not at the end, like always"
+        ... 
)
+        """
+        if first_line.endswith(r"\n"):
+            return False
+        elif first_line.startswith("  ") or second_line.startswith("  "):
+            return False
+        elif first_line.endswith("  ") or second_line.endswith("  "):
+            return False
+        elif (not first_line.endswith(" ")) and second_line.startswith(" "):
+            return True
+        return False
+
+    tokens: List = list(tokenize.generate_tokens(file_obj.readline))
+
+    for first_token, second_token, third_token in zip(tokens, tokens[1:], tokens[2:]):
+        # Checking if we are in a block of concatenated strings
+        if (
+            first_token.type == third_token.type == token.STRING
+            and second_token.type == token.NL
+        ):
+            # Stripping the quotes, with the string literal prefix
+            first_string: str = first_token.string[
+                _get_literal_string_prefix_len(first_token.string) + 1 : -1
+            ]
+            second_string: str = third_token.string[
+                _get_literal_string_prefix_len(third_token.string) + 1 : -1
+            ]
+
+            if has_wrong_whitespace(first_string, second_string):
+                yield (
+                    third_token.start[0],
+                    (
+                        "String has a space at the beginning instead "
+                        "of the end of the previous string."
+                    ),
+                )
+
+
+def main(
+    function: Callable[[IO[str]], Iterable[Tuple[int, str]]],
+    source_path: str,
+    output_format: str,
+) -> bool:
+    """
+    Main entry point of the script.
+
+    Parameters
+    ----------
+    function : Callable
+        Function to execute for the specified validation type.
+    source_path : str
+        Source path representing path to a file/directory.
+    output_format : str
+        Output format of the error message.
+
+    Returns
+    -------
+    bool
+        True if any patterns related to the given function are found.
+
+    Raises
+    ------
+    ValueError
+        If the `source_path` is not pointing to existing file/directory.
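+
+    Notes
+    -----
+    A typical invocation, matching the argument parser defined at the bottom
+    of this file (the test file path below is an illustrative placeholder):
+
+        python scripts/validate_unwanted_patterns.py \
+            --validation-type="bare_pytest_raises" \
+            pandas/tests/some_test_file.py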
+ """ + is_failed: bool = False + + for file_path in source_path: + with open(file_path, encoding="utf-8") as file_obj: + for line_number, msg in function(file_obj): + is_failed = True + print( + output_format.format( + source_path=file_path, line_number=line_number, msg=msg + ) + ) + + return is_failed + + +if __name__ == "__main__": + available_validation_types: List[str] = [ + "bare_pytest_raises", + "private_function_across_module", + "private_import_across_module", + "strings_to_concatenate", + "strings_with_wrong_placed_whitespace", + ] + + parser = argparse.ArgumentParser(description="Unwanted patterns checker.") + + parser.add_argument("paths", nargs="*", help="Source paths of files to check.") + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}", + help="Output format of the error message.", + ) + parser.add_argument( + "--validation-type", + "-vt", + choices=available_validation_types, + required=True, + help="Validation test case to check.", + ) + + args = parser.parse_args() + + sys.exit( + main( + function=globals().get(args.validation_type), + source_path=args.paths, + output_format=args.format, + ) + ) diff --git a/setup.cfg b/setup.cfg index ef84dd7f9ce85..562ae70fd73ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,6 @@ ignore = B023 # Functions defined inside a loop must not use variables redefined in the loop B301, - # single-letter variables - PDF023, - # "use 'pandas._testing' instead" in non-test code - PDF025, # If test must be a simple comparison against sys.platform or sys.version_info Y002, # Use "_typeshed.Self" instead of class-bound TypeVar @@ -59,18 +55,6 @@ exclude = versioneer.py, # exclude asv benchmark environments from linting env -per-file-ignores = - # private import across modules - pandas/tests/*:PDF020 - # pytest.raises without match= - pandas/tests/extension/*:PDF009 - # os.remove - doc/make.py:PDF008 - # import from pandas._testing - pandas/testing.py:PDF014 - # can't use fixtures in asv - asv_bench/*:PDF016 - [flake8-rst] max-line-length = 84 From a4343408aa27a53120ea111fc2d53f9a800de92d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 7 Jan 2023 22:56:11 +0100 Subject: [PATCH 7/7] Fix test --- pandas/tests/frame/methods/test_align.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 89da9017e43af..30b7ed963e792 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -11,6 +11,7 @@ date_range, ) import pandas._testing as tm +from pandas.core.internals.managers import using_copy_on_write class TestDataFrameAlign: @@ -40,12 +41,12 @@ def test_frame_align_aware(self): assert new1.index.tz is timezone.utc assert new2.index.tz is timezone.utc - def test_align_float(self, float_frame, using_copy_on_write, using_array_manager): + def test_align_float(self, float_frame): af, bf = float_frame.align(float_frame) assert af._mgr is not float_frame._mgr af, bf = float_frame.align(float_frame, copy=False) - if using_copy_on_write or using_array_manager: + if using_copy_on_write(): assert af._mgr is not float_frame._mgr else: assert af._mgr is float_frame._mgr
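
For reference, a minimal sketch of the behavior the adjusted align() assertion
in PATCH 7/7 encodes; this assumes a pandas build where copy-on-write is
opted in via the "mode.copy_on_write" option, and the frame contents are
placeholders:

    import pandas as pd

    pd.set_option("mode.copy_on_write", True)

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
    left, right = df.align(df, copy=False)

    # Under copy-on-write, align(copy=False) still returns a shallow copy
    # backed by a new manager that merely references df's blocks, so manager
    # identity with df._mgr no longer holds -- which is what the test asserts.
    assert left._mgr is not df._mgr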