diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index bcdd9c6fa5717..65167e6467fd5 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -11,6 +11,7 @@ date_range, isnull, period_range, + timedelta_range, ) from .pandas_vb_common import tm @@ -355,15 +356,42 @@ def time_isnull_obj(self): class Fillna: - params = ([True, False], ["pad", "bfill"]) - param_names = ["inplace", "method"] - - def setup(self, inplace, method): - values = np.random.randn(10000, 100) - values[::2] = np.nan - self.df = DataFrame(values) - - def time_frame_fillna(self, inplace, method): + params = ( + [True, False], + ["pad", "bfill"], + [ + "float64", + "float32", + "object", + "Int64", + "Float64", + "datetime64[ns]", + "datetime64[ns, tz]", + "timedelta64[ns]", + ], + ) + param_names = ["inplace", "method", "dtype"] + + def setup(self, inplace, method, dtype): + N, M = 10000, 100 + if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"): + data = { + "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N), + "datetime64[ns, tz]": date_range( + "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" + ), + "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"), + } + self.df = DataFrame({f"col_{i}": data[dtype] for i in range(M)}) + self.df[::2] = None + else: + values = np.random.randn(N, M) + values[::2] = np.nan + if dtype == "Int64": + values = values.round() + self.df = DataFrame(values, dtype=dtype) + + def time_frame_fillna(self, inplace, method, dtype): self.df.fillna(inplace=inplace, method=method) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index e99963c6ad56b..ecb9830024900 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -375,6 +375,7 @@ Performance improvements - Performance improvement in :meth:`IntervalIndex.isin` (:issue:`38353`) - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) +- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`) - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) - Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 1a1b263ae356e..5783d3c2353aa 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -597,10 +597,11 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): @cython.boundscheck(False) @cython.wraparound(False) -def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): +def pad_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N algos_t val + uint8_t prev_mask int lim, fill_count = 0 N = len(values) @@ -612,15 +613,18 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): lim = validate_limit(N, limit) val = values[0] + prev_mask = mask[0] for i in range(N): if mask[i]: if fill_count >= lim: continue fill_count += 1 values[i] = val + mask[i] = prev_mask else: fill_count = 0 val = values[i] + prev_mask = mask[i] @cython.boundscheck(False) @@ -739,10 +743,11 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: @cython.boundscheck(False) @cython.wraparound(False) -def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): +def backfill_inplace(algos_t[:] values, uint8_t[:] mask, limit=None): cdef: Py_ssize_t i, N algos_t val + uint8_t prev_mask int lim, fill_count = 0 N = len(values) @@ -754,15 +759,18 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): lim = validate_limit(N, limit) val = values[N - 1] + prev_mask = mask[N - 1] for i in range(N - 1, -1, -1): if mask[i]: if fill_count >= lim: continue fill_count += 1 values[i] = val + mask[i] = prev_mask else: fill_count = 0 val = values[i] + prev_mask = mask[i] @cython.boundscheck(False) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index ad0bf76b0556b..4615cb4ec7abd 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -279,7 +279,7 @@ def fillna( if mask.any(): if method is not None: func = missing.get_fill_func(method) - new_values = func(self._ndarray.copy(), limit=limit, mask=mask) + new_values, _ = func(self._ndarray.copy(), limit=limit, mask=mask) # TODO: PandasArray didn't used to copy, need tests for this new_values = self._from_backing_data(new_values) else: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3b80c0b189108..86a1bcf24167c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -702,7 +702,7 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: func = missing.get_fill_func(method) - new_values = func(self.astype(object), limit=limit, mask=mask) + new_values, _ = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: # fill with value diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8cf876fa32d7b..eff06a5c62894 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -28,6 +28,7 @@ cache_readonly, doc, ) +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( @@ -38,12 +39,16 @@ is_string_dtype, pandas_dtype, ) +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import ( isna, notna, ) -from pandas.core import nanops +from pandas.core import ( + missing, + nanops, +) from pandas.core.algorithms import ( factorize_array, isin, @@ -144,6 +149,39 @@ def __getitem__( return type(self)(self._data[item], self._mask[item]) + @doc(ExtensionArray.fillna) + def fillna( + self: BaseMaskedArrayT, value=None, method=None, limit=None + ) -> BaseMaskedArrayT: + value, method = validate_fillna_kwargs(value, method) + + mask = self._mask + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = missing.get_fill_func(method) + new_values, new_mask = func( + self._data.copy(), + limit=limit, + mask=mask.copy(), + ) + return type(self)(new_values, new_mask.view(np.bool_)) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]: raise AbstractMethodError(self) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 26fe6338118b6..e003efeabcb66 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -400,7 +400,7 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: func = missing.get_fill_func(method) - new_values = func(self.to_numpy(object), limit=limit, mask=mask) + new_values, _ = func(self.to_numpy(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values) else: # fill with value diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 597023cb5b000..2e1a14104c16c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1727,16 +1727,13 @@ def _slice(self, slicer): def fillna( self, value, limit=None, inplace: bool = False, downcast=None ) -> List[Block]: - values = self.values if inplace else self.values.copy() - values = values.fillna(value=value, limit=limit) + values = self.values.fillna(value=value, limit=limit) return [self.make_block_same_class(values=values)] def interpolate( self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs ): - - values = self.values if inplace else self.values.copy() - new_values = values.fillna(value=fill_value, method=method, limit=limit) + new_values = self.values.fillna(value=fill_value, method=method, limit=limit) return self.make_block_same_class(new_values) def diff(self, n: int, axis: int = 1) -> List[Block]: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d1597b23cf577..1b5a7237b5287 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -660,9 +660,9 @@ def interpolate_2d( method = clean_fill_method(method) tvalues = transf(values) if method == "pad": - result = _pad_2d(tvalues, limit=limit) + result, _ = _pad_2d(tvalues, limit=limit) else: - result = _backfill_2d(tvalues, limit=limit) + result, _ = _backfill_2d(tvalues, limit=limit) result = transf(result) # reshape back @@ -698,8 +698,8 @@ def new_func(values, limit=None, mask=None): # This needs to occur before casting to int64 mask = isna(values) - result = func(values.view("i8"), limit=limit, mask=mask) - return result.view(values.dtype) + result, mask = func(values.view("i8"), limit=limit, mask=mask) + return result.view(values.dtype), mask return func(values, limit=limit, mask=mask) @@ -707,17 +707,25 @@ def new_func(values, limit=None, mask=None): @_datetimelike_compat -def _pad_1d(values, limit=None, mask=None): +def _pad_1d( + values: np.ndarray, + limit: int | None = None, + mask: np.ndarray | None = None, +) -> tuple[np.ndarray, np.ndarray]: mask = _fillna_prep(values, mask) algos.pad_inplace(values, mask, limit=limit) - return values + return values, mask @_datetimelike_compat -def _backfill_1d(values, limit=None, mask=None): +def _backfill_1d( + values: np.ndarray, + limit: int | None = None, + mask: np.ndarray | None = None, +) -> tuple[np.ndarray, np.ndarray]: mask = _fillna_prep(values, mask) algos.backfill_inplace(values, mask, limit=limit) - return values + return values, mask @_datetimelike_compat @@ -729,7 +737,7 @@ def _pad_2d(values, limit=None, mask=None): else: # for test coverage pass - return values + return values, mask @_datetimelike_compat @@ -741,7 +749,7 @@ def _backfill_2d(values, limit=None, mask=None): else: # for test coverage pass - return values + return values, mask _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} diff --git a/pandas/core/series.py b/pandas/core/series.py index 5a5d1c44b312c..e1a6c6884e003 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4529,7 +4529,7 @@ def _replace_single(self, to_replace, method: str, inplace: bool, limit): fill_f = missing.get_fill_func(method) mask = missing.mask_missing(result.values, to_replace) - values = fill_f(result.values, limit=limit, mask=mask) + values, _ = fill_f(result.values, limit=limit, mask=mask) if values.dtype == orig_dtype and inplace: return diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 0cf03533915f2..c501694a7c2d5 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -69,6 +69,18 @@ def test_fillna_limit_backfill(self, data_missing): expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) self.assert_series_equal(result, expected) + def test_fillna_no_op_returns_copy(self, data): + data = data[~data.isna()] + + valid = data[0] + result = data.fillna(valid) + assert result is not data + self.assert_extension_array_equal(result, data) + + result = data.fillna(method="backfill") + assert result is not data + self.assert_extension_array_equal(result, data) + def test_fillna_series(self, data_missing): fill_value = data_missing[1] ser = pd.Series(data_missing) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 1bc06ee4b6397..24c0d619e2b1a 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -132,6 +132,10 @@ def test_fillna_series_method(self): def test_fillna_limit_backfill(self): pass + @unsupported_fill + def test_fillna_no_op_returns_copy(self): + pass + @unsupported_fill def test_fillna_series(self): pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index e8995bc654428..718ef087e47d3 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -309,6 +309,11 @@ def test_fillna_scalar(self, data_missing): # Non-scalar "scalar" values. super().test_fillna_scalar(data_missing) + @skip_nested + def test_fillna_no_op_returns_copy(self, data): + # Non-scalar "scalar" values. + super().test_fillna_no_op_returns_copy(data) + @skip_nested def test_fillna_series(self, data_missing): # Non-scalar "scalar" values. diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 067fada5edcae..a49e1b4a367fd 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -221,6 +221,13 @@ def test_fillna_limit_backfill(self, data_missing): with tm.assert_produces_warning(PerformanceWarning): super().test_fillna_limit_backfill(data_missing) + def test_fillna_no_op_returns_copy(self, data, request): + if np.isnan(data.fill_value): + request.node.add_marker( + pytest.mark.xfail(reason="returns array with different fill value") + ) + super().test_fillna_no_op_returns_copy(data) + def test_fillna_series_method(self, data_missing): with tm.assert_produces_warning(PerformanceWarning): super().test_fillna_limit_backfill(data_missing)