diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 46f8c73027f48..ef4746311645c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5530,7 +5530,7 @@ def _replace_columnwise( DataFrame or None """ # Operate column-wise - res = self if inplace else self.copy() + res = self if inplace else self.copy(deep=None) ax = self.columns for i, ax_value in enumerate(ax): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4ef91f6de5e27..1604748fef2be 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7046,6 +7046,7 @@ def replace( to_replace = [to_replace] if isinstance(to_replace, (tuple, list)): + # TODO: Consider copy-on-write for non-replaced columns's here if isinstance(self, ABCDataFrame): from pandas import Series @@ -7105,7 +7106,7 @@ def replace( if not self.size: if inplace: return None - return self.copy() + return self.copy(deep=None) if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2ce294f257d75..1a0aba0778da5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1220,10 +1220,16 @@ def value_getitem(placement): if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write if using_copy_on_write() and not self._has_no_reference_block(blkno_l): - blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) + nbs_tup = tuple(blk.delete(blk_locs)) + first_nb = new_block_2d( + value_getitem(val_locs), BlockPlacement(blk.mgr_locs[blk_locs]) + ) + if self.refs is not None: + self.refs.extend([self.refs[blkno_l]] * len(nbs_tup)) self._clear_reference_block(blkno_l) else: blk.set_inplace(blk_locs, value_getitem(val_locs)) + continue else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1231,25 +1237,26 @@ def value_getitem(placement): # If all block items are unfit, schedule the block for removal. if len(val_locs) == len(blk.mgr_locs): removed_blknos.append(blkno_l) + continue else: nbs = blk.delete(blk_locs) # Add first block where old block was and remaining blocks at # the end to avoid updating all block numbers first_nb = nbs[0] nbs_tup = tuple(nbs[1:]) - nr_blocks = len(self.blocks) - blocks_tup = ( - self.blocks[:blkno_l] - + (first_nb,) - + self.blocks[blkno_l + 1 :] - + nbs_tup - ) - self.blocks = blocks_tup - self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) + nr_blocks = len(self.blocks) + blocks_tup = ( + self.blocks[:blkno_l] + + (first_nb,) + + self.blocks[blkno_l + 1 :] + + nbs_tup + ) + self.blocks = blocks_tup + self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) - for i, nb in enumerate(nbs_tup): - self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) - self._blknos[nb.mgr_locs.indexer] = i + nr_blocks + for i, nb in enumerate(nbs_tup): + self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + self._blknos[nb.mgr_locs.indexer] = i + nr_blocks if len(removed_blknos): # Remove blocks & update blknos and refs accordingly diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 1938a1c58fe3d..506ae7d3465c5 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import DataFrame +import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -93,3 +94,49 @@ def test_switch_options(): subset.iloc[0, 0] = 0 # df updated with CoW disabled assert df.iloc[0, 0] == 0 + + +@td.skip_array_manager_invalid_test +@pytest.mark.parametrize("dtype", [np.intp, np.int8]) +@pytest.mark.parametrize( + "locs, arr", + [ + ([0], np.array([-1, -2, -3])), + ([1], np.array([-1, -2, -3])), + ([5], np.array([-1, -2, -3])), + ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T), + ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T), + ], +) +def test_iset_splits_blocks_inplace(using_copy_on_write, locs, arr, dtype): + # Nothing currently calls iset with + # more than 1 loc with inplace=True (only happens with inplace=False) + # but ensure that it works + df = DataFrame( + { + "a": [1, 2, 3], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [10, 11, 12], + "e": [13, 14, 15], + "f": ["a", "b", "c"], + }, + ) + arr = arr.astype(dtype) + df_orig = df.copy() + df2 = df.copy(deep=None) # Trigger a CoW (if enabled, otherwise makes copy) + df2._mgr.iset(locs, arr, inplace=True) + + tm.assert_frame_equal(df, df_orig) + + if using_copy_on_write: + for i, col in enumerate(df.columns): + if i not in locs: + assert np.shares_memory(get_array(df, col), get_array(df2, col)) + else: + for col in df.columns: + assert not np.shares_memory(get_array(df, col), get_array(df2, col)) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 5c24a180c4d6d..d1184c3a70610 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -903,6 +903,44 @@ def test_squeeze(using_copy_on_write): assert df.loc[0, "a"] == 0 +@pytest.mark.parametrize( + "replace_kwargs", + [ + {"to_replace": {"a": 1, "b": 4}, "value": -1}, + # Test CoW splits blocks to avoid copying unchanged columns + {"to_replace": {"a": 1}, "value": -1}, + {"to_replace": {"b": 4}, "value": -1}, + {"to_replace": {"b": {4: 1}}}, + # TODO: Add these in a further optimization + # We would need to see which columns got replaced in the mask + # which could be expensive + # {"to_replace": {"b": 1}}, + # 1 + ], +) +def test_replace(using_copy_on_write, replace_kwargs): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df_orig = df.copy() + + df_replaced = df.replace(**replace_kwargs) + + if using_copy_on_write: + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + # mutating squeezed df triggers a copy-on-write for that column/block + df_replaced.loc[0, "c"] = -1 + if using_copy_on_write: + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + if "a" in replace_kwargs["to_replace"]: + arr = get_array(df_replaced, "a") + df_replaced.loc[0, "a"] = 100 + assert np.shares_memory(get_array(df_replaced, "a"), arr) + tm.assert_frame_equal(df, df_orig) + + def test_putmask(using_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}) view = df[:]