-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Avoid copying unnecessary columns in setitem by splitting blocks for CoW #51031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b0812c2
0ee8783
3944655
23b5fbe
33eb278
297c08f
331232e
cae0170
24b58ab
26f1322
697ddb8
6b6eb63
fb623cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1250,7 +1250,10 @@ def value_getitem(placement): | |
self._known_consolidated = False | ||
|
||
def _iset_split_block( | ||
self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None | ||
self, | ||
blkno_l: int, | ||
blk_locs: np.ndarray | list[int], | ||
value: ArrayLike | None = None, | ||
) -> None: | ||
"""Removes columns from a block by splitting the block. | ||
|
||
|
@@ -1271,12 +1274,8 @@ def _iset_split_block( | |
|
||
nbs_tup = tuple(blk.delete(blk_locs)) | ||
if value is not None: | ||
# Argument 1 to "BlockPlacement" has incompatible type "BlockPlacement"; | ||
# expected "Union[int, slice, ndarray[Any, Any]]" | ||
first_nb = new_block_2d( | ||
value, | ||
BlockPlacement(blk.mgr_locs[blk_locs]), # type: ignore[arg-type] | ||
) | ||
locs = blk.mgr_locs.as_array[blk_locs] | ||
first_nb = new_block_2d(value, BlockPlacement(locs)) | ||
else: | ||
first_nb = nbs_tup[0] | ||
nbs_tup = tuple(nbs_tup[1:]) | ||
|
@@ -1287,6 +1286,10 @@ def _iset_split_block( | |
) | ||
self.blocks = blocks_tup | ||
|
||
if not nbs_tup and value is not None: | ||
# No need to update anything if split did not happen | ||
return | ||
|
||
self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb)) | ||
|
||
for i, nb in enumerate(nbs_tup): | ||
|
@@ -1330,11 +1333,18 @@ def column_setitem( | |
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) | ||
""" | ||
if using_copy_on_write() and not self._has_no_reference(loc): | ||
# otherwise perform Copy-on-Write and clear the reference | ||
blkno = self.blknos[loc] | ||
blocks = list(self.blocks) | ||
blocks[blkno] = blocks[blkno].copy() | ||
self.blocks = tuple(blocks) | ||
# Split blocks to only copy the column we want to modify | ||
blk_loc = self.blklocs[loc] | ||
# Copy our values | ||
values = self.blocks[blkno].values | ||
if values.ndim == 1: | ||
values = values.copy() | ||
else: | ||
# Use [blk_loc] as indexer to keep ndim=2, this already results in a | ||
# copy | ||
values = values[[blk_loc]] | ||
self._iset_split_block(blkno, [blk_loc], values) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need to call this for the And There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes and Yes Not calling it would duplicate the logic |
||
|
||
# this manager is only created temporarily to mutate the values in place | ||
# so don't track references, otherwise the `setitem` would perform CoW again | ||
|
Uh oh!
There was an error while loading. Please reload this page.