From da10a2c3aea507346005ca4c001765f9f879d385 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 14:42:22 -0800 Subject: [PATCH 01/49] * add missing method kwargs in MultiIndex get_indexer() impl * add tests for functions covered by bugfix --- pandas/_libs/index.pyx | 4 +-- pandas/tests/indexes/multi/test_indexing.py | 36 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 4a9b504ffb0d9..d55f798eb3c8b 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -608,13 +608,13 @@ cdef class BaseMultiIndexCodesEngine: int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ - level_codes = [lev.get_indexer(codes) + 1 for lev, codes + level_codes = [lev.get_indexer(codes, method=method) + 1 for lev, codes in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) def get_indexer(self, object target, object method=None, object limit=None): - lab_ints = self._extract_level_codes(target) + lab_ints = self._extract_level_codes(target, method=method) # All methods (exact, backfill, pad) directly map to the respective # methods of the underlying (integers) index... diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d104c773227d5..b53753d408a78 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -4,9 +4,21 @@ import pytest import pandas as pd +<<<<<<< HEAD from pandas import Categorical, Index, MultiIndex, date_range import pandas._testing as tm +======= +from pandas import ( + Categorical, + CategoricalIndex, + Index, + IntervalIndex, + MultiIndex, + date_range, +) +>>>>>>> * add missing method kwargs in MultiIndex get_indexer() impl from pandas.core.indexes.base import InvalidIndexError +import pandas.util.testing as tm class TestSliceLocs: @@ -239,6 +251,30 @@ def test_get_indexer_with_missing_value(self, index_arr, labels, expected): result = idx.get_indexer(labels) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_and_fill(): + """ test getting an indexer for another index using the backfill method """ + mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + + assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2), np.array([-1, 2, 3])) + assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='backfill'), + np.array([1, 2, 3])) + for method in ("bfill", "backfill"): + assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method=method), + np.array([1, 2, 3])) + for method in ("ffill", "pad"): + assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method=method), + np.array([0, 2, 3])) + + def test_get_indexer_pad(self): + """ test getting an indexer for another index using the pad method """ + mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + padded_idx = np.array([0, 2, 3]) + + assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='ffill'), + padded_idx) + def test_getitem(idx): # scalar From dc96151765b08faaf5ddb5749fdc85c7cefd0f96 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 14:50:43 -0800 Subject: [PATCH 02/49] handle multiple trickier cases with indexing * handle tuple-ordering case, i.e. the highest-level (highest being closest to 0, i.e. the leftmost) fill up or down, for backfill/pad, respectively, should override values at all lower levels to their lowest/highest values, respectively * stub out handling of "carrying" situation with too-large values at some level which may require values at higher levels being bumped --- pandas/_libs/index.pyx | 194 +++++++++++++++++++- pandas/tests/indexes/multi/test_indexing.py | 110 +++++++++-- 2 files changed, 291 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index d55f798eb3c8b..3e316788ae13d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -611,11 +611,167 @@ cdef class BaseMultiIndexCodesEngine: level_codes = [lev.get_indexer(codes, method=method) + 1 for lev, codes in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) + print('\n\n\n\n') + print('extracting level codes from:\n{}\nwith:\n{}\nwith method: {}\n\n'.format( + str(self.levels), + str(zip(*target)), + str(method))) + level_codes = np.array([ + lev.get_indexer(codes, method=method) for lev, codes + in zip(self.levels, zip(*target)) + ], dtype='uint64').T + 1 + + print('this gives us:\n{}'.format(str(level_codes))) + # idk if truthy/falsy stuff works w/cython... + # also this entire block should basically be moved into its own helper function + # or something like that + if method is not None: + print('method is {}, so we need any fills at level n to override ' + 'all fills at level n + k, for all k > 0'.format(str(method))) + level_codes_no_fill = np.array([ + lev.get_indexer(codes) for lev, codes + in zip(self.levels, zip(*target)) + ], dtype='uint64').T + 1 + print('without {}-style filling, the level_codes are:\n{}'.format( + str(method), + str(level_codes_no_fill))) + + + # TODO: add arithmetic for (backfill-only) "incremementing" when we hit + # a "0" (i.e. NaN, i.e. too large value) at level i (i.e. incremementing + # levels 1, ..., i-1). this necessarily involves "place-value arithmetic" + # w.r.t. map(len, self.levels), i.e. the max value we can have at each + # level, after which we have to set it to 0 (i.e. "1") and then "carry" + # + # then, when we hit a case where we need to "carry" past level 0, we need + # to set the whole row to -1 + # + # it will LIKELY be the case that we need to pass that past this function, + # but, for the moment, let's see if we can just do something with level + # codes + # + # my HOPE is that the result will be something like: + # sum(1 << offset for offset in offsets) (or maybe offsets[:-1], not sure + # yet....) + # + # let's impl this now! + # eventually let's see if this can be pulled out into a helper function + if method == 'backfill': + print('we\'re backfilling, so we want to find values which we might ' + 'need to be bumped to the next value in one or more levels or ' + 'removed entirely') + for i, row in enumerate(level_codes): + print('examining row: {}'.format(str(row))) + need_to_carry = False + # go from right to left for place-value arithmetic + for j in range(len(row) - 1, -1, -1): + print('looking at row[{}], i.e. {}'.format(str(j), str(row[j]))) + # this is AFTER backfilling, i.e. this means value was too large + # subtract 1 bc all values here have 1 added to them + max_val = len(self.levels[j]) + print('the highest value you can have in this row is: {}'.format(str(max_val))) + if row[j] == 0 or (level_codes_no_fill[i][j] == max_val): + need_to_carry = True + print('row[{}], i.e. {}, is already too large (max val is {}), ' + 'or will be after incrementing, so we need to continue ' + 'carrying'.format(str(j), str(row[j]), str(max_val))) + elif (row[j] == max_val): + print('this row is at the max value, but since it was backfilled ' + 'up to it, we dont actually need to do anything') + need_to_carry = False + + if need_to_carry: + print('we need to carry') + new_val = row[j] + 1 if row[j] == level_codes_no_fill[i][j] else new_val + print('new value would be {}'.format(new_val)) + if new_val > max_val or row[j] == 0: + print('which it too big (if new_val is 1 and we\'re here, that\'s because ' + 'it was previously 0 *AFTER* backfilling), so we will see what ' + 'happens now') + # if we're at the first row, then we're done, this whole thing + # is too big + if j == 0: + print('we\'re at the end, i.e. level 0, so we gotta just set the ' + 'whole thing to NaN, which we\'ll in turn need to clean up ' + 'later to prevent it from being backfilled to the smallest ' + 'value') + for k in range(len(row)): + row[k] = 0 + print('the whole row is now: {}'.format(str(level_codes[i]))) + # we need to keep carrying, but will not necessarily be a problem + else: + print('we\'re not at the end, so there\'s still hope -- setting this ' + 'to 1, i.e. min value, since that\'s what it should be if the "next", ' + 'i.e. next one to the *left*, can be bumped') + row[j] = 1 + # done carrying, for now + else: + print('new value, {}, is legit, so we can stop carrying for now'.format( + str(new_val))) + row[j] = new_val + need_to_carry = False + print('`need_to_carry` is now False, continuing') + + # MOTIVATION: + # this is basically just backfilling/forward-filling a tuple into a list + # of tuples. the ordering of tuples (i.e. x_i vs. y_i, with ties broken + # by y_{i+1], y_{i+1}, etc.) is such that we want to explicitly avoid + # considering values in levels i+1, ..., n (for an n-level MultiIndex) + # when the value at level i is different w/and w/o filling + # + # - when backfilling, it'll bump the value at level i, so we want to + # decrease the values at levels i+1, ..., n, to their min value (i.e. "1"), + # since we're using an arithmetic here where 1 is 0, more or less (TODO: add + # caveats, formalizations, etc., to this). this will work since, for all + # possible values x_1, ..., x_{i-1}, x_{i+1}, ..., x_n, it's necessarily + # the case that (x_1, ..., x_n) < (x_1, ..., x_{i-1}, x_{i}', 1, ..., 1), + # for all x_{i}' > x_i + # + # - when forward-filling (aka "padding"), it'll drop the value at level i, + # and so we want to increase the values at levels i+1, ..., n, to their + # max possible values, i.e. map(len, self.levels[i+1:]) + # + # + # TODO: see if this can be replaced w/higher-level functions w/o + # sacrificing performance. presently unclear if that can be + # accomplished + # als TODO: clean this up + for i, row in enumerate(level_codes): + # THIS is where i can apply the algorithm described earlier + for j, level in enumerate(row): + # if it was filled from the prev/next value, then everything after + # that should either be the min val, if backfill, or the max, if + # using pad. i think. lemme mull this one over a bit more + if level_codes_no_fill[i][j] == 0 and level_codes[i][j] >= 1: + print('without filling, level_codes[{}][{}], curently {}, would be {}'.format( + str(i), + str(j), + str(level_codes[i][j]), + '0')) + for k in range(j + 1, len(row)): + old_val = row[k] + row[k] = 1 if method == 'backfill' else len(self.levels[k]) + print('replaced level_codes[{}][{}], previously {}, with {}'.format( + str(i), + str(k), + str(old_val), + str(row[k]))) + + print('our cleaned-up level codes are now:\n{},\nwhich we\'ll pass to self._codes_to_ints()'.format(str(level_codes))) + int_reprs = self._codes_to_ints(level_codes) + print('which, using the index backend\'s int representations, is:\n{}'.format(str(int_reprs))) + return int_reprs + #return self._codes_to_ints(level_codes) def get_indexer(self, object target, object method=None, object limit=None): lab_ints = self._extract_level_codes(target, method=method) + print('extracting level codes from {} on {} (w/method = {}) returned:\n{}'.format( + str(self), + str(target), + str(method), + str(lab_ints))) # All methods (exact, backfill, pad) directly map to the respective # methods of the underlying (integers) index... if method is not None: @@ -626,7 +782,43 @@ cdef class BaseMultiIndexCodesEngine: lab_ints = lab_ints[order] indexer = (getattr(self._base, f'get_{method}_indexer') (self, lab_ints, limit=limit)) - indexer = indexer[order] + + # TODO: completely replace this with correctly-generalized fix + # with pre-processes by "adding 1" when backfilling when it finds + # 0s (i.e. -1 (i.e. NaN repr) + 1) + # + # HACK: because the backfilled representation of NAs at any level l + # will always be 2^(offset_l), as a result of adding 1 to -1 and + # then shifting by offset_l bits, when backfilling, we need to + # prevent too-large values from being treated as too small values + # the fix for this, therefore, is to change all too-small values to + # 1 larger than the max of the entity being backfilled + if method == 'backfill': + # inherently necessary bc NaNs are stored as a value which would + # otherwise be backfilled to zero + print('method is backfill, doing some cleanup, let\'s see if it\'s sufficient... ') + print('lab_ints (after re-ordering) is: {}'.format(str(lab_ints))) + print('indexer is: {}'.format(str(indexer))) + #indexer = np.array([ + # idx if lab_ints[i] != nan_val else -1 + # for i, idx in enumerate(indexer) + #], dtype=indexer.dtype) + # any too-large values will be backfilled into the minimum index value + for i in range(len(indexer)): + if lab_ints[i] == 0: + print('lab_ints[{}] = {}, but was (incorrectly) backfilled to {}, re-setting to -1'.format( + str(i), + str(lab_ints[i]), + str(indexer[i]))) + indexer[i] = -1 + print('after cleanup, indexer is: {}'.format(str(indexer))) + + # TODO: try using argsort more to accomplish this. does not matter for now, though + new_indexer = [0] * len(indexer) + for i, idx in enumerate(order): + new_indexer[idx] = indexer[i] + print('after fixing order, indexer is: {}'.format(str(new_indexer))) + return new_indexer else: indexer = self._base.get_indexer(self, lab_ints) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index b53753d408a78..16ed727af2c9d 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -4,21 +4,9 @@ import pytest import pandas as pd -<<<<<<< HEAD from pandas import Categorical, Index, MultiIndex, date_range import pandas._testing as tm -======= -from pandas import ( - Categorical, - CategoricalIndex, - Index, - IntervalIndex, - MultiIndex, - date_range, -) ->>>>>>> * add missing method kwargs in MultiIndex get_indexer() impl from pandas.core.indexes.base import InvalidIndexError -import pandas.util.testing as tm class TestSliceLocs: @@ -205,6 +193,104 @@ def test_get_indexer(self): with pytest.raises(InvalidIndexError, match=msg): idx1.get_indexer(idx2) + def test_get_indexer_and_fill(self): + """ test getting an indexer for another index using the backfill method """ + mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + + indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) + expected_indexer_no_fill = np.array([-1, 2, 3]) + assert_almost_equal(expected_indexer_no_fill, indexer_no_fill) + + for method in ("bfill", "backfill"): + indexer_backfilled = multi_idx_1.get_indexer(mult_idx_2, + method=method) + expected_indexer_backfilled = np.array([1, 2, 3]) + assert_almost_equal(expected_indexer_backfilled, indexer_backfilled) + + for method in ("ffill", "pad"): + indexer_padded = mult_idx_1.get_indexer(mult_idx_2, + method=method) + expected_indexer_padded = np.array([0, 2, 3]) + assert_almost_equal(expected_indexer_padded, indexer_padded) + + def test_get_indexer_three_or_more_levels(self): + """ tests get_indexer() on MultiIndexes with 3+ levels + + visually, these are + + mult_idx_1: + 0: 1 2 5 + 1: 7 + 2: 4 5 + 3: 7 + 4: 6 5 + 5: 7 + 6: 3 2 5 + 7: 7 + 8: 4 5 + 9: 7 + 10: 6 5 + 11: 7 + + mult_idx_2: + 0: 1 1 8 + 1: 1 5 9 + 2: 1 6 7 + 3: 2 1 6 + 4: 3 6 8 + """ + mult_idx_1 = pd.MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) + mult_idx_2 = pd.MultiIndex.from_tuples([(1, 1, 8), + (1, 5, 9), + (1, 6, 7), + (2, 1, 6), + (3, 6, 8)]) + # sanity check + assert mult_idx_1.is_monotonic + assert mult_idx_1.is_unique + assert mult_idx_2.is_monotonic + assert mult_idx_2.is_unique + + # show the relationships between the two + assert mult_idx_2[0] < mult_idx_1[0] + assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4] + assert mult_idx_1[5] == mult_idx_2[2] + assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6] + assert mult_idx_1[-1] < mult_idx_[4] + + indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) + assert_almost_equal(indexer_no_fill, [-1, -1, 5, -1, -1]) + + # test with backfilling + indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method='backfill') + assert_almost_equal(indexer_backfilled, [0, 4, 5, 6, -1]) + + # now, the same thing, but forward-filled (aka "padded") + indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method='pad') + assert_almost_equal(indexer_padded, [-1, 3, 5, 5, 11]) + + # now, do the indexing in the other direction + assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[1] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[2] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1] + assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2] + assert mult_idx_2[2] == mult_idx_1[5] + assert mult_idx_2[3] < mult_idx_1[6] < mult_idx_2[4] + assert mult_idx_2[3] < mult_idx_1[7] < mult_idx_2[4] + assert mult_idx_2[3] < mult_idx_1[8] < mult_idx_2[4] + assert mult_idx_2[3] < mult_idx_1[9] < mult_idx_2[4] + assert mult_idx_2[3] < mult_idx_1[10] < mult_idx_2[4] + assert mult_idx_2[3] < mult_idx_1[11] < mult_idx_2[4] + + assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1), + [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1]) + assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1, method='bfill'), + [1, 1, 1, 1, 2, 2, 4, 4, 4, 4, 4, 4]) + assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1, method='pad'), + [0, 0, 0, 0, 1, 2, 3, 3, 3, 3, 3, 3]) + def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( From fec078fd874ea17c365abe38061145d5ed05568e Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 14:52:44 -0800 Subject: [PATCH 03/49] add small bugfixes and cleanup * fix bug with incorrect handling of legit settling at max val at some level * use np arrays for checking test outputs * dont overflow from a val when we were backfilled up to it * remove debugging-only code, no change to logic --- pandas/_libs/index.pyx | 76 ++------------------- pandas/tests/indexes/multi/test_indexing.py | 14 +--- 2 files changed, 5 insertions(+), 85 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 3e316788ae13d..e685b99c56633 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -621,21 +621,14 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zip(*target)) ], dtype='uint64').T + 1 - print('this gives us:\n{}'.format(str(level_codes))) # idk if truthy/falsy stuff works w/cython... # also this entire block should basically be moved into its own helper function # or something like that if method is not None: - print('method is {}, so we need any fills at level n to override ' - 'all fills at level n + k, for all k > 0'.format(str(method))) level_codes_no_fill = np.array([ lev.get_indexer(codes) for lev, codes in zip(self.levels, zip(*target)) ], dtype='uint64').T + 1 - print('without {}-style filling, the level_codes are:\n{}'.format( - str(method), - str(level_codes_no_fill))) - # TODO: add arithmetic for (backfill-only) "incremementing" when we hit # a "0" (i.e. NaN, i.e. too large value) at level i (i.e. incremementing @@ -657,60 +650,31 @@ cdef class BaseMultiIndexCodesEngine: # let's impl this now! # eventually let's see if this can be pulled out into a helper function if method == 'backfill': - print('we\'re backfilling, so we want to find values which we might ' - 'need to be bumped to the next value in one or more levels or ' - 'removed entirely') for i, row in enumerate(level_codes): - print('examining row: {}'.format(str(row))) need_to_carry = False # go from right to left for place-value arithmetic for j in range(len(row) - 1, -1, -1): - print('looking at row[{}], i.e. {}'.format(str(j), str(row[j]))) # this is AFTER backfilling, i.e. this means value was too large # subtract 1 bc all values here have 1 added to them max_val = len(self.levels[j]) - print('the highest value you can have in this row is: {}'.format(str(max_val))) - if row[j] == 0 or (level_codes_no_fill[i][j] == max_val): + if row[j] == 0: need_to_carry = True - print('row[{}], i.e. {}, is already too large (max val is {}), ' - 'or will be after incrementing, so we need to continue ' - 'carrying'.format(str(j), str(row[j]), str(max_val))) - elif (row[j] == max_val): - print('this row is at the max value, but since it was backfilled ' - 'up to it, we dont actually need to do anything') - need_to_carry = False if need_to_carry: - print('we need to carry') - new_val = row[j] + 1 if row[j] == level_codes_no_fill[i][j] else new_val - print('new value would be {}'.format(new_val)) + new_val = row[j] + 1 if row[j] == level_codes_no_fill[i][j] else row[j] if new_val > max_val or row[j] == 0: - print('which it too big (if new_val is 1 and we\'re here, that\'s because ' - 'it was previously 0 *AFTER* backfilling), so we will see what ' - 'happens now') # if we're at the first row, then we're done, this whole thing # is too big if j == 0: - print('we\'re at the end, i.e. level 0, so we gotta just set the ' - 'whole thing to NaN, which we\'ll in turn need to clean up ' - 'later to prevent it from being backfilled to the smallest ' - 'value') for k in range(len(row)): row[k] = 0 - print('the whole row is now: {}'.format(str(level_codes[i]))) # we need to keep carrying, but will not necessarily be a problem else: - print('we\'re not at the end, so there\'s still hope -- setting this ' - 'to 1, i.e. min value, since that\'s what it should be if the "next", ' - 'i.e. next one to the *left*, can be bumped') row[j] = 1 # done carrying, for now else: - print('new value, {}, is legit, so we can stop carrying for now'.format( - str(new_val))) row[j] = new_val need_to_carry = False - print('`need_to_carry` is now False, continuing') # MOTIVATION: # this is basically just backfilling/forward-filling a tuple into a list @@ -743,35 +707,16 @@ cdef class BaseMultiIndexCodesEngine: # that should either be the min val, if backfill, or the max, if # using pad. i think. lemme mull this one over a bit more if level_codes_no_fill[i][j] == 0 and level_codes[i][j] >= 1: - print('without filling, level_codes[{}][{}], curently {}, would be {}'.format( - str(i), - str(j), - str(level_codes[i][j]), - '0')) for k in range(j + 1, len(row)): old_val = row[k] row[k] = 1 if method == 'backfill' else len(self.levels[k]) - print('replaced level_codes[{}][{}], previously {}, with {}'.format( - str(i), - str(k), - str(old_val), - str(row[k]))) - - print('our cleaned-up level codes are now:\n{},\nwhich we\'ll pass to self._codes_to_ints()'.format(str(level_codes))) - int_reprs = self._codes_to_ints(level_codes) - print('which, using the index backend\'s int representations, is:\n{}'.format(str(int_reprs))) - return int_reprs - #return self._codes_to_ints(level_codes) + + return self._codes_to_ints(level_codes) def get_indexer(self, object target, object method=None, object limit=None): lab_ints = self._extract_level_codes(target, method=method) - print('extracting level codes from {} on {} (w/method = {}) returned:\n{}'.format( - str(self), - str(target), - str(method), - str(lab_ints))) # All methods (exact, backfill, pad) directly map to the respective # methods of the underlying (integers) index... if method is not None: @@ -796,28 +741,15 @@ cdef class BaseMultiIndexCodesEngine: if method == 'backfill': # inherently necessary bc NaNs are stored as a value which would # otherwise be backfilled to zero - print('method is backfill, doing some cleanup, let\'s see if it\'s sufficient... ') - print('lab_ints (after re-ordering) is: {}'.format(str(lab_ints))) - print('indexer is: {}'.format(str(indexer))) - #indexer = np.array([ - # idx if lab_ints[i] != nan_val else -1 - # for i, idx in enumerate(indexer) - #], dtype=indexer.dtype) # any too-large values will be backfilled into the minimum index value for i in range(len(indexer)): if lab_ints[i] == 0: - print('lab_ints[{}] = {}, but was (incorrectly) backfilled to {}, re-setting to -1'.format( - str(i), - str(lab_ints[i]), - str(indexer[i]))) indexer[i] = -1 - print('after cleanup, indexer is: {}'.format(str(indexer))) # TODO: try using argsort more to accomplish this. does not matter for now, though new_indexer = [0] * len(indexer) for i, idx in enumerate(order): new_indexer[idx] = indexer[i] - print('after fixing order, indexer is: {}'.format(str(new_indexer))) return new_indexer else: indexer = self._base.get_indexer(self, lab_ints) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 16ed727af2c9d..56730b13fd27b 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -337,14 +337,12 @@ def test_get_indexer_with_missing_value(self, index_arr, labels, expected): result = idx.get_indexer(labels) tm.assert_numpy_array_equal(result, expected) - def test_get_indexer_and_fill(): + def test_get_indexer_and_fill(self): """ test getting an indexer for another index using the backfill method """ mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2), np.array([-1, 2, 3])) - assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='backfill'), - np.array([1, 2, 3])) for method in ("bfill", "backfill"): assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method=method), np.array([1, 2, 3])) @@ -352,16 +350,6 @@ def test_get_indexer_and_fill(): assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method=method), np.array([0, 2, 3])) - def test_get_indexer_pad(self): - """ test getting an indexer for another index using the pad method """ - mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) - mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) - padded_idx = np.array([0, 2, 3]) - - assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='ffill'), - padded_idx) - - def test_getitem(idx): # scalar assert idx[2] == ("bar", "one") From 3e6ca9d9104de984a7e1c7d3d6e7131914a142b7 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 14:54:48 -0800 Subject: [PATCH 04/49] fix more bugs and do more cleanup * clean up comments * small cleanup and fix for tuple-order-handling fix * fix ordering of extract_level_codes() cleanup * add test for case with fill at multiple levels * rm duplicative test * move carrying code into its own function --- pandas/_libs/index.pyx | 196 +++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 95 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e685b99c56633..b65adf43e6c8e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -593,7 +593,81 @@ cdef class BaseMultiIndexCodesEngine: def _codes_to_ints(self, codes): raise NotImplementedError("Implemented by subclass") - def _extract_level_codes(self, object target): + def _do_backfill_carrying(self, object level_codes, + object level_codes_no_fill): + """ + given a 2d list of level_codes, i.e. integers representing the index + of a target index inside `self.level_codes` for each level, which + were backfilled at each level, handle cases where too-large values + which were backfilled to 0 (and became 1 after adding 1 to handle use + of unsigned integers) and thus the backfilling on the level code, when + represented as an int, will not be correct + + in particular, this involves a sort of "carrying", whereby, when a + value at a given level is too large, we set it to the minimum value + and then attempt to bump the value to the left, and, in the event that + we need to do this at the first index + + e.g. if the highest two values in `self`'s index tuples are (8, 4, 7) + and (8, 4, 9), then, in order to determine that the value (8, 4, 10) is + too large, we would need to determine that 10 > 9, meaning we need to + "carry" to the middle level, at which we try to bump 4 to 5, which is + in turn too large, requiring us to try to bump 8 to 9, also too large, + meaning that the entire row representing (8, 4, 10)'s level_codes + should be set to -1 + + Parameters + ---------- + level_codes : 2D array, M x N (where self has N levels) + per-level backfilled codes of a target index. a 0 represents NaN, + and the remainder of the values should be from 1, ..., L_i, where + L_i is the length of `self.levels[i]` + level_codes_no_fill: 2D-list-like, M x N + the same as above, but computed without backfilling + + Returns + ------- + level_codes : 2D integer array, also M x N + same as level_codes but with appropriate "carrying" operations + performed, where values which are too large are represented by + rows consisting of all 0s + """ + for i, row in enumerate(level_codes): + need_to_carry = False + # go from right to left for place-value arithmetic + for j in range(len(row) - 1, -1, -1): + max_val = len(self.levels[j]) + # the value here was too large, so backfilling returns + # -1, which, after adding 1, becomes 0 + if row[j] == 0: + need_to_carry = True + + if need_to_carry: + # if row[j] was backfilled to its value, then even + # if we are "carrying," it can remain as is + new_val = ( + row[j] + 1 + if row[j] == level_codes_no_fill[i][j] + else row[j]) + if new_val > max_val or row[j] == 0: + # at this point, no more room to carry, so the + # entire value is too large + if j == 0: + for k in range(len(row)): + row[k] = 0 + # set it to the minimum value and carry to the + # next level + else: + row[j] = 1 + # done carrying, for now + else: + row[j] = new_val + need_to_carry = False + + return level_codes + + + def _extract_level_codes(self, object target, object method=None): """ Map the requested list of (tuple) keys to their integer representations for searching in the underlying integer index. @@ -602,6 +676,10 @@ cdef class BaseMultiIndexCodesEngine: ---------- target : list-like of keys Each key is a tuple, with a label for each level of the index. + method : string (optional) + whether to fill missing keys with either the previous (using "pad" + of "ffill" or next ("bfill"/"backfill") values, in terms of the + ordering of the tuples and the underlying index Returns ------ @@ -621,95 +699,34 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zip(*target)) ], dtype='uint64').T + 1 - # idk if truthy/falsy stuff works w/cython... - # also this entire block should basically be moved into its own helper function - # or something like that + # handle intricacies required to properly respect tuple ordering + # properties if method is not None: level_codes_no_fill = np.array([ lev.get_indexer(codes) for lev, codes in zip(self.levels, zip(*target)) ], dtype='uint64').T + 1 - # TODO: add arithmetic for (backfill-only) "incremementing" when we hit - # a "0" (i.e. NaN, i.e. too large value) at level i (i.e. incremementing - # levels 1, ..., i-1). this necessarily involves "place-value arithmetic" - # w.r.t. map(len, self.levels), i.e. the max value we can have at each - # level, after which we have to set it to 0 (i.e. "1") and then "carry" - # - # then, when we hit a case where we need to "carry" past level 0, we need - # to set the whole row to -1 - # - # it will LIKELY be the case that we need to pass that past this function, - # but, for the moment, let's see if we can just do something with level - # codes - # - # my HOPE is that the result will be something like: - # sum(1 << offset for offset in offsets) (or maybe offsets[:-1], not sure - # yet....) - # - # let's impl this now! - # eventually let's see if this can be pulled out into a helper function - if method == 'backfill': - for i, row in enumerate(level_codes): - need_to_carry = False - # go from right to left for place-value arithmetic - for j in range(len(row) - 1, -1, -1): - # this is AFTER backfilling, i.e. this means value was too large - # subtract 1 bc all values here have 1 added to them - max_val = len(self.levels[j]) - if row[j] == 0: - need_to_carry = True - - if need_to_carry: - new_val = row[j] + 1 if row[j] == level_codes_no_fill[i][j] else row[j] - if new_val > max_val or row[j] == 0: - # if we're at the first row, then we're done, this whole thing - # is too big - if j == 0: - for k in range(len(row)): - row[k] = 0 - # we need to keep carrying, but will not necessarily be a problem - else: - row[j] = 1 - # done carrying, for now - else: - row[j] = new_val - need_to_carry = False - - # MOTIVATION: - # this is basically just backfilling/forward-filling a tuple into a list - # of tuples. the ordering of tuples (i.e. x_i vs. y_i, with ties broken - # by y_{i+1], y_{i+1}, etc.) is such that we want to explicitly avoid - # considering values in levels i+1, ..., n (for an n-level MultiIndex) - # when the value at level i is different w/and w/o filling - # - # - when backfilling, it'll bump the value at level i, so we want to - # decrease the values at levels i+1, ..., n, to their min value (i.e. "1"), - # since we're using an arithmetic here where 1 is 0, more or less (TODO: add - # caveats, formalizations, etc., to this). this will work since, for all - # possible values x_1, ..., x_{i-1}, x_{i+1}, ..., x_n, it's necessarily - # the case that (x_1, ..., x_n) < (x_1, ..., x_{i-1}, x_{i}', 1, ..., 1), - # for all x_{i}' > x_i - # - # - when forward-filling (aka "padding"), it'll drop the value at level i, - # and so we want to increase the values at levels i+1, ..., n, to their - # max possible values, i.e. map(len, self.levels[i+1:]) - # - # - # TODO: see if this can be replaced w/higher-level functions w/o - # sacrificing performance. presently unclear if that can be - # accomplished - # als TODO: clean this up + # necessary to respect tuple ordering. intuition is that bumping + # the value at level i should make the values at levels i+1, ..., n + # as small as possible, and vice versa for i, row in enumerate(level_codes): - # THIS is where i can apply the algorithm described earlier for j, level in enumerate(row): - # if it was filled from the prev/next value, then everything after - # that should either be the min val, if backfill, or the max, if - # using pad. i think. lemme mull this one over a bit more - if level_codes_no_fill[i][j] == 0 and level_codes[i][j] >= 1: + if level_codes_no_fill[i][j] != level_codes[i][j]: for k in range(j + 1, len(row)): - old_val = row[k] - row[k] = 1 if method == 'backfill' else len(self.levels[k]) + row[k] = ( + 1 if method == 'backfill' else + len(self.levels[k])) + break + + # after doing per-level indexing, backfilled level codes need + # additional cleanup, as too-large values are 0, which will in + # turn be backfilled to 1 without cleanup. This is not an issue + # for padded level codes because the final padding will (correctly) + # exclude them anyway + if method == 'backfill': + level_codes = self._do_backfill_carrying(level_codes, + level_codes_no_fill) return self._codes_to_ints(level_codes) @@ -728,25 +745,14 @@ cdef class BaseMultiIndexCodesEngine: indexer = (getattr(self._base, f'get_{method}_indexer') (self, lab_ints, limit=limit)) - # TODO: completely replace this with correctly-generalized fix - # with pre-processes by "adding 1" when backfilling when it finds - # 0s (i.e. -1 (i.e. NaN repr) + 1) - # - # HACK: because the backfilled representation of NAs at any level l - # will always be 2^(offset_l), as a result of adding 1 to -1 and - # then shifting by offset_l bits, when backfilling, we need to - # prevent too-large values from being treated as too small values - # the fix for this, therefore, is to change all too-small values to - # 1 larger than the max of the entity being backfilled + # handle the case where too-large values are backfilled to NaN, for + # which the integer representation from _extract_level_codes() is 0 if method == 'backfill': - # inherently necessary bc NaNs are stored as a value which would - # otherwise be backfilled to zero - # any too-large values will be backfilled into the minimum index value for i in range(len(indexer)): if lab_ints[i] == 0: indexer[i] = -1 - # TODO: try using argsort more to accomplish this. does not matter for now, though + # restore the ordering new_indexer = [0] * len(indexer) for i, idx in enumerate(order): new_indexer[idx] = indexer[i] From fbe7684c073c1aa90db0de3e217458eede131741 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 14:57:11 -0800 Subject: [PATCH 05/49] update tests to reflect changes since 0.24.2 --- pandas/tests/frame/indexing/test_indexing.py | 50 ++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a71b4a0983c63..bf9f7b2c18b76 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -22,8 +22,10 @@ ) import pandas._testing as tm from pandas.arrays import SparseArray + import pandas.core.common as com from pandas.core.indexing import IndexingError +import pandas.util.testing as tm from pandas.tseries.offsets import BDay @@ -1626,6 +1628,54 @@ def test_reindex_methods(self, method, expected_values): actual = df[::-1].reindex(target, method=switched_method) tm.assert_frame_equal(expected, actual) + def test_reindex_with_multi_index(self): + df = pd.DataFrame({ + "a": [0] * 7, + "b": list(range(7)), + "c": list(range(7)), + }).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product( + [[0], new_index], + names=["a", "b"]) + + reindexed_df = pd.DataFrame({ + "a": [0] * 4, + "b": new_index, + "c": [np.nan, 2.0, 5.0, np.nan], + }).set_index(["a", "b"]) + reindexed_and_backfilled_df = pd.DataFrame({ + "a": [0] * 4, + "b": new_index, + "c": [1, 2, 5, 6], + }).set_index(["a", "b"]) + reindexed_and_padded_df = pd.DataFrame({ + "a": [0] * 4, + "b": new_index, + "c": [0, 2, 5, 5], + }).set_index(["a", "b"]) + + tm.assert_frame_equal(df.reindex(new_multi_index), reindexed_df) + tm.assert_frame_equal(df.reindex(new_multi_index, method="bfill"), + reindexed_and_backfilled_df) + tm.assert_frame_equal(df.reindex(new_multi_index, method="backfill"), + reindexed_and_backfilled_df) + tm.assert_frame_equal(df.reindex(new_multi_index, method="ffill"), + reindexed_and_padded_df) + tm.assert_frame_equal(df.reindex(new_multi_index, method="pad"), + reindexed_and_padded_df) + + def test_reindex_subclass(self): + # https://github.com/pandas-dev/pandas/issues/31925 + class MyDataFrame(DataFrame): + pass + + expected = DataFrame() + df = MyDataFrame() + result = df.reindex_like(expected) + + tm.assert_frame_equal(result, expected) + def test_reindex_methods_nearest_special(self): df = pd.DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) From aeccfed03fcedcbbbe98ad85fe356e7ac6b8bdba Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 14:58:15 -0800 Subject: [PATCH 06/49] handle tuple-ordering correctly across several levels when carrying --- pandas/_libs/index.pyx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index b65adf43e6c8e..f2ae26520f81c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -634,6 +634,7 @@ cdef class BaseMultiIndexCodesEngine: """ for i, row in enumerate(level_codes): need_to_carry = False + highest_level_adjustment = None # go from right to left for place-value arithmetic for j in range(len(row) - 1, -1, -1): max_val = len(self.levels[j]) @@ -655,15 +656,21 @@ cdef class BaseMultiIndexCodesEngine: if j == 0: for k in range(len(row)): row[k] = 0 - # set it to the minimum value and carry to the - # next level + # still possible value is not too large, but need to + # keep track of values which will need to be decreased else: - row[j] = 1 + highest_level_adjustment = j # done carrying, for now else: row[j] = new_val need_to_carry = False + # if we increased any values, all lower levels (visually, all + # levels to the left) should be set to their lowest level + if row[0] > 0 and highest_level_adjustment is not None: + for k in range(highest_level_adjustment + 1, len(row)): + row[k] = 1 + return level_codes From 1472e6465f20602034a98efb5371a884769cf324 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 15:01:18 -0800 Subject: [PATCH 07/49] rm testing print statements and fix rebase error --- pandas/_libs/index.pyx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f2ae26520f81c..44c9662651171 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -693,14 +693,6 @@ cdef class BaseMultiIndexCodesEngine: int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ - level_codes = [lev.get_indexer(codes, method=method) + 1 for lev, codes - in zip(self.levels, zip(*target))] - return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - print('\n\n\n\n') - print('extracting level codes from:\n{}\nwith:\n{}\nwith method: {}\n\n'.format( - str(self.levels), - str(zip(*target)), - str(method))) level_codes = np.array([ lev.get_indexer(codes, method=method) for lev, codes in zip(self.levels, zip(*target)) From 44e30f08c1eeffc211a5bc6a49fdacfc5f7b019d Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 15:22:48 -0800 Subject: [PATCH 08/49] address flake8 style complaints --- pandas/tests/indexes/multi/test_indexing.py | 42 +++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 56730b13fd27b..c85b6bec8431e 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -291,6 +291,48 @@ def test_get_indexer_three_or_more_levels(self): assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1, method='pad'), [0, 0, 0, 0, 1, 2, 3, 3, 3, 3, 3, 3]) + def test_get_indexer_crossing_levels(self): + """ tests a corner case with get_indexer() with MultiIndexes where, when we + need to "carry" across levels, proper tuple ordering is respected + + the MultiIndexes used in this test, visually, are: + + mult_idx_1: + 0: 1 1 1 1 + 1: 2 + 2: 2 1 + 3: 2 + 4: 1 2 1 1 + 5: 2 + 6: 2 1 + 7: 2 + 8: 2 1 1 1 + 9: 2 + 10: 2 1 + 11: 2 + 12: 2 2 1 1 + 13: 2 + 14: 2 1 + 15: 2 + + mult_idx_2: + 0: 1 3 2 2 + 1: 2 3 2 2 + """ + mult_idx_1 = pd.MultiIndex.from_product([[1, 2]] * 4) + mult_idx_2 = pd.MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)]) + + # show the tuple orderings, which get_indexer() should respect + assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8] + assert mult_idx_1[-1] < mult_idx_2[1] + + tm.assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2), + np.array([-1, -1])) + tm.assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='bfill'), + np.array([8, -1])) + tm.assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='ffill'), + np.array([7, 15])) + def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( From 1394ebf15bf19e9f0c8d13692ae45bfd25cc3612 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 15:36:28 -0800 Subject: [PATCH 09/49] apply output of --- pandas/tests/frame/indexing/test_indexing.py | 58 +++++++++----------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index bf9f7b2c18b76..6f123514ada8f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1629,41 +1629,35 @@ def test_reindex_methods(self, method, expected_values): tm.assert_frame_equal(expected, actual) def test_reindex_with_multi_index(self): - df = pd.DataFrame({ - "a": [0] * 7, - "b": list(range(7)), - "c": list(range(7)), - }).set_index(["a", "b"]) + df = pd.DataFrame( + {"a": [0] * 7, "b": list(range(7)), "c": list(range(7)),} + ).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] - new_multi_index = MultiIndex.from_product( - [[0], new_index], - names=["a", "b"]) - - reindexed_df = pd.DataFrame({ - "a": [0] * 4, - "b": new_index, - "c": [np.nan, 2.0, 5.0, np.nan], - }).set_index(["a", "b"]) - reindexed_and_backfilled_df = pd.DataFrame({ - "a": [0] * 4, - "b": new_index, - "c": [1, 2, 5, 6], - }).set_index(["a", "b"]) - reindexed_and_padded_df = pd.DataFrame({ - "a": [0] * 4, - "b": new_index, - "c": [0, 2, 5, 5], - }).set_index(["a", "b"]) + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + reindexed_df = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan],} + ).set_index(["a", "b"]) + reindexed_and_backfilled_df = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6],} + ).set_index(["a", "b"]) + reindexed_and_padded_df = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5],} + ).set_index(["a", "b"]) tm.assert_frame_equal(df.reindex(new_multi_index), reindexed_df) - tm.assert_frame_equal(df.reindex(new_multi_index, method="bfill"), - reindexed_and_backfilled_df) - tm.assert_frame_equal(df.reindex(new_multi_index, method="backfill"), - reindexed_and_backfilled_df) - tm.assert_frame_equal(df.reindex(new_multi_index, method="ffill"), - reindexed_and_padded_df) - tm.assert_frame_equal(df.reindex(new_multi_index, method="pad"), - reindexed_and_padded_df) + tm.assert_frame_equal( + df.reindex(new_multi_index, method="bfill"), reindexed_and_backfilled_df + ) + tm.assert_frame_equal( + df.reindex(new_multi_index, method="backfill"), reindexed_and_backfilled_df + ) + tm.assert_frame_equal( + df.reindex(new_multi_index, method="ffill"), reindexed_and_padded_df + ) + tm.assert_frame_equal( + df.reindex(new_multi_index, method="pad"), reindexed_and_padded_df + ) def test_reindex_subclass(self): # https://github.com/pandas-dev/pandas/issues/31925 From df68ca9f2b24765dc8304a15d4e3dac1da8e8824 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 15:48:13 -0800 Subject: [PATCH 10/49] fix test import names and ordering --- pandas/tests/frame/indexing/test_indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 6f123514ada8f..1849fb8958e42 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -25,7 +25,6 @@ import pandas.core.common as com from pandas.core.indexing import IndexingError -import pandas.util.testing as tm from pandas.tseries.offsets import BDay From bba7e51689eb01de4d12d4912321054efbe24ef3 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 15:55:33 -0800 Subject: [PATCH 11/49] move whatsnew addition from Indexing to MultiIndex section --- doc/source/whatsnew/v1.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6597b764581a4..4a3de8837e4c1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1160,7 +1160,6 @@ MultiIndex - Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) - Series and MultiIndex `.drop` with `MultiIndex` raise exception if labels not in given in level (:issue:`8594`) -- I/O ^^^ From 3f3b7994315651d7422062622d8eae795423d8ed Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 16:22:27 -0800 Subject: [PATCH 12/49] address linting issues --- pandas/_libs/index.pyx | 20 +++++++++----------- pandas/tests/frame/indexing/test_indexing.py | 8 ++++---- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 44c9662651171..7559fcec04ef4 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -673,7 +673,6 @@ cdef class BaseMultiIndexCodesEngine: return level_codes - def _extract_level_codes(self, object target, object method=None): """ Map the requested list of (tuple) keys to their integer representations @@ -711,12 +710,11 @@ cdef class BaseMultiIndexCodesEngine: # as small as possible, and vice versa for i, row in enumerate(level_codes): for j, level in enumerate(row): - if level_codes_no_fill[i][j] != level_codes[i][j]: - for k in range(j + 1, len(row)): - row[k] = ( - 1 if method == 'backfill' else - len(self.levels[k])) - break + if level_codes_no_fill[i][j] != level_codes[i][j]: + for k in range(j + 1, len(row)): + row[k] = (1 if method == 'backfill' else + len(self.levels[k])) + break # after doing per-level indexing, backfilled level codes need # additional cleanup, as too-large values are 0, which will in @@ -747,14 +745,14 @@ cdef class BaseMultiIndexCodesEngine: # handle the case where too-large values are backfilled to NaN, for # which the integer representation from _extract_level_codes() is 0 if method == 'backfill': - for i in range(len(indexer)): - if lab_ints[i] == 0: - indexer[i] = -1 + for i in range(len(indexer)): + if lab_ints[i] == 0: + indexer[i] = -1 # restore the ordering new_indexer = [0] * len(indexer) for i, idx in enumerate(order): - new_indexer[idx] = indexer[i] + new_indexer[idx] = indexer[i] return new_indexer else: indexer = self._base.get_indexer(self, lab_ints) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 1849fb8958e42..4bcc1ac620ee9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1629,19 +1629,19 @@ def test_reindex_methods(self, method, expected_values): def test_reindex_with_multi_index(self): df = pd.DataFrame( - {"a": [0] * 7, "b": list(range(7)), "c": list(range(7)),} + {"a": [0] * 7, "b": list(range(7)), "c": list(range(7))} ).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) reindexed_df = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan],} + {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan]} ).set_index(["a", "b"]) reindexed_and_backfilled_df = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6],} + {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6]} ).set_index(["a", "b"]) reindexed_and_padded_df = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5],} + {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5]} ).set_index(["a", "b"]) tm.assert_frame_equal(df.reindex(new_multi_index), reindexed_df) From 3457192e7972bbcedd721a8f5e5d778019872ca2 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 21:30:51 -0800 Subject: [PATCH 13/49] use intermediary variables in tests and add issue link in comments --- pandas/tests/frame/indexing/test_indexing.py | 38 ++-- pandas/tests/indexes/multi/test_indexing.py | 216 +++++++++++-------- 2 files changed, 147 insertions(+), 107 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 4bcc1ac620ee9..a004d1de04a51 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1628,35 +1628,41 @@ def test_reindex_methods(self, method, expected_values): tm.assert_frame_equal(expected, actual) def test_reindex_with_multi_index(self): + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # apropos of https://github.com/pandas-dev/pandas/issues/29896 df = pd.DataFrame( {"a": [0] * 7, "b": list(range(7)), "c": list(range(7))} ).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) - reindexed_df = pd.DataFrame( + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = pd.DataFrame( {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan]} ).set_index(["a", "b"]) - reindexed_and_backfilled_df = pd.DataFrame( + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = pd.DataFrame( {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6]} ).set_index(["a", "b"]) - reindexed_and_padded_df = pd.DataFrame( + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = pd.DataFrame( {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5]} ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) - tm.assert_frame_equal(df.reindex(new_multi_index), reindexed_df) - tm.assert_frame_equal( - df.reindex(new_multi_index, method="bfill"), reindexed_and_backfilled_df - ) - tm.assert_frame_equal( - df.reindex(new_multi_index, method="backfill"), reindexed_and_backfilled_df - ) - tm.assert_frame_equal( - df.reindex(new_multi_index, method="ffill"), reindexed_and_padded_df - ) - tm.assert_frame_equal( - df.reindex(new_multi_index, method="pad"), reindexed_and_padded_df - ) + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) def test_reindex_subclass(self): # https://github.com/pandas-dev/pandas/issues/31925 diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index c85b6bec8431e..33c51bed59641 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -214,11 +214,87 @@ def test_get_indexer_and_fill(self): expected_indexer_padded = np.array([0, 2, 3]) assert_almost_equal(expected_indexer_padded, indexer_padded) + + def test_get_indexer_nearest(self): + midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = ( + "method='nearest' not implemented yet for MultiIndex; " + "see GitHub issue 9365" + ) + with pytest.raises(NotImplementedError, match=msg): + midx.get_indexer(["a"], method="nearest") + msg = "tolerance not implemented yet for MultiIndex" + with pytest.raises(NotImplementedError, match=msg): + midx.get_indexer(["a"], method="pad", tolerance=2) + + def test_get_indexer_categorical_time(self): + # https://github.com/pandas-dev/pandas/issues/21390 + midx = MultiIndex.from_product( + [ + Categorical(["a", "b", "c"]), + Categorical(date_range("2012-01-01", periods=3, freq="H")), + ] + ) + result = midx.get_indexer(midx) + tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) + + @pytest.mark.parametrize( + "index_arr,labels,expected", + [ + ( + [[1, np.nan, 2], [3, 4, 5]], + [1, np.nan, 2], + np.array([-1, -1, -1], dtype=np.intp), + ), + ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)), + ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)), + ( + [[1, 2, 3], [np.nan, 4, 5]], + [np.nan, 4, 5], + np.array([-1, -1, -1], dtype=np.intp), + ), + ], + ) + def test_get_indexer_with_missing_value(self, index_arr, labels, expected): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_indexer(labels) + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_methods(self): + """ test getting an indexer for another index using the backfill method + + apropos of https://github.com/pandas-dev/pandas/issues/29896 + """ + mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, 2, 3], dtype="int64") + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_almost_equal(expected, backfill_indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([0, 2, 3], dtype="int64") + tm.assert_almost_equal(expected, pad_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([0, 2, 3], dtype="int64") + tm.assert_almost_equal(expected, pad_indexer) + def test_get_indexer_three_or_more_levels(self): """ tests get_indexer() on MultiIndexes with 3+ levels - visually, these are + apropos of https://github.com/pandas-dev/pandas/issues/29896 + visually, these are mult_idx_1: 0: 1 2 5 1: 7 @@ -238,14 +314,14 @@ def test_get_indexer_three_or_more_levels(self): 1: 1 5 9 2: 1 6 7 3: 2 1 6 - 4: 3 6 8 + 4: 2 7 6 + 5: 2 7 8 + 6: 3 6 8 """ mult_idx_1 = pd.MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) - mult_idx_2 = pd.MultiIndex.from_tuples([(1, 1, 8), - (1, 5, 9), - (1, 6, 7), - (2, 1, 6), - (3, 6, 8)]) + mult_idx_2 = pd.MultiIndex.from_tuples( + [(1, 1, 8), (1, 5, 9), (1, 6, 7), (2, 1, 6), (2, 7, 7), (2, 7, 8), (3, 6, 8)] + ) # sanity check assert mult_idx_1.is_monotonic assert mult_idx_1.is_unique @@ -257,18 +333,22 @@ def test_get_indexer_three_or_more_levels(self): assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4] assert mult_idx_1[5] == mult_idx_2[2] assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6] - assert mult_idx_1[-1] < mult_idx_[4] + assert mult_idx_1[5] < mult_idx_2[4] < mult_idx_1[6] + assert mult_idx_1[5] < mult_idx_2[5] < mult_idx_1[6] + assert mult_idx_1[-1] < mult_idx_2[6] indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) - assert_almost_equal(indexer_no_fill, [-1, -1, 5, -1, -1]) + tm.assert_almost_equal(indexer_no_fill, np.array([-1, -1, 5, -1, -1, -1, -1])) # test with backfilling - indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method='backfill') - assert_almost_equal(indexer_backfilled, [0, 4, 5, 6, -1]) + indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype="int64") + tm.assert_almost_equal(expected, indexer_backfilled) # now, the same thing, but forward-filled (aka "padded") - indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method='pad') - assert_almost_equal(indexer_padded, [-1, 3, 5, 5, 11]) + indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype="int64") + tm.assert_almost_equal(expected, indexer_padded) # now, do the indexing in the other direction assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1] @@ -277,26 +357,33 @@ def test_get_indexer_three_or_more_levels(self): assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1] assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2] assert mult_idx_2[2] == mult_idx_1[5] - assert mult_idx_2[3] < mult_idx_1[6] < mult_idx_2[4] - assert mult_idx_2[3] < mult_idx_1[7] < mult_idx_2[4] - assert mult_idx_2[3] < mult_idx_1[8] < mult_idx_2[4] - assert mult_idx_2[3] < mult_idx_1[9] < mult_idx_2[4] - assert mult_idx_2[3] < mult_idx_1[10] < mult_idx_2[4] - assert mult_idx_2[3] < mult_idx_1[11] < mult_idx_2[4] - - assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1), - [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1]) - assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1, method='bfill'), - [1, 1, 1, 1, 2, 2, 4, 4, 4, 4, 4, 4]) - assert_almost_equal(mult_idx_2.get_indexer(mult_idx_1, method='pad'), - [0, 0, 0, 0, 1, 2, 3, 3, 3, 3, 3, 3]) - - def test_get_indexer_crossing_levels(self): + assert mult_idx_2[5] < mult_idx_1[6] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[7] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[8] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[9] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[10] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6] + + indexer = mult_idx_2.get_indexer(mult_idx_1) + expected = np.array([-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], + dtype="int64") + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") + expected = np.array([1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype="int64") + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") + expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype="int64") + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_backfill_with_carrying(self): """ tests a corner case with get_indexer() with MultiIndexes where, when we need to "carry" across levels, proper tuple ordering is respected - the MultiIndexes used in this test, visually, are: + apropos of https://github.com/pandas-dev/pandas/issues/29896 + the MultiIndexes used in this test, visually, are: mult_idx_1: 0: 1 1 1 1 1: 2 @@ -326,71 +413,18 @@ def test_get_indexer_crossing_levels(self): assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8] assert mult_idx_1[-1] < mult_idx_2[1] - tm.assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2), - np.array([-1, -1])) - tm.assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='bfill'), - np.array([8, -1])) - tm.assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method='ffill'), - np.array([7, 15])) + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1], dtype="int64") + tm.assert_almost_equal(expected, indexer) - def test_get_indexer_nearest(self): - midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) - msg = ( - "method='nearest' not implemented yet for MultiIndex; " - "see GitHub issue 9365" - ) - with pytest.raises(NotImplementedError, match=msg): - midx.get_indexer(["a"], method="nearest") - msg = "tolerance not implemented yet for MultiIndex" - with pytest.raises(NotImplementedError, match=msg): - midx.get_indexer(["a"], method="pad", tolerance=2) + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([8, -1], dtype="int64") + tm.assert_almost_equal(expected, backfill_indexer) - def test_get_indexer_categorical_time(self): - # https://github.com/pandas-dev/pandas/issues/21390 - midx = MultiIndex.from_product( - [ - Categorical(["a", "b", "c"]), - Categorical(date_range("2012-01-01", periods=3, freq="H")), - ] - ) - result = midx.get_indexer(midx) - tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp)) - - @pytest.mark.parametrize( - "index_arr,labels,expected", - [ - ( - [[1, np.nan, 2], [3, 4, 5]], - [1, np.nan, 2], - np.array([-1, -1, -1], dtype=np.intp), - ), - ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)), - ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)), - ( - [[1, 2, 3], [np.nan, 4, 5]], - [np.nan, 4, 5], - np.array([-1, -1, -1], dtype=np.intp), - ), - ], - ) - def test_get_indexer_with_missing_value(self, index_arr, labels, expected): - # issue 19132 - idx = MultiIndex.from_arrays(index_arr) - result = idx.get_indexer(labels) - tm.assert_numpy_array_equal(result, expected) + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([7, 15], dtype="int64") + tm.assert_almost_equal(expected, pad_indexer) - def test_get_indexer_and_fill(self): - """ test getting an indexer for another index using the backfill method """ - mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) - mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) - - assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2), np.array([-1, 2, 3])) - for method in ("bfill", "backfill"): - assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method=method), - np.array([1, 2, 3])) - for method in ("ffill", "pad"): - assert_almost_equal(mult_idx_1.get_indexer(mult_idx_2, method=method), - np.array([0, 2, 3])) def test_getitem(idx): # scalar From 5f16fc819f0891ba949d9913cd72fa5386ac09e6 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 21:51:08 -0800 Subject: [PATCH 14/49] add one-line summary for _do_backfill_carrying() method --- pandas/_libs/index.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 7559fcec04ef4..97b0d95ee23e6 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -596,6 +596,8 @@ cdef class BaseMultiIndexCodesEngine: def _do_backfill_carrying(self, object level_codes, object level_codes_no_fill): """ + cleanup for the case of per-level backfills of too-large values to -1 + given a 2d list of level_codes, i.e. integers representing the index of a target index inside `self.level_codes` for each level, which were backfilled at each level, handle cases where too-large values @@ -618,7 +620,7 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- - level_codes : 2D array, M x N (where self has N levels) + level_codes : ndarray[ndim=2] per-level backfilled codes of a target index. a 0 represents NaN, and the remainder of the values should be from 1, ..., L_i, where L_i is the length of `self.levels[i]` From 7241973d8342148c76941ec24f095654c4d25a27 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 21:55:30 -0800 Subject: [PATCH 15/49] use comments instead of docstrings with issue link in tests --- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/indexes/multi/test_indexing.py | 135 ++++++++----------- 2 files changed, 54 insertions(+), 83 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a004d1de04a51..769f1d9f8064f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1628,8 +1628,8 @@ def test_reindex_methods(self, method, expected_values): tm.assert_frame_equal(expected, actual) def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 # tests for reindexing a multi-indexed DataFrame with a new MultiIndex - # apropos of https://github.com/pandas-dev/pandas/issues/29896 df = pd.DataFrame( {"a": [0] * 7, "b": list(range(7)), "c": list(range(7))} ).set_index(["a", "b"]) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 33c51bed59641..255b0b00e8a00 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -193,28 +193,6 @@ def test_get_indexer(self): with pytest.raises(InvalidIndexError, match=msg): idx1.get_indexer(idx2) - def test_get_indexer_and_fill(self): - """ test getting an indexer for another index using the backfill method """ - mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) - mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) - - indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) - expected_indexer_no_fill = np.array([-1, 2, 3]) - assert_almost_equal(expected_indexer_no_fill, indexer_no_fill) - - for method in ("bfill", "backfill"): - indexer_backfilled = multi_idx_1.get_indexer(mult_idx_2, - method=method) - expected_indexer_backfilled = np.array([1, 2, 3]) - assert_almost_equal(expected_indexer_backfilled, indexer_backfilled) - - for method in ("ffill", "pad"): - indexer_padded = mult_idx_1.get_indexer(mult_idx_2, - method=method) - expected_indexer_padded = np.array([0, 2, 3]) - assert_almost_equal(expected_indexer_padded, indexer_padded) - - def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) msg = ( @@ -262,10 +240,8 @@ def test_get_indexer_with_missing_value(self, index_arr, labels, expected): tm.assert_numpy_array_equal(result, expected) def test_get_indexer_methods(self): - """ test getting an indexer for another index using the backfill method - - apropos of https://github.com/pandas-dev/pandas/issues/29896 - """ + # https://github.com/pandas-dev/pandas/issues/29896 + # test getting an indexer for another index with different methods mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) @@ -290,34 +266,31 @@ def test_get_indexer_methods(self): tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_three_or_more_levels(self): - """ tests get_indexer() on MultiIndexes with 3+ levels - - apropos of https://github.com/pandas-dev/pandas/issues/29896 - - visually, these are - mult_idx_1: - 0: 1 2 5 - 1: 7 - 2: 4 5 - 3: 7 - 4: 6 5 - 5: 7 - 6: 3 2 5 - 7: 7 - 8: 4 5 - 9: 7 - 10: 6 5 - 11: 7 - - mult_idx_2: - 0: 1 1 8 - 1: 1 5 9 - 2: 1 6 7 - 3: 2 1 6 - 4: 2 7 6 - 5: 2 7 8 - 6: 3 6 8 - """ + # https://github.com/pandas-dev/pandas/issues/29896 + # tests get_indexer() on MultiIndexes with 3+ levels + # visually, these are + # mult_idx_1: + # 0: 1 2 5 + # 1: 7 + # 2: 4 5 + # 3: 7 + # 4: 6 5 + # 5: 7 + # 6: 3 2 5 + # 7: 7 + # 8: 4 5 + # 9: 7 + # 10: 6 5 + # 11: 7 + # + # mult_idx_2: + # 0: 1 1 8 + # 1: 1 5 9 + # 2: 1 6 7 + # 3: 2 1 6 + # 4: 2 7 6 + # 5: 2 7 8 + # 6: 3 6 8 mult_idx_1 = pd.MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) mult_idx_2 = pd.MultiIndex.from_tuples( [(1, 1, 8), (1, 5, 9), (1, 6, 7), (2, 1, 6), (2, 7, 7), (2, 7, 8), (3, 6, 8)] @@ -378,34 +351,32 @@ def test_get_indexer_three_or_more_levels(self): tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_backfill_with_carrying(self): - """ tests a corner case with get_indexer() with MultiIndexes where, when we - need to "carry" across levels, proper tuple ordering is respected - - apropos of https://github.com/pandas-dev/pandas/issues/29896 - - the MultiIndexes used in this test, visually, are: - mult_idx_1: - 0: 1 1 1 1 - 1: 2 - 2: 2 1 - 3: 2 - 4: 1 2 1 1 - 5: 2 - 6: 2 1 - 7: 2 - 8: 2 1 1 1 - 9: 2 - 10: 2 1 - 11: 2 - 12: 2 2 1 1 - 13: 2 - 14: 2 1 - 15: 2 - - mult_idx_2: - 0: 1 3 2 2 - 1: 2 3 2 2 - """ + # https://github.com/pandas-dev/pandas/issues/29896 + # tests a corner case with get_indexer() with MultiIndexes where, when we + # need to "carry" across levels, proper tuple ordering is respected + # + # the MultiIndexes used in this test, visually, are: + # mult_idx_1: + # 0: 1 1 1 1 + # 1: 2 + # 2: 2 1 + # 3: 2 + # 4: 1 2 1 1 + # 5: 2 + # 6: 2 1 + # 7: 2 + # 8: 2 1 1 1 + # 9: 2 + # 10: 2 1 + # 11: 2 + # 12: 2 2 1 1 + # 13: 2 + # 14: 2 1 + # 15: 2 + # + # mult_idx_2: + # 0: 1 3 2 2 + # 1: 2 3 2 2 mult_idx_1 = pd.MultiIndex.from_product([[1, 2]] * 4) mult_idx_2 = pd.MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)]) From 917a1403fa878527ee807f6926a188e2e7e1db8f Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 22:23:27 -0800 Subject: [PATCH 16/49] rm unnecessary blank line --- pandas/tests/frame/indexing/test_indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 769f1d9f8064f..93ab19769b67a 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1636,7 +1636,6 @@ def test_reindex_with_multi_index(self): new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) - # reindexing w/o a `method` value reindexed = df.reindex(new_multi_index) expected = pd.DataFrame( From 3ea4bae4134c8fa54e37eaf71714f20a38f37df9 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Mon, 6 Jan 2020 23:19:54 -0800 Subject: [PATCH 17/49] use indexer dtypes in tests for compatibility with windows --- pandas/tests/indexes/multi/test_indexing.py | 25 +++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 255b0b00e8a00..82f5c96251fd0 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -246,23 +246,23 @@ def test_get_indexer_methods(self): mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) indexer = mult_idx_1.get_indexer(mult_idx_2) - expected = np.array([-1, 2, 3], dtype="int64") + expected = np.array([-1, 2, 3], dtype=indexer.dtype) tm.assert_almost_equal(expected, indexer) backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill") - expected = np.array([1, 2, 3], dtype="int64") + expected = np.array([1, 2, 3], dtype=backfill_indexer.dtype) tm.assert_almost_equal(expected, backfill_indexer) backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") - expected = np.array([1, 2, 3], dtype="int64") + expected = np.array([1, 2, 3], dtype=backfill_indexer.dtype) tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad") - expected = np.array([0, 2, 3], dtype="int64") + expected = np.array([0, 2, 3], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") - expected = np.array([0, 2, 3], dtype="int64") + expected = np.array([0, 2, 3], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_three_or_more_levels(self): @@ -315,12 +315,12 @@ def test_get_indexer_three_or_more_levels(self): # test with backfilling indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill") - expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype="int64") + expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype=indexer_backfilled.dtype) tm.assert_almost_equal(expected, indexer_backfilled) # now, the same thing, but forward-filled (aka "padded") indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad") - expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype="int64") + expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype=indexer_padded.dtype) tm.assert_almost_equal(expected, indexer_padded) # now, do the indexing in the other direction @@ -343,11 +343,12 @@ def test_get_indexer_three_or_more_levels(self): tm.assert_almost_equal(expected, indexer) backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") - expected = np.array([1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype="int64") + expected = np.array([1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], + dtype=backfill_indexer.dtype) tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") - expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype="int64") + expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5],dtype="int64") tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_backfill_with_carrying(self): @@ -385,15 +386,15 @@ def test_get_indexer_backfill_with_carrying(self): assert mult_idx_1[-1] < mult_idx_2[1] indexer = mult_idx_1.get_indexer(mult_idx_2) - expected = np.array([-1, -1], dtype="int64") + expected = np.array([-1, -1], dtype=indexer.dtype) tm.assert_almost_equal(expected, indexer) backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") - expected = np.array([8, -1], dtype="int64") + expected = np.array([8, -1], dtype=backfill_indexer.dtype) tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") - expected = np.array([7, 15], dtype="int64") + expected = np.array([7, 15], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) From c52d87a59f41f3161b6c26b755d5dd64db9ea7f8 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 7 Jan 2020 10:02:25 -0800 Subject: [PATCH 18/49] one more such case --- pandas/tests/indexes/multi/test_indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 82f5c96251fd0..23641aa776d5b 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -348,7 +348,8 @@ def test_get_indexer_three_or_more_levels(self): tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") - expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5],dtype="int64") + expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], + dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_backfill_with_carrying(self): From d72bdd618071dceec7082e2287a593efa1347974 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 7 Jan 2020 10:36:18 -0800 Subject: [PATCH 19/49] cleanup for same such cases --- pandas/tests/indexes/multi/test_indexing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 23641aa776d5b..a7d34c67bd72b 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -311,7 +311,9 @@ def test_get_indexer_three_or_more_levels(self): assert mult_idx_1[-1] < mult_idx_2[6] indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) - tm.assert_almost_equal(indexer_no_fill, np.array([-1, -1, 5, -1, -1, -1, -1])) + expected = np.array([-1, -1, 5, -1, -1, -1, -1], + dtype=indexer_no_fill.dtype) + tm.assert_almost_equal(expected, indexer_no_fill) # test with backfilling indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill") @@ -339,7 +341,7 @@ def test_get_indexer_three_or_more_levels(self): indexer = mult_idx_2.get_indexer(mult_idx_1) expected = np.array([-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], - dtype="int64") + dtype=indexer.dtype) tm.assert_almost_equal(expected, indexer) backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") @@ -352,7 +354,8 @@ def test_get_indexer_three_or_more_levels(self): dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) - def test_get_indexer_backfill_with_carrying(self): + + def test_get_indexer_backfill_with_carrying(): # https://github.com/pandas-dev/pandas/issues/29896 # tests a corner case with get_indexer() with MultiIndexes where, when we # need to "carry" across levels, proper tuple ordering is respected From 6b2edd883079b1185b195619a0abefb175af48fa Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 7 Jan 2020 11:15:21 -0800 Subject: [PATCH 20/49] run black pandas again, update formatting of a few lines --- pandas/tests/indexes/multi/test_indexing.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index a7d34c67bd72b..8a54d4dc0c5ef 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -311,8 +311,7 @@ def test_get_indexer_three_or_more_levels(self): assert mult_idx_1[-1] < mult_idx_2[6] indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) - expected = np.array([-1, -1, 5, -1, -1, -1, -1], - dtype=indexer_no_fill.dtype) + expected = np.array([-1, -1, 5, -1, -1, -1, -1], dtype=indexer_no_fill.dtype) tm.assert_almost_equal(expected, indexer_no_fill) # test with backfilling @@ -340,22 +339,22 @@ def test_get_indexer_three_or_more_levels(self): assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6] indexer = mult_idx_2.get_indexer(mult_idx_1) - expected = np.array([-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], - dtype=indexer.dtype) + expected = np.array( + [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], dtype=indexer.dtype + ) tm.assert_almost_equal(expected, indexer) backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") - expected = np.array([1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], - dtype=backfill_indexer.dtype) + expected = np.array( + [1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype=backfill_indexer.dtype + ) tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") - expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], - dtype=pad_indexer.dtype) + expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) - - def test_get_indexer_backfill_with_carrying(): + def test_get_indexer_backfill_with_carrying(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests a corner case with get_indexer() with MultiIndexes where, when we # need to "carry" across levels, proper tuple ordering is respected From 9014f965c03a10e6b3fdb6583cf5910fdf86c53b Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 30 Jan 2020 11:11:09 -0800 Subject: [PATCH 21/49] move whatsnew entry from 1.0.0 -> 1.0.1 --- doc/source/whatsnew/v1.0.0.rst | 1 + doc/source/whatsnew/v1.0.1.rst | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4a3de8837e4c1..6597b764581a4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1160,6 +1160,7 @@ MultiIndex - Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) - Series and MultiIndex `.drop` with `MultiIndex` raise exception if labels not in given in level (:issue:`8594`) +- I/O ^^^ diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index c42aab6de4cc3..0573c9c77e3eb 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -67,8 +67,6 @@ Bug fixes **Interval** -- Bug in :meth:`Series.shift` with ``interval`` dtype raising a ``TypeError`` when shifting an interval array of integers or datetimes (:issue:`34195`) - .. --------------------------------------------------------------------------- .. _whatsnew_101.contributors: From 09adf2c9e58a67b631ad6c4b4e2038c01ba2f9d3 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 30 Jan 2020 11:22:31 -0800 Subject: [PATCH 22/49] actually, mv it to 1.1.0 since that appears to be next release --- doc/source/whatsnew/v1.0.1.rst | 30 ++++++++++++++++++++++++++++++ doc/source/whatsnew/v1.1.0.rst | 1 + 2 files changed, 31 insertions(+) diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index 0573c9c77e3eb..8cc21b2b2a635 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -64,6 +64,36 @@ Bug fixes **Plotting** - Plotting tz-aware timeseries no longer gives UserWarning (:issue:`31205`) +======= +- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`) +- Fixed regression in :class:`DataFrame` setting values with a slice (e.g. ``df[-4:] = 1``) indexing by label instead of position (:issue:`31469`) +- +- +- Bug where assigning to a :class:`Series` using a IntegerArray / BooleanArray as a mask would raise ``TypeError`` (:issue:`31446`) + +Missing +^^^^^^^ + +- +- + +MultiIndex +^^^^^^^^^^ + +- + +I/O +^^^ + +- Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`) +- +- + +Plotting +^^^^^^^^ + +- +- **Interval** diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6f2b9b4f946c7..c6e4d547c7624 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -407,6 +407,7 @@ MultiIndex # Common elements are now guaranteed to be ordered by the left side left.intersection(right, sort=False) +- Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options (:issue:`29896`) - I/O From f99428690363bd8628a581e359c63d4314b12e2d Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 30 Jan 2020 11:39:07 -0800 Subject: [PATCH 23/49] sort imports in indexing tests with isort --- pandas/tests/frame/indexing/test_indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 93ab19769b67a..29e7eb895b3aa 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -22,7 +22,6 @@ ) import pandas._testing as tm from pandas.arrays import SparseArray - import pandas.core.common as com from pandas.core.indexing import IndexingError From 734b6852c59a9dbaf279dfe5d054bdfcacb71627 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 30 Jan 2020 11:43:37 -0800 Subject: [PATCH 24/49] revert unrelated change to not-used 1.0.1 release notes --- doc/source/whatsnew/v1.0.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index 8cc21b2b2a635..a8cc4dcd770e2 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -80,6 +80,7 @@ Missing MultiIndex ^^^^^^^^^^ +- - I/O From f69cd1337631628ae17e1e62fc5c08805653b568 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 18:50:06 -0800 Subject: [PATCH 25/49] add example code in whatsnew --- doc/source/whatsnew/v1.1.0.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c6e4d547c7624..37d60e7a8903d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -408,6 +408,17 @@ MultiIndex left.intersection(right, sort=False) - Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options (:issue:`29896`) +.. ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D'] + }).set_index(['a', 'b']) + mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + + df.reindex(mi_2, method='backfill') + df.reindex(mi_2, method='pad') - I/O From e91b2cba462e33524afecbf4e27effa51bd1d4b6 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 18:54:42 -0800 Subject: [PATCH 26/49] re-write backend for reindexing MultiIndexes with a filling method * use approach similar to creating a new factorization of a merged array of the MultiIndex values * add explanatory comments * separate implementation of MultiIndex.get_indexer() with and without filling; former is unchanged * use np C types strictly in cython code and add type annotations where possible --- doc/source/whatsnew/v1.0.1.rst | 33 +-- pandas/_libs/index.pyx | 220 ++++++------------- pandas/core/indexes/multi.py | 4 +- pandas/tests/frame/indexing/test_indexing.py | 38 +++- pandas/tests/indexes/multi/test_indexing.py | 2 +- 5 files changed, 111 insertions(+), 186 deletions(-) diff --git a/doc/source/whatsnew/v1.0.1.rst b/doc/source/whatsnew/v1.0.1.rst index a8cc4dcd770e2..c42aab6de4cc3 100644 --- a/doc/source/whatsnew/v1.0.1.rst +++ b/doc/source/whatsnew/v1.0.1.rst @@ -64,40 +64,11 @@ Bug fixes **Plotting** - Plotting tz-aware timeseries no longer gives UserWarning (:issue:`31205`) -======= -- Fixed regression when indexing a ``Series`` or ``DataFrame`` indexed by ``DatetimeIndex`` with a slice containg a :class:`datetime.date` (:issue:`31501`) -- Fixed regression in :class:`DataFrame` setting values with a slice (e.g. ``df[-4:] = 1``) indexing by label instead of position (:issue:`31469`) -- -- -- Bug where assigning to a :class:`Series` using a IntegerArray / BooleanArray as a mask would raise ``TypeError`` (:issue:`31446`) - -Missing -^^^^^^^ - -- -- - -MultiIndex -^^^^^^^^^^ - -- -- - -I/O -^^^ - -- Fixed regression in :meth:`~DataFrame.to_csv` where specifying an ``na_rep`` might truncate the values written (:issue:`31447`) -- -- - -Plotting -^^^^^^^^ - -- -- **Interval** +- Bug in :meth:`Series.shift` with ``interval`` dtype raising a ``TypeError`` when shifting an interval array of integers or datetimes (:issue:`34195`) + .. --------------------------------------------------------------------------- .. _whatsnew_101.contributors: diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 97b0d95ee23e6..987f92d179783 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -593,89 +593,7 @@ cdef class BaseMultiIndexCodesEngine: def _codes_to_ints(self, codes): raise NotImplementedError("Implemented by subclass") - def _do_backfill_carrying(self, object level_codes, - object level_codes_no_fill): - """ - cleanup for the case of per-level backfills of too-large values to -1 - - given a 2d list of level_codes, i.e. integers representing the index - of a target index inside `self.level_codes` for each level, which - were backfilled at each level, handle cases where too-large values - which were backfilled to 0 (and became 1 after adding 1 to handle use - of unsigned integers) and thus the backfilling on the level code, when - represented as an int, will not be correct - - in particular, this involves a sort of "carrying", whereby, when a - value at a given level is too large, we set it to the minimum value - and then attempt to bump the value to the left, and, in the event that - we need to do this at the first index - - e.g. if the highest two values in `self`'s index tuples are (8, 4, 7) - and (8, 4, 9), then, in order to determine that the value (8, 4, 10) is - too large, we would need to determine that 10 > 9, meaning we need to - "carry" to the middle level, at which we try to bump 4 to 5, which is - in turn too large, requiring us to try to bump 8 to 9, also too large, - meaning that the entire row representing (8, 4, 10)'s level_codes - should be set to -1 - - Parameters - ---------- - level_codes : ndarray[ndim=2] - per-level backfilled codes of a target index. a 0 represents NaN, - and the remainder of the values should be from 1, ..., L_i, where - L_i is the length of `self.levels[i]` - level_codes_no_fill: 2D-list-like, M x N - the same as above, but computed without backfilling - - Returns - ------- - level_codes : 2D integer array, also M x N - same as level_codes but with appropriate "carrying" operations - performed, where values which are too large are represented by - rows consisting of all 0s - """ - for i, row in enumerate(level_codes): - need_to_carry = False - highest_level_adjustment = None - # go from right to left for place-value arithmetic - for j in range(len(row) - 1, -1, -1): - max_val = len(self.levels[j]) - # the value here was too large, so backfilling returns - # -1, which, after adding 1, becomes 0 - if row[j] == 0: - need_to_carry = True - - if need_to_carry: - # if row[j] was backfilled to its value, then even - # if we are "carrying," it can remain as is - new_val = ( - row[j] + 1 - if row[j] == level_codes_no_fill[i][j] - else row[j]) - if new_val > max_val or row[j] == 0: - # at this point, no more room to carry, so the - # entire value is too large - if j == 0: - for k in range(len(row)): - row[k] = 0 - # still possible value is not too large, but need to - # keep track of values which will need to be decreased - else: - highest_level_adjustment = j - # done carrying, for now - else: - row[j] = new_val - need_to_carry = False - - # if we increased any values, all lower levels (visually, all - # levels to the left) should be set to their lowest level - if row[0] > 0 and highest_level_adjustment is not None: - for k in range(highest_level_adjustment + 1, len(row)): - row[k] = 1 - - return level_codes - - def _extract_level_codes(self, object target, object method=None): + def _extract_level_codes(self, object target): """ Map the requested list of (tuple) keys to their integer representations for searching in the underlying integer index. @@ -684,82 +602,80 @@ cdef class BaseMultiIndexCodesEngine: ---------- target : list-like of keys Each key is a tuple, with a label for each level of the index. - method : string (optional) - whether to fill missing keys with either the previous (using "pad" - of "ffill" or next ("bfill"/"backfill") values, in terms of the - ordering of the tuples and the underlying index Returns ------ int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ - level_codes = np.array([ - lev.get_indexer(codes, method=method) for lev, codes - in zip(self.levels, zip(*target)) - ], dtype='uint64').T + 1 - - # handle intricacies required to properly respect tuple ordering - # properties - if method is not None: - level_codes_no_fill = np.array([ - lev.get_indexer(codes) for lev, codes - in zip(self.levels, zip(*target)) - ], dtype='uint64').T + 1 - - # necessary to respect tuple ordering. intuition is that bumping - # the value at level i should make the values at levels i+1, ..., n - # as small as possible, and vice versa - for i, row in enumerate(level_codes): - for j, level in enumerate(row): - if level_codes_no_fill[i][j] != level_codes[i][j]: - for k in range(j + 1, len(row)): - row[k] = (1 if method == 'backfill' else - len(self.levels[k])) - break - - # after doing per-level indexing, backfilled level codes need - # additional cleanup, as too-large values are 0, which will in - # turn be backfilled to 1 without cleanup. This is not an issue - # for padded level codes because the final padding will (correctly) - # exclude them anyway - if method == 'backfill': - level_codes = self._do_backfill_carrying(level_codes, - level_codes_no_fill) - - return self._codes_to_ints(level_codes) - - def get_indexer(self, object target, object method=None, - object limit=None): - lab_ints = self._extract_level_codes(target, method=method) - - # All methods (exact, backfill, pad) directly map to the respective - # methods of the underlying (integers) index... - if method is not None: - # but underlying backfill and pad methods require index and keys - # to be sorted. The index already is (checked in - # Index._get_fill_indexer), sort (integer representations of) keys: - order = np.argsort(lab_ints) - lab_ints = lab_ints[order] - indexer = (getattr(self._base, f'get_{method}_indexer') - (self, lab_ints, limit=limit)) - - # handle the case where too-large values are backfilled to NaN, for - # which the integer representation from _extract_level_codes() is 0 - if method == 'backfill': - for i in range(len(indexer)): - if lab_ints[i] == 0: - indexer[i] = -1 - - # restore the ordering - new_indexer = [0] * len(indexer) - for i, idx in enumerate(order): - new_indexer[idx] = indexer[i] - return new_indexer - else: - indexer = self._base.get_indexer(self, lab_ints) + level_codes = [lev.get_indexer(codes) + 1 for lev, codes + in zip(self.levels, zip(*target))] + return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - return indexer + def get_indexer(self, object target, object limit=None) -> np.ndarray: + lab_ints = self._extract_level_codes(target) + return self._base.get_indexer(self, lab_ints) + + def get_indexer_and_fill(self, object values, object target, + object method, object limit = None) -> np.ndarray: + """ get an indexer for `target`, a sortable, array-like collection of + values which are themselves comparable to `values`, which should be the + index values of the MultiIndex object for which `self` is the engine """ + if method not in ("backfill", "pad"): + raise ValueError( + f"{method} is not a valid method value; only 'backfill' and " + "'pad' filling methods are supported") + + cdef: + int64_t i, j, next_code + int64_t num_values, num_target_values + ndarray[int64_t, ndim=1] target_order + ndarray[object, ndim=1] target_values + ndarray[int64_t, ndim=1] new_codes, new_target_codes + ndarray[int64_t, ndim=1] sorted_indexer + + target_order = np.argsort(target.values) + target_values = target.values[target_order] + num_values, num_target_values = len(values), len(target_values) + new_codes, new_target_codes = \ + np.empty((num_values,)).astype('int64'), \ + np.empty((num_target_values,)).astype('int64') + + # `values` and `target_values` are both sorted, so we walk through them + # and memoize the set of indices in the (implicit) merged sorted list, + # the effect of which is to create a factorization for the (sorted) + # merger of the index values, where `new_codes` and `new_target_codes` + # are the subset of the factors which appear in `values` and `target`, + # respectively + i, j, next_code = 0, 0, 0 + while i < num_values and j < num_target_values: + val, target_val = values[i], target_values[j] + if val <= target_val: + new_codes[i] = next_code + i += 1 + if target_val <= val: + new_target_codes[j] = next_code + j += 1 + next_code += 1 + # at this point, at least one should have reached the end + # the remaining values of the other should be added to the end + assert i == num_values or j == num_target_values + while i < num_values: + new_codes[i] = next_code + i += 1 + next_code += 1 + while j < num_target_values: + new_target_codes[j] = next_code + j += 1 + next_code += 1 + + # get the indexer, and undo the sorting of `target.values` + sorted_indexer = ( + algos.backfill(new_codes, new_target_codes, limit=limit) + if method == "backfill" else + algos.pad(new_codes, new_target_codes, limit=limit) + ) + return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): if is_definitely_invalid_key(key): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4e2d07ddf9225..dec2746d6e04a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2455,7 +2455,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = self._engine.get_indexer(target, method, limit) + indexer = self._engine.get_indexer_and_fill( + self.values, target, method=method, limit=limit + ) elif method == "nearest": raise NotImplementedError( "method='nearest' not implemented yet " diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 29e7eb895b3aa..39178be5c7cd7 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer import pandas as pd -from pandas import ( + ../pandas/tests/frame/indexing/test_indexing.pyfrom pandas import ( DataFrame, DatetimeIndex, Index, @@ -1432,6 +1432,42 @@ def test_set_value_resize(self, float_frame): with pytest.raises(ValueError, match=msg): res._set_value("foobar", "baz", "sam") + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + df = pd.DataFrame( + {"a": [0] * 7, "b": list(range(7)), "c": list(range(7))} + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 8a54d4dc0c5ef..788001b991366 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -354,7 +354,7 @@ def test_get_indexer_three_or_more_levels(self): expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) - def test_get_indexer_backfill_with_carrying(self): + def test_get_indexer_crossing_levels(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests a corner case with get_indexer() with MultiIndexes where, when we # need to "carry" across levels, proper tuple ordering is respected From 33015a3a7da73833316b305b95a82ca3b2e45fc7 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 19:39:42 -0800 Subject: [PATCH 27/49] * add docstings, fix comments * simplify code and make get_indexer_and_fill method static --- pandas/_libs/index.pyx | 62 ++++++++++++++++++++++++++++++------ pandas/core/indexes/multi.py | 2 +- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 987f92d179783..05cee1d2f2ebc 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -612,15 +612,56 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, object target, object limit=None) -> np.ndarray: + def get_indexer(self, object target) -> np.ndarray: + """ + Gets an indexer, i.e. set of indexes into `self`'s values for the + values in `target`, where -1 represents a value in `target` not existing + in (the cross-product of) `self.levels` + + Parameters + ---------- + target : list-like of keys + Each key is a tuple, with a label for each level of the index + + Returns + ------- + 1-dimensional array of dtype int64 of the index + """ lab_ints = self._extract_level_codes(target) return self._base.get_indexer(self, lab_ints) - def get_indexer_and_fill(self, object values, object target, + @staticmethod + def get_indexer_and_fill(object values, object target, object method, object limit = None) -> np.ndarray: - """ get an indexer for `target`, a sortable, array-like collection of - values which are themselves comparable to `values`, which should be the - index values of the MultiIndex object for which `self` is the engine """ + """ + Gets an indexer, i.e. a set of indexes into `values`, for the values in + `target`, where the index value. + + If method is "backfill" then the index for a value in `target` which + does not exist in `values` is the index of the next match, or -1 is the + value is larger than the largest value in `values`. + + Similarly, if the method if "pad" then the index for a value in `target` + which does not exist in `values` is the index of the previous match, or + -1 if the value is smaller then the largest value in `values`. + + Parameters + ---------- + values : list-like of tuples + must be sorted and all have the same length + target: list-like of tuples + need not be sorted, but all must have the same length, which must be + the same as the length of all tuples in `values` + method: string + "backfill" or "pad" + limit: int, optional + if provided, limit the number of fills to this value + + Returns + ------- + np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`, + filled with the `method` (and optionally `limit`) specified + """ if method not in ("backfill", "pad"): raise ValueError( f"{method} is not a valid method value; only 'backfill' and " @@ -642,8 +683,9 @@ cdef class BaseMultiIndexCodesEngine: np.empty((num_target_values,)).astype('int64') # `values` and `target_values` are both sorted, so we walk through them - # and memoize the set of indices in the (implicit) merged sorted list, - # the effect of which is to create a factorization for the (sorted) + # and memoize the (ordered) set of indices in the (implicit) merged-and + # sorted list of the two which belong to each of them + # the effect of this is to create a factorization for the (sorted) # merger of the index values, where `new_codes` and `new_target_codes` # are the subset of the factors which appear in `values` and `target`, # respectively @@ -671,9 +713,9 @@ cdef class BaseMultiIndexCodesEngine: # get the indexer, and undo the sorting of `target.values` sorted_indexer = ( - algos.backfill(new_codes, new_target_codes, limit=limit) - if method == "backfill" else - algos.pad(new_codes, new_target_codes, limit=limit) + (algos.backfill if method == "backfill" else algos.pad)( + new_codes, new_target_codes, limit=limit + ) ) return sorted_indexer[np.argsort(target_order)] diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dec2746d6e04a..aae8aff6084e7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2455,7 +2455,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = self._engine.get_indexer_and_fill( + indexer = self._engine.__class__.get_indexer_and_fill( self.values, target, method=method, limit=limit ) elif method == "nearest": From 590c40fb4ec71b8af41b43803f5288b1c57916f3 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 20:09:15 -0800 Subject: [PATCH 28/49] robustify tests and add additional comments --- pandas/tests/frame/indexing/test_indexing.py | 49 +++++++++++++++++--- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 39178be5c7cd7..86dde16121426 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1435,22 +1435,59 @@ def test_set_value_resize(self, float_frame): def test_reindex_with_multi_index(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests for reindexing a multi-indexed DataFrame with a new MultiIndex - df = pd.DataFrame( - {"a": [0] * 7, "b": list(range(7)), "c": list(range(7))} - ).set_index(["a", "b"]) + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = pd.DataFrame({ + "a": [-1] * 7 + [0] * 7 + [1] * 7 , + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + }).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) # reindexing w/o a `method` value reindexed = df.reindex(new_multi_index) expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan]} + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} ).set_index(["a", "b"]) tm.assert_frame_equal(expected, reindexed) # reindexing with backfilling expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6]} + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} ).set_index(["a", "b"]) reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") tm.assert_frame_equal(expected, reindexed_with_backfilling) @@ -1460,7 +1497,7 @@ def test_reindex_with_multi_index(self): # reindexing with padding expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5]} + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} ).set_index(["a", "b"]) reindexed_with_padding = df.reindex(new_multi_index, method="pad") tm.assert_frame_equal(expected, reindexed_with_padding) From 701ca63cbde1117ddf11eac2cd95e47a3feeac76 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 20:22:21 -0800 Subject: [PATCH 29/49] include before/after examples --- doc/source/whatsnew/v1.1.0.rst | 64 ++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 37d60e7a8903d..c356291bb209c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -408,17 +408,61 @@ MultiIndex left.intersection(right, sort=False) - Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options (:issue:`29896`) -.. ipython:: python - - df = pd.DataFrame({ - 'a': [0, 0, 0, 0], - 'b': [0, 2, 3, 4], - 'c': ['A', 'B', 'C', 'D'] - }).set_index(['a', 'b']) - mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) +*pandas [0.23.0, 1.1.0)* +.. code-block:: python + >>> df = pd.DataFrame({ + ... 'a': [0, 0, 0, 0], + ... 'b': [0, 2, 3, 4], + ... 'c': ['A', 'B', 'C', 'D'], + ... }).set_index(['a', 'b']) + >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + >>> + >>> df.reindex(mi_2, method="backfill") + c + 0 -1 A + 0 A + 1 D + 3 A + 4 A + 5 C + >>> + >>> df.reindex(mi_2, method="pad") + c + 0 -1 NaN + 0 NaN + 1 D + 3 NaN + 4 A + 5 C + +*pandas 1.1.0*: +.. code-block:: python + >>> df = pd.DataFrame({ + ... 'a': [0, 0, 0, 0], + ... 'b': [0, 2, 3, 4], + ... 'c': ['A', 'B', 'C', 'D'], + ... }).set_index(['a', 'b']) + >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + >>> + >>> df.reindex(mi_2, method="backfill") + c + 0 -1 A + 0 A + 1 B + 3 C + 4 D + 5 NaN + >>> + >>> df.reindex(mi_2, method="pad") + c + 0 -1 NaN + 0 A + 1 A + 3 C + 4 D + 5 D + >>> - df.reindex(mi_2, method='backfill') - df.reindex(mi_2, method='pad') - I/O From c3c811e035eae266860c574c92e5f4f9a5046912 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 21:43:46 -0800 Subject: [PATCH 30/49] ensure type assignments correct --- pandas/_libs/index.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 05cee1d2f2ebc..c74d0d0e27df8 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -675,7 +675,7 @@ cdef class BaseMultiIndexCodesEngine: ndarray[int64_t, ndim=1] new_codes, new_target_codes ndarray[int64_t, ndim=1] sorted_indexer - target_order = np.argsort(target.values) + target_order = np.argsort(target.values).astype('int64') target_values = target.values[target_order] num_values, num_target_values = len(values), len(target_values) new_codes, new_target_codes = \ @@ -712,11 +712,10 @@ cdef class BaseMultiIndexCodesEngine: next_code += 1 # get the indexer, and undo the sorting of `target.values` - sorted_indexer = ( + sorted_indexer = \ (algos.backfill if method == "backfill" else algos.pad)( new_codes, new_target_codes, limit=limit - ) - ) + ).astype('int64') return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): From f5f553194002f014a689f27d327459e079f859a2 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 18 Feb 2020 21:56:12 -0800 Subject: [PATCH 31/49] use preferred method of getting class from object --- pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index aae8aff6084e7..42d4d94929a3f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2455,7 +2455,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = self._engine.__class__.get_indexer_and_fill( + indexer = type(self._engine).get_indexer_and_fill( self.values, target, method=method, limit=limit ) elif method == "nearest": From 180d9019813b0ca8be46a676422ad3e042ad08a5 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 19 Feb 2020 20:20:59 -0800 Subject: [PATCH 32/49] add newlines to get docs to compile and unindent for clarity --- doc/source/whatsnew/v1.1.0.rst | 103 +++++++++++++++++---------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c356291bb209c..8b2ba4b80baa1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -408,60 +408,65 @@ MultiIndex left.intersection(right, sort=False) - Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options (:issue:`29896`) + *pandas [0.23.0, 1.1.0)* + .. code-block:: python - >>> df = pd.DataFrame({ - ... 'a': [0, 0, 0, 0], - ... 'b': [0, 2, 3, 4], - ... 'c': ['A', 'B', 'C', 'D'], - ... }).set_index(['a', 'b']) - >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) - >>> - >>> df.reindex(mi_2, method="backfill") - c - 0 -1 A - 0 A - 1 D - 3 A - 4 A - 5 C - >>> - >>> df.reindex(mi_2, method="pad") - c - 0 -1 NaN - 0 NaN - 1 D - 3 NaN - 4 A - 5 C + + >>> df = pd.DataFrame({ + ... 'a': [0, 0, 0, 0], + ... 'b': [0, 2, 3, 4], + ... 'c': ['A', 'B', 'C', 'D'], + ... }).set_index(['a', 'b']) + >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + >>> + >>> df.reindex(mi_2, method="backfill") + c + 0 -1 A + 0 A + 1 D + 3 A + 4 A + 5 C + >>> + >>> df.reindex(mi_2, method="pad") + c + 0 -1 NaN + 0 NaN + 1 D + 3 NaN + 4 A + 5 C *pandas 1.1.0*: + .. code-block:: python - >>> df = pd.DataFrame({ - ... 'a': [0, 0, 0, 0], - ... 'b': [0, 2, 3, 4], - ... 'c': ['A', 'B', 'C', 'D'], - ... }).set_index(['a', 'b']) - >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) - >>> - >>> df.reindex(mi_2, method="backfill") - c - 0 -1 A - 0 A - 1 B - 3 C - 4 D - 5 NaN - >>> - >>> df.reindex(mi_2, method="pad") - c - 0 -1 NaN - 0 A - 1 A - 3 C - 4 D - 5 D - >>> + + >>> df = pd.DataFrame({ + ... 'a': [0, 0, 0, 0], + ... 'b': [0, 2, 3, 4], + ... 'c': ['A', 'B', 'C', 'D'], + ... }).set_index(['a', 'b']) + >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + >>> + >>> df.reindex(mi_2, method="backfill") + c + 0 -1 A + 0 A + 1 B + 3 C + 4 D + 5 NaN + >>> + >>> df.reindex(mi_2, method="pad") + c + 0 -1 NaN + 0 A + 1 A + 3 C + 4 D + 5 D + >>> - From edb2e6c23dde04a2f749127d3eee5136287d6a46 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 19 Feb 2020 20:32:37 -0800 Subject: [PATCH 33/49] add clarifying comment re: regression --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8b2ba4b80baa1..d5105c35179c8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -407,7 +407,7 @@ MultiIndex # Common elements are now guaranteed to be ordered by the left side left.intersection(right, sort=False) -- Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options (:issue:`29896`) +- Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options. This is a regression introduced in pandas 0.23.0 (:issue:`29896`) *pandas [0.23.0, 1.1.0)* From a3908eaec16249c78e042ff1344b5315c31bd8a0 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 19 Feb 2020 21:15:07 -0800 Subject: [PATCH 34/49] add black formatting --- pandas/tests/frame/indexing/test_indexing.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 86dde16121426..9f11ef88ce593 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1470,11 +1470,13 @@ def test_reindex_with_multi_index(self): # 1: 2.0 # 2: 5.0 # 3: 5.8 - df = pd.DataFrame({ - "a": [-1] * 7 + [0] * 7 + [1] * 7 , - "b": list(range(7)) * 3, - "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, - }).set_index(["a", "b"]) + df = pd.DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) From f5ef69f62adf7c930ff9b97258723b9334c833e6 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Sun, 23 Feb 2020 18:48:25 -0800 Subject: [PATCH 35/49] clean up after slightly-botched rebase --- pandas/tests/indexes/multi/test_indexing.py | 38 +++++++++++++++++---- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 788001b991366..0c93adeda7872 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -242,27 +242,53 @@ def test_get_indexer_with_missing_value(self, index_arr, labels, expected): def test_get_indexer_methods(self): # https://github.com/pandas-dev/pandas/issues/29896 # test getting an indexer for another index with different methods - mult_idx_1 = MultiIndex.from_product([[0], [0, 2, 3, 4]]) + # confirms that getting an indexer without a filling method, getting an + # indexer and backfilling, and getting an indexer and padding all behave + # correctly in the case where all of the target values fall in between + # several levels in the MultiIndex into which they are getting an indexer + # + # visually, the MultiIndexes used in this test are: + # mult_idx_1: + # 0: -1 0 + # 1: 2 + # 2: 3 + # 3: 4 + # 4: 0 0 + # 5: 2 + # 6: 3 + # 7: 4 + # 8: 1 0 + # 9: 2 + # 10: 3 + # 11: 4 + # + # mult_idx_2: + # 0: 0 1 + # 1: 3 + # 2: 4 + mult_idx_1 = MultiIndex.from_product([[-1, 0, 1], [0, 2, 3, 4]]) mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) indexer = mult_idx_1.get_indexer(mult_idx_2) - expected = np.array([-1, 2, 3], dtype=indexer.dtype) + expected = np.array([-1, 6, 7], dtype=indexer.dtype) tm.assert_almost_equal(expected, indexer) backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill") - expected = np.array([1, 2, 3], dtype=backfill_indexer.dtype) + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) tm.assert_almost_equal(expected, backfill_indexer) + # ensure the legacy "bfill" option functions identically to "backfill" backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") - expected = np.array([1, 2, 3], dtype=backfill_indexer.dtype) + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad") - expected = np.array([0, 2, 3], dtype=pad_indexer.dtype) + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) + # ensure the legacy "ffill" option functions identically to "pad" pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") - expected = np.array([0, 2, 3], dtype=pad_indexer.dtype) + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_three_or_more_levels(self): From 4bc36a2366ffffb62d8c94df05bb8b1befbe9f33 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Sun, 23 Feb 2020 19:01:31 -0800 Subject: [PATCH 36/49] take suggestions of black pandas --- pandas/tests/indexes/multi/test_indexing.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 0c93adeda7872..8c0dae433c8f4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -319,7 +319,15 @@ def test_get_indexer_three_or_more_levels(self): # 6: 3 6 8 mult_idx_1 = pd.MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) mult_idx_2 = pd.MultiIndex.from_tuples( - [(1, 1, 8), (1, 5, 9), (1, 6, 7), (2, 1, 6), (2, 7, 7), (2, 7, 8), (3, 6, 8)] + [ + (1, 1, 8), + (1, 5, 9), + (1, 6, 7), + (2, 1, 6), + (2, 7, 7), + (2, 7, 8), + (3, 6, 8), + ] ) # sanity check assert mult_idx_1.is_monotonic @@ -377,7 +385,9 @@ def test_get_indexer_three_or_more_levels(self): tm.assert_almost_equal(expected, backfill_indexer) pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") - expected = np.array([0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype) + expected = np.array( + [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype + ) tm.assert_almost_equal(expected, pad_indexer) def test_get_indexer_crossing_levels(self): From fff0223e7cf0a48c951169150981cdedafe44a40 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 27 Feb 2020 12:55:28 -0800 Subject: [PATCH 37/49] address comments --- pandas/_libs/index.pyx | 24 ++++++++++-------------- pandas/core/indexes/multi.py | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c74d0d0e27df8..ada64ef3e116b 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -630,8 +630,7 @@ cdef class BaseMultiIndexCodesEngine: lab_ints = self._extract_level_codes(target) return self._base.get_indexer(self, lab_ints) - @staticmethod - def get_indexer_and_fill(object values, object target, + def get_indexer_and_fill(self, object values, object target, object method, object limit = None) -> np.ndarray: """ Gets an indexer, i.e. a set of indexes into `values`, for the values in @@ -662,11 +661,7 @@ cdef class BaseMultiIndexCodesEngine: np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`, filled with the `method` (and optionally `limit`) specified """ - if method not in ("backfill", "pad"): - raise ValueError( - f"{method} is not a valid method value; only 'backfill' and " - "'pad' filling methods are supported") - + assert method in ("backfill", "pad") cdef: int64_t i, j, next_code int64_t num_values, num_target_values @@ -678,9 +673,10 @@ cdef class BaseMultiIndexCodesEngine: target_order = np.argsort(target.values).astype('int64') target_values = target.values[target_order] num_values, num_target_values = len(values), len(target_values) - new_codes, new_target_codes = \ - np.empty((num_values,)).astype('int64'), \ - np.empty((num_target_values,)).astype('int64') + new_codes, new_target_codes = ( + np.empty((num_values,)).astype('int64'), + np.empty((num_target_values,)).astype('int64'), + ) # `values` and `target_values` are both sorted, so we walk through them # and memoize the (ordered) set of indices in the (implicit) merged-and @@ -699,6 +695,7 @@ cdef class BaseMultiIndexCodesEngine: new_target_codes[j] = next_code j += 1 next_code += 1 + # at this point, at least one should have reached the end # the remaining values of the other should be added to the end assert i == num_values or j == num_target_values @@ -712,10 +709,9 @@ cdef class BaseMultiIndexCodesEngine: next_code += 1 # get the indexer, and undo the sorting of `target.values` - sorted_indexer = \ - (algos.backfill if method == "backfill" else algos.pad)( - new_codes, new_target_codes, limit=limit - ).astype('int64') + sorted_indexer = ( + algos.backfill if method == "backfill" else algos.pad + )(new_codes, new_target_codes, limit=limit).astype('int64') return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 42d4d94929a3f..dec2746d6e04a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2455,7 +2455,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = type(self._engine).get_indexer_and_fill( + indexer = self._engine.get_indexer_and_fill( self.values, target, method=method, limit=limit ) elif method == "nearest": From 4f80ced13ad93fe618d2c4a38a9806db55c102bd Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Sat, 29 Feb 2020 18:13:27 -0800 Subject: [PATCH 38/49] update docstrings --- pandas/_libs/index.pyx | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ada64ef3e116b..0b7ff07dda48a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -614,9 +614,9 @@ cdef class BaseMultiIndexCodesEngine: def get_indexer(self, object target) -> np.ndarray: """ - Gets an indexer, i.e. set of indexes into `self`'s values for the - values in `target`, where -1 represents a value in `target` not existing - in (the cross-product of) `self.levels` + Returns an array giving the positions of each value of `target` in + `self.values`, where -1 represents a value in `target` which does not + appear in `self.values` Parameters ---------- @@ -625,7 +625,8 @@ cdef class BaseMultiIndexCodesEngine: Returns ------- - 1-dimensional array of dtype int64 of the index + np.ndarray[int64_t, ndim=1] of the indexer of `target` into + `self.values` """ lab_ints = self._extract_level_codes(target) return self._base.get_indexer(self, lab_ints) @@ -633,21 +634,23 @@ cdef class BaseMultiIndexCodesEngine: def get_indexer_and_fill(self, object values, object target, object method, object limit = None) -> np.ndarray: """ - Gets an indexer, i.e. a set of indexes into `values`, for the values in - `target`, where the index value. + Returns an array giving the positions of each value of `target` in + `values`, where -1 represents a value in `target` which does not + appear in `values` - If method is "backfill" then the index for a value in `target` which - does not exist in `values` is the index of the next match, or -1 is the - value is larger than the largest value in `values`. + If `method` is "backfill" then the position for a value in `target` + which does not appear in `values` is that of the next greater value + in `values` (if one exists), and -1 if there is no such value. - Similarly, if the method if "pad" then the index for a value in `target` - which does not exist in `values` is the index of the previous match, or - -1 if the value is smaller then the largest value in `values`. + Similarly, if the method is "pad" then the position for a value in + `target` which does not appear in `values` is that of the next smaller + value in `values` (if one exists), and -1 if there is no such value. Parameters ---------- values : list-like of tuples - must be sorted and all have the same length + must be sorted and all have the same length. Should be the set of + the MultiIndex's values target: list-like of tuples need not be sorted, but all must have the same length, which must be the same as the length of all tuples in `values` From 3e825d9980291252b3bf9ad74c8d46410b0aaa5c Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 18 Mar 2020 21:17:01 -0700 Subject: [PATCH 39/49] use ipython directive for example with bugfix --- doc/source/whatsnew/v1.1.0.rst | 37 +++++++++------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d5105c35179c8..df68574692180 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -409,8 +409,7 @@ MultiIndex - Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options. This is a regression introduced in pandas 0.23.0 (:issue:`29896`) -*pandas [0.23.0, 1.1.0)* - +*pandas >= 0.23, < 1.1.0* .. code-block:: python >>> df = pd.DataFrame({ @@ -440,33 +439,15 @@ MultiIndex *pandas 1.1.0*: -.. code-block:: python +.. ipython:: python - >>> df = pd.DataFrame({ - ... 'a': [0, 0, 0, 0], - ... 'b': [0, 2, 3, 4], - ... 'c': ['A', 'B', 'C', 'D'], - ... }).set_index(['a', 'b']) - >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) - >>> - >>> df.reindex(mi_2, method="backfill") - c - 0 -1 A - 0 A - 1 B - 3 C - 4 D - 5 NaN - >>> - >>> df.reindex(mi_2, method="pad") - c - 0 -1 NaN - 0 A - 1 A - 3 C - 4 D - 5 D - >>> + df = pd.DataFrame({'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D']})\ + .set_index(['a', 'b']) + mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + df.reindex(mi_2, method='backfill') + df.reindex(mi_2, method='pad') - From 83b9830297dd6e3776d961c79d5449e42cf03e8d Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 18 Mar 2020 21:28:34 -0700 Subject: [PATCH 40/49] update formatting --- doc/source/whatsnew/v1.1.0.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index df68574692180..8f4899acb9010 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -410,6 +410,7 @@ MultiIndex - Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options. This is a regression introduced in pandas 0.23.0 (:issue:`29896`) *pandas >= 0.23, < 1.1.0* + .. code-block:: python >>> df = pd.DataFrame({ @@ -441,10 +442,11 @@ MultiIndex .. ipython:: python - df = pd.DataFrame({'a': [0, 0, 0, 0], - 'b': [0, 2, 3, 4], - 'c': ['A', 'B', 'C', 'D']})\ - .set_index(['a', 'b']) + df = pd.DataFrame({ + 'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D'], + }).set_index(['a', 'b']) mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) df.reindex(mi_2, method='backfill') df.reindex(mi_2, method='pad') From 5349af5b691f88f4927fa35e856a33a53a1e6faf Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 18 Mar 2020 21:32:31 -0700 Subject: [PATCH 41/49] put get_indexer() calls into MultiIndex engine behind single function --- pandas/_libs/index.pyx | 15 +++++++++------ pandas/core/indexes/multi.py | 7 +++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0b7ff07dda48a..d8e0d9c6bd7ab 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -612,7 +612,7 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, object target) -> np.ndarray: + def get_indexer_no_fill(self, object target) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `self.values`, where -1 represents a value in `target` which does not @@ -631,8 +631,8 @@ cdef class BaseMultiIndexCodesEngine: lab_ints = self._extract_level_codes(target) return self._base.get_indexer(self, lab_ints) - def get_indexer_and_fill(self, object values, object target, - object method, object limit = None) -> np.ndarray: + def get_indexer(self, object target, object values = None, + object method = None, object limit = None) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `values`, where -1 represents a value in `target` which does not @@ -648,12 +648,12 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- - values : list-like of tuples - must be sorted and all have the same length. Should be the set of - the MultiIndex's values target: list-like of tuples need not be sorted, but all must have the same length, which must be the same as the length of all tuples in `values` + values : list-like of tuples + must be sorted and all have the same length. Should be the set of + the MultiIndex's values. Needed only if `method` is not None method: string "backfill" or "pad" limit: int, optional @@ -664,6 +664,9 @@ cdef class BaseMultiIndexCodesEngine: np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`, filled with the `method` (and optionally `limit`) specified """ + if method is None: + return self.get_indexer_no_fill(target) + assert method in ("backfill", "pad") cdef: int64_t i, j, next_code diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dec2746d6e04a..9e1cb1c8e0a69 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2455,8 +2455,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = self._engine.get_indexer_and_fill( - self.values, target, method=method, limit=limit + indexer = self._engine.get_indexer( + values=self.values, + target=target, + method=method, + limit=limit ) elif method == "nearest": raise NotImplementedError( From 975306269812100ac632e99b158ef0e1dd3311e9 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Wed, 18 Mar 2020 21:48:31 -0700 Subject: [PATCH 42/49] add performance benchmark for MultiIndex get_indexer with filling --- asv_bench/benchmarks/multiindex_object.py | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 793f0c7c03c77..898eed496827a 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -74,10 +74,38 @@ def setup(self): ], dtype=object, ) + self.other_mi_many_mismatches = pd.MultiIndex.from_tuples([ + (-7, 41), + (-2, 3), + (-0.7, 5), + (0, 0), + (0, 1.5), + (0, 340), + (0, 1001), + (1, -4), + (1, 20), + (1, 1040), + (432, -5), + (432, 17), + (439, 165.5), + (998, -4), + (998, 24065), + (999, 865.2), + (999, 1000), + (1045, -843), + ]) def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) + def time_get_indexer_and_backfill(self): + self.mi_int.get_indexer(self.other_mi_many_mismatches, + method="backfill") + + def time_get_indexer_and_pad(self): + self.mi_int.get_indexer(self.other_mi_many_mismatches, + method="pad") + def time_is_monotonic(self): self.mi_int.is_monotonic From ab36e6771edd8bc4604de2623d6bb88220d4c6b9 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 19 Mar 2020 11:16:47 -0700 Subject: [PATCH 43/49] fix MultiIndex reference --- asv_bench/benchmarks/multiindex_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 898eed496827a..4dfda1c6b3adf 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -74,7 +74,7 @@ def setup(self): ], dtype=object, ) - self.other_mi_many_mismatches = pd.MultiIndex.from_tuples([ + self.other_mi_many_mismatches = MultiIndex.from_tuples([ (-7, 41), (-2, 3), (-0.7, 5), From 3231481a77ca6e94a5678e7649e9385ddadf4208 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 19 Mar 2020 12:45:01 -0700 Subject: [PATCH 44/49] take black pandas suggestions --- pandas/core/indexes/multi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9e1cb1c8e0a69..7aa1456846612 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2456,10 +2456,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "tolerance not implemented yet for MultiIndex" ) indexer = self._engine.get_indexer( - values=self.values, - target=target, - method=method, - limit=limit + values=self.values, target=target, method=method, limit=limit ) elif method == "nearest": raise NotImplementedError( From 5aa7fea732c3f2d163209ad8035315409941174b Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Thu, 26 Mar 2020 19:02:58 -0700 Subject: [PATCH 45/49] use black formatter on asv profiling tests --- asv_bench/benchmarks/multiindex_object.py | 48 +++++++++++------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 4dfda1c6b3adf..18dbb7eae0615 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -74,37 +74,37 @@ def setup(self): ], dtype=object, ) - self.other_mi_many_mismatches = MultiIndex.from_tuples([ - (-7, 41), - (-2, 3), - (-0.7, 5), - (0, 0), - (0, 1.5), - (0, 340), - (0, 1001), - (1, -4), - (1, 20), - (1, 1040), - (432, -5), - (432, 17), - (439, 165.5), - (998, -4), - (998, 24065), - (999, 865.2), - (999, 1000), - (1045, -843), - ]) + self.other_mi_many_mismatches = MultiIndex.from_tuples( + [ + (-7, 41), + (-2, 3), + (-0.7, 5), + (0, 0), + (0, 1.5), + (0, 340), + (0, 1001), + (1, -4), + (1, 20), + (1, 1040), + (432, -5), + (432, 17), + (439, 165.5), + (998, -4), + (998, 24065), + (999, 865.2), + (999, 1000), + (1045, -843), + ] + ) def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) def time_get_indexer_and_backfill(self): - self.mi_int.get_indexer(self.other_mi_many_mismatches, - method="backfill") + self.mi_int.get_indexer(self.other_mi_many_mismatches, method="backfill") def time_get_indexer_and_pad(self): - self.mi_int.get_indexer(self.other_mi_many_mismatches, - method="pad") + self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad") def time_is_monotonic(self): self.mi_int.is_monotonic From 7e4e9755e908feb12a5c4726442e8f2394b2ca34 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Fri, 27 Mar 2020 09:47:08 -0700 Subject: [PATCH 46/49] move whatsnew docs into their own sub-heading in api changes --- doc/source/whatsnew/v1.1.0.rst | 105 +++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8f4899acb9010..f3d4c8c557dd8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -115,6 +115,67 @@ Backwards incompatible API changes Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) + +``MultiIndex.get_indexer`` interprets `method` argument differently +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`). + +As an example of this, given: + +.. ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D'], + }).set_index(['a', 'b']) + mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + +The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here: + +*pandas >= 0.23, < 1.1.0*: + +.. code-block:: ipython + + In [1]: df.reindex(mi_2, method='backfill') + Out[1]: + c + 0 -1 A + 0 A + 1 D + 3 A + 4 A + 5 C + +*pandas <0.23, >= 1.1.0* + +.. ipython:: python + + df.reindex(mi_2, method='backfill') + +And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'`` can be seen here: + +*pandas >= 0.23, < 1.1.0* + +.. code-block:: ipython + + In [1]: df.reindex(mi_2, method='pad') + Out[1]: + c + 0 -1 NaN + 0 NaN + 1 D + 3 NaN + 4 A + 5 C + +*pandas < 0.23, >= 1.1.0* + +.. ipython:: python + + df.reindex(mi_2, method='pad') + - .. _whatsnew_110.api_breaking.indexing_raises_key_errors: @@ -407,50 +468,6 @@ MultiIndex # Common elements are now guaranteed to be ordered by the left side left.intersection(right, sort=False) -- Bug in :meth:`MultiIndex.get_indexer` incorrectly handling use of pad and backfill options. This is a regression introduced in pandas 0.23.0 (:issue:`29896`) - -*pandas >= 0.23, < 1.1.0* - -.. code-block:: python - - >>> df = pd.DataFrame({ - ... 'a': [0, 0, 0, 0], - ... 'b': [0, 2, 3, 4], - ... 'c': ['A', 'B', 'C', 'D'], - ... }).set_index(['a', 'b']) - >>> mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) - >>> - >>> df.reindex(mi_2, method="backfill") - c - 0 -1 A - 0 A - 1 D - 3 A - 4 A - 5 C - >>> - >>> df.reindex(mi_2, method="pad") - c - 0 -1 NaN - 0 NaN - 1 D - 3 NaN - 4 A - 5 C - -*pandas 1.1.0*: - -.. ipython:: python - - df = pd.DataFrame({ - 'a': [0, 0, 0, 0], - 'b': [0, 2, 3, 4], - 'c': ['A', 'B', 'C', 'D'], - }).set_index(['a', 'b']) - mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) - df.reindex(mi_2, method='backfill') - df.reindex(mi_2, method='pad') - - I/O From 113e3b3aaa079a93a69749d78e0fc069d656ec3c Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Fri, 27 Mar 2020 10:16:39 -0700 Subject: [PATCH 47/49] fix syntax error --- pandas/tests/frame/indexing/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9f11ef88ce593..ca08e6d51c095 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer import pandas as pd - ../pandas/tests/frame/indexing/test_indexing.pyfrom pandas import ( +from pandas import ( DataFrame, DatetimeIndex, Index, From 881bc1b460a6eccc57e06846d1741a1e2be6811d Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Fri, 27 Mar 2020 10:40:12 -0700 Subject: [PATCH 48/49] rm duplicate test from failed rebase --- pandas/tests/frame/indexing/test_indexing.py | 36 -------------------- 1 file changed, 36 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index ca08e6d51c095..636cfe0d47980 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1701,42 +1701,6 @@ def test_reindex_methods(self, method, expected_values): actual = df[::-1].reindex(target, method=switched_method) tm.assert_frame_equal(expected, actual) - def test_reindex_with_multi_index(self): - # https://github.com/pandas-dev/pandas/issues/29896 - # tests for reindexing a multi-indexed DataFrame with a new MultiIndex - df = pd.DataFrame( - {"a": [0] * 7, "b": list(range(7)), "c": list(range(7))} - ).set_index(["a", "b"]) - new_index = [0.5, 2.0, 5.0, 5.8] - new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) - - # reindexing w/o a `method` value - reindexed = df.reindex(new_multi_index) - expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [np.nan, 2.0, 5.0, np.nan]} - ).set_index(["a", "b"]) - tm.assert_frame_equal(expected, reindexed) - - # reindexing with backfilling - expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [1, 2, 5, 6]} - ).set_index(["a", "b"]) - reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - # reindexing with padding - expected = pd.DataFrame( - {"a": [0] * 4, "b": new_index, "c": [0, 2, 5, 5]} - ).set_index(["a", "b"]) - reindexed_with_padding = df.reindex(new_multi_index, method="pad") - tm.assert_frame_equal(expected, reindexed_with_padding) - - reindexed_with_padding = df.reindex(new_multi_index, method="ffill") - tm.assert_frame_equal(expected, reindexed_with_padding) - def test_reindex_subclass(self): # https://github.com/pandas-dev/pandas/issues/31925 class MyDataFrame(DataFrame): From 7836cc90767ba0f652065b65a5d6d26440f2d384 Mon Sep 17 00:00:00 2001 From: ChrisRobo Date: Tue, 7 Apr 2020 16:12:27 -0700 Subject: [PATCH 49/49] clean up rebase error --- pandas/tests/frame/indexing/test_indexing.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 636cfe0d47980..c3b9a7bf05c7b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1701,17 +1701,6 @@ def test_reindex_methods(self, method, expected_values): actual = df[::-1].reindex(target, method=switched_method) tm.assert_frame_equal(expected, actual) - def test_reindex_subclass(self): - # https://github.com/pandas-dev/pandas/issues/31925 - class MyDataFrame(DataFrame): - pass - - expected = DataFrame() - df = MyDataFrame() - result = df.reindex_like(expected) - - tm.assert_frame_equal(result, expected) - def test_reindex_methods_nearest_special(self): df = pd.DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5])