diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3f7c4b3b0ccb7..eb0fa49170d44 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -887,6 +887,7 @@ Deprecations - :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` have deprecated passing an ``np.array`` by default. One will need to pass the new ``raw`` parameter to be explicit about what is passed (:issue:`20584`) - ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`) +- ``Index.get_duplicates()`` is deprecated and will be removed in a future version (:issue:`20239`) .. _whatsnew_0230.prior_deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b67ed9cfd2241..35bfd12466429 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3879,7 +3879,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, index = _ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: - duplicates = index.get_duplicates() + duplicates = index[index.duplicated()].unique() raise ValueError('Index has duplicate keys: {dup}'.format( dup=duplicates)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2e6e039add8a4..3d60eefc5b598 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1851,6 +1851,9 @@ def get_duplicates(self): Returns a sorted list of index elements which appear more than once in the index. + .. deprecated:: 0.23.0 + Use idx[idx.duplicated()].unique() instead + Returns ------- array-like @@ -1897,13 +1900,12 @@ def get_duplicates(self): >>> pd.Index(dates).get_duplicates() DatetimeIndex([], dtype='datetime64[ns]', freq=None) """ - from collections import defaultdict - counter = defaultdict(lambda: 0) - for k in self.values: - counter[k] += 1 - return sorted(k for k, v in compat.iteritems(counter) if v > 1) + warnings.warn("'get_duplicates' is deprecated and will be removed in " + "a future release. You can use " + "idx[idx.duplicated()].unique() instead", + FutureWarning, stacklevel=2) - _get_duplicates = get_duplicates + return self[self.duplicated()].unique() def _cleanup(self): self._engine.clear_mapping() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 95186b2e79a16..51cd1837fecca 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -502,10 +502,6 @@ def take(self, indices, axis=0, allow_fill=True, freq = self.freq if isinstance(self, ABCPeriodIndex) else None return self._shallow_copy(taken, freq=freq) - def get_duplicates(self): - values = Index.get_duplicates(self) - return self._simple_new(values) - _can_hold_na = True _na_value = NaT diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 20f4384a3d698..6e564975f34cd 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -504,7 +504,7 @@ def _get_concat_axis(self): def _maybe_check_integrity(self, concat_index): if self.verify_integrity: if not concat_index.is_unique: - overlap = concat_index.get_duplicates() + overlap = concat_index[concat_index.duplicated()].unique() raise ValueError('Indexes have overlapping values: ' '{overlap!s}'.format(overlap=overlap)) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 2d55dfff7a8f3..0722b9175c0c6 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -1,3 +1,4 @@ +import warnings import pytest @@ -178,7 +179,10 @@ def test_get_duplicates(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-03', '2000-01-03', '2000-01-04']) - result = idx.get_duplicates() + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + result = idx.get_duplicates() + ex = DatetimeIndex(['2000-01-02', '2000-01-03']) tm.assert_index_equal(result, ex) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 682517f5a6fb1..8cb75f8cfb906 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2078,6 +2078,11 @@ def test_cached_properties_not_settable(self): with tm.assert_raises_regex(AttributeError, "Can't set attribute"): idx.is_unique = False + def test_get_duplicates_deprecated(self): + idx = pd.Index([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + idx.get_duplicates() + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 88dc4cbaf7bb3..cc006baa64ce6 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2432,7 +2432,12 @@ def check(nlevels, with_nulls): for a in [101, 102]: mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) assert not mi.has_duplicates - assert mi.get_duplicates() == [] + + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + assert mi.get_duplicates().equals(MultiIndex.from_arrays( + [[], []])) + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros( 2, dtype='bool')) @@ -2444,7 +2449,12 @@ def check(nlevels, with_nulls): labels=np.random.permutation(list(lab)).T) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates - assert mi.get_duplicates() == [] + + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + assert mi.get_duplicates().equals(MultiIndex.from_arrays( + [[], []])) + tm.assert_numpy_array_equal(mi.duplicated(), np.zeros( len(mi), dtype='bool')) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 4692b6d675e6b..d7745ffd94cd9 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -1,3 +1,5 @@ +import warnings + import pytest import numpy as np @@ -145,7 +147,10 @@ def test_get_duplicates(self): idx = TimedeltaIndex(['1 day', '2 day', '2 day', '3 day', '3day', '4day']) - result = idx.get_duplicates() + with warnings.catch_warnings(record=True): + # Deprecated - see GH20239 + result = idx.get_duplicates() + ex = TimedeltaIndex(['2 day', '3day']) tm.assert_index_equal(result, ex)