From fe0ff288afe42259aae2420dea2208631d234c41 Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Sat, 29 Nov 2014 15:42:43 +0000 Subject: [PATCH 1/3] Implement Categorical.searchsorted(v, side, sorter) #8420 --- pandas/core/categorical.py | 55 ++++++++++++++++++++++++++++++- pandas/tests/test_categorical.py | 56 ++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 5b3e9e8a22b12..3fc6ef1e201af 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -776,7 +776,60 @@ def nbytes(self): return self._codes.nbytes + self._categories.values.nbytes def searchsorted(self, v, side='left', sorter=None): - raise NotImplementedError("See https://github.com/pydata/pandas/issues/8420") + """Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted Categorical `self` such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Parameters + ---------- + v : array_like + Array-like values or a scalar value, to insert/search for in `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + Series.searchsorted + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. 
+ + Examples + -------- + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + [apple, bread, bread, cheese, milk] + Categories (4, object): [apple < bread < cheese < milk] + >>> x.searchsorted('bread') + 1 + >>> x.searchsorted(['bread']) + array([1]) + >>> x.searchsorted(['bread', 'eggs']) + array([1, 4]) + >>> x.searchsorted(['bread', 'eggs'], side='right') + array([3, 4]) # eggs before milk + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + array([3, 5]) # eggs after donuts, after switching milk and donuts + """ + if not self.ordered: + raise ValueError("searchsorted requires an ordered Categorical.") + + values_as_codes = self.categories.values.searchsorted(np.asarray(v), side) + return self.codes.searchsorted(values_as_codes, sorter=sorter) def isnull(self): """ diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 196ad8b7680b9..7e17fd8f3412b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -888,13 +888,57 @@ def test_nbytes(self): self.assertEqual(cat.nbytes, exp) def test_searchsorted(self): + # https://github.com/pydata/pandas/issues/8420 + s1 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + s2 = pd.Series(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ]) + c1 = pd.Categorical(s1) + c2 = pd.Categorical(s2) + + # Single item array + res = c1.searchsorted(['bread']) + chk = s1.searchsorted(['bread']) + exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Scalar version of single item array + # Ambiguous what Categorical should return as np.array returns + # a scalar and pd.Series returns an array. + # We get different results depending on whether + # Categorical.searchsorted(v) passes v through np.asarray() + # or pd.Series(v).values. 
The former returns scalar, the + # latter an array. + # Test code here follows np.array.searchsorted(). + # Commented out lines below follow pd.Series. + res = c1.searchsorted('bread') + chk = np.array(s1).searchsorted('bread') + exp = 1 + #exp = np.array([1]) + #chk = s1.searchsorted('bread') + #exp = np.array([1]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # Searching for a value that is not present in the Categorical + res = c1.searchsorted(['bread', 'eggs']) + chk = s1.searchsorted(['bread', 'eggs']) + exp = np.array([1, 4]) + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) - # See https://github.com/pydata/pandas/issues/8420 - # TODO: implement me... - cat = pd.Categorical([1,2,3]) - def f(): - cat.searchsorted(3) - self.assertRaises(NotImplementedError, f) + # Searching for a value that is not present, to the right + res = c1.searchsorted(['bread', 'eggs'], side='right') + chk = s1.searchsorted(['bread', 'eggs'], side='right') + exp = np.array([3, 4]) # eggs before milk + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) + + # As above, but with a sorter array to reorder an unsorted array + res = c2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) + exp = np.array([3, 5]) # eggs after donuts, after switching milk and donuts + self.assert_numpy_array_equal(res, exp) + self.assert_numpy_array_equal(res, chk) def test_deprecated_labels(self): # TODO: labels is deprecated and should be removed in 0.18 or 2017, whatever is earlier From 1116746f12dc8a05c126d4260e72f4dd82c2728c Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Wed, 3 Dec 2014 22:58:47 +0000 Subject: [PATCH 2/3] Revert Categorical.searchsorted() behaviour for scalar input. 
jreback requested array output --- pandas/core/categorical.py | 5 +++-- pandas/tests/test_categorical.py | 16 +++------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 3fc6ef1e201af..b91b46283e2fe 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -814,7 +814,7 @@ def searchsorted(self, v, side='left', sorter=None): [apple, bread, bread, cheese, milk] Categories (4, object): [apple < bread < cheese < milk] >>> x.searchsorted('bread') - 1 + array([1]) # Note: an array, not a scalar >>> x.searchsorted(['bread']) array([1]) >>> x.searchsorted(['bread', 'eggs']) @@ -828,7 +828,8 @@ def searchsorted(self, v, side='left', sorter=None): if not self.ordered: raise ValueError("searchsorted requires an ordered Categorical.") - values_as_codes = self.categories.values.searchsorted(np.asarray(v), side) + from pandas.core.series import Series + values_as_codes = self.categories.values.searchsorted(Series(v).values, side) return self.codes.searchsorted(values_as_codes, sorter=sorter) def isnull(self): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7e17fd8f3412b..e04be787d04ee 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -902,20 +902,10 @@ def test_searchsorted(self): self.assert_numpy_array_equal(res, chk) # Scalar version of single item array - # Ambiguous what Categorical should return as np.array returns - # a scalar and pd.Series returns an array. - # We get different results depending on whether - # Categorical.searchsorted(v) passes v through np.asarray() - # or pd.Series(v).values. The former returns scalar, the - # latter an array. - # Test code here follows np.array.searchsorted(). - # Commented out lines below follow pd.Series. 
+ # Categorical returns np.array like pd.Series, but different from np.array.searchsorted() res = c1.searchsorted('bread') - chk = np.array(s1).searchsorted('bread') - exp = 1 - #exp = np.array([1]) - #chk = s1.searchsorted('bread') - #exp = np.array([1]) + chk = s1.searchsorted('bread') + exp = np.array([1]) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) From 20e005fcd20c767d0303ac1bf16a1af14ed0c3f3 Mon Sep 17 00:00:00 2001 From: Stephen Simmons Date: Thu, 4 Dec 2014 23:58:48 +0000 Subject: [PATCH 3/3] Add Categorical.searchsorted() to whatsnew doc --- doc/source/whatsnew/v0.15.2.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt index 11cf2450d2f28..1a6234625ab93 100644 --- a/doc/source/whatsnew/v0.15.2.txt +++ b/doc/source/whatsnew/v0.15.2.txt @@ -66,6 +66,7 @@ Enhancements - Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. +- Added support for ``searchsorted()`` on `Categorical` class (:issue:`8420`). - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See :ref:`here`. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files.