diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index b5be5b1a7c552..44cd2d8906a5b 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -129,6 +129,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
 
     Returns
     -------
+    labels : the indexer to the original array
+    uniques : the unique values
+
+    note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex
     """
     from pandas.tseries.period import PeriodIndex
     vals = np.asarray(values)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 027e7c5fab191..4529b5e97adf2 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -44,6 +44,80 @@ def test_strings(self):
         expected = Series(np.array([1, 0, np.nan, 0, 1, 2, np.nan]))
         tm.assert_series_equal(result,expected)
 
+class TestFactorize(tm.TestCase):
+    """Check the labels and uniques returned by algos.factorize."""
+    _multiprocess_can_split_ = True
+
+    def test_basic(self):
+        """Factorize strings, ints and floats, with and without sort=True."""
+        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
+                                           'a', 'c', 'c', 'c'])
+        self.assert_(np.array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array(['a','b','c'], dtype=object)))
+
+        labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
+                                           'a', 'c', 'c', 'c'], sort=True)
+        self.assert_(np.array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array(['a','b','c'], dtype=object)))
+
+        labels, uniques = algos.factorize(list(reversed(range(5))))
+        self.assert_(np.array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)))
+
+        labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
+        self.assert_(np.array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64)))
+
+        # float input: expected labels are integer codes and expected uniques are floats, as in the sort=True case below
+        labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
+        self.assert_(np.array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([4., 3., 2., 1., 0.], dtype=np.float64)))
+
+        labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True)
+        self.assert_(np.array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64)))
+
+    def test_mixed(self):
+        """Factorize an object Series mixing strings, floats and NaN (NaN is labelled -1)."""
+        # doc example reshaping.rst
+        x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
+        labels, uniques = algos.factorize(x)
+
+        self.assert_(np.array_equal(labels, np.array([ 0, 0, -1, 1, 2, 3],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array(['A', 'B', 3.14, np.inf], dtype=object)))
+
+        labels, uniques = algos.factorize(x, sort=True)
+        self.assert_(np.array_equal(labels, np.array([ 2, 2, -1, 3, 0, 1],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([3.14, np.inf, 'A', 'B'], dtype=object)))
+
+    def test_datelike(self):
+        """Factorize Series of Timestamp values and of Period values."""
+        # M8
+        v1 = pd.Timestamp('20130101 09:00:00.00004')
+        v2 = pd.Timestamp('20130101')
+        x = Series([v1,v1,v1,v2,v2,v1])
+        labels, uniques = algos.factorize(x)
+        self.assert_(np.array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([v1.value,v2.value],dtype='M8[ns]')))
+
+        labels, uniques = algos.factorize(x, sort=True)
+        self.assert_(np.array_equal(labels, np.array([ 1,1,1,0,0,1],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([v2.value,v1.value],dtype='M8[ns]')))
+
+        # period
+        v1 = pd.Period('201302',freq='M')
+        v2 = pd.Period('201303',freq='M')
+        x = Series([v1,v1,v1,v2,v2,v1])
+
+        # periods are not 'sorted' as they are converted back into an index
+        labels, uniques = algos.factorize(x)
+        self.assert_(np.array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([v1, v2],dtype=object)))
+
+        labels, uniques = algos.factorize(x,sort=True)
+        self.assert_(np.array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64)))
+        self.assert_(np.array_equal(uniques, np.array([v1, v2],dtype=object)))
+
 class TestUnique(tm.TestCase):
     _multiprocess_can_split_ = True
 