From 374402cf769f0af233c22f9015bbba824393bd55 Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Tue, 7 Jun 2016 18:49:34 -0400 Subject: [PATCH] BUG: Fix groupby with as_index for categorical multi groupers #13204 BUG: Fix string repr of Grouping --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/groupby.py | 38 +++++++++++++++++++++--- pandas/tests/test_groupby.py | 51 +++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index eae03b2a86661..be1f745537d05 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -527,3 +527,4 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) +- Bug in ``groupby`` with ``as_index=False`` returning all NaNs when grouping on multiple columns including a categorical one (:issue:`13204`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f6915e962c049..04e4db9d1fdc6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2250,7 +2250,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = to_timedelta(self.grouper) def __repr__(self): - return 'Grouping(%s)' % self.name + return 'Grouping({0})'.format(self.name) def __iter__(self): return iter(self.indices) @@ -3741,9 +3741,39 @@ def _reindex_output(self, result): return result levels_list = [ping.group_index for ping in groupings] - index = MultiIndex.from_product(levels_list, names=self.grouper.names) - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d).sortlevel(axis=self.axis) + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return result.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + #
expanded, are columns in `result`. An idea is to do: + # result = result.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `result`, and then reset the in-axis grouper columns. + + # Select in-axis groupers + in_axis_grps = [(i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis] + g_nums, g_names = zip(*in_axis_grps) + + result = result.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + result = result.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + result = result.reset_index(level=g_nums) + + return result.reset_index(drop=True) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6659e6b106a67..bc25525f936ac 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6304,6 +6304,47 @@ def test_groupby_categorical_two_columns(self): nan, nan, nan, nan, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) + def test_groupby_multi_categorical_as_index(self): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + 
columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') + result = df.groupby(['cat', s], as_index=False).sum() + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # is original index dropped? + expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3], + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + result = df.groupby(['cat', 'A'], as_index=False).sum() + tm.assert_frame_equal(result, expected, check_index_type=True) + def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. @@ -6431,6 +6472,16 @@ def test_numpy_compat(self): tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, getattr(g, func), foo=1) + def test_grouping_string_repr(self): + # GH 13394 + mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) + df = DataFrame([[1, 2, 3]], columns=mi) + gr = df.groupby(df[('A', 'a')]) + + result = gr.grouper.groupings[0].__repr__() + expected = "Grouping(('A', 'a'))" + tm.assert_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all()