diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index a88b7332d9b9e..34291c75ea155 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -734,3 +734,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]}) df df.groupby(df.sum(), axis=1).sum() + + +Group DataFrame columns, compute a set of metrics and return a named Series. +The Series name is used as the name for the column index. This is especially +useful in conjunction with reshaping operations such as stacking in which the +column index name will be used as the name of the inserted column: + +.. ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + }) + + def compute_metrics(x): + result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} + return pd.Series(result, name='metrics') + + result = df.groupby('a').apply(compute_metrics) + + result + + result.stack() diff --git a/doc/source/release.rst b/doc/source/release.rst index 1819272c59243..91c5a25ed4bab 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -112,6 +112,16 @@ API Changes - ``df.iloc[:-len(df)]`` is now empty - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse +- Better propagation/preservation of Series names when performing groupby + operations: + - ``SeriesGroupBy.agg`` will ensure that the name attribute of the original + series is propagated to the result (:issue:`6265`). + - If the function provided to ``GroupBy.apply`` returns a named series, the + name of the series will be kept as the name of the column index of the + DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates + ``DataFrame.stack`` operations where the name of the column index is used as + the name of the inserted column containing the pivoted data. + Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f0588524e16eb..cda73401d2d8d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1783,7 +1783,8 @@ def _wrap_aggregated_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: - return Series([]) + # GH #6265 + return Series([], name=self.name) def _get_index(): if self.grouper.nkeys > 1: @@ -1805,7 +1806,8 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - return Series(values, index=_get_index()) + # GH #6265 + return Series(values, index=_get_index(), name=self.name) def _aggregate_named(self, func, *args, **kwargs): result = {} @@ -2262,17 +2264,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): try: if self.axis == 0: + # GH6124 if the list of Series have a consistent name, + # then propagate that name to the result. + index = v.index.copy() + if index.name is None: + # Only propagate the series name to the result + # if all series have a consistent name. If the + # series do not have a consistent name, do + # nothing. + names = set(v.name for v in values) + if len(names) == 1: + index.name = list(names)[0] # normally use vstack as its faster than concat # and if we have mi-columns if not _np_version_under1p7 or isinstance(v.index,MultiIndex): stacked_values = np.vstack([np.asarray(x) for x in values]) - result = DataFrame(stacked_values,index=key_index,columns=v.index) + result = DataFrame(stacked_values,index=key_index,columns=index) else: # GH5788 instead of stacking; concat gets the dtypes correct from pandas.tools.merge import concat result = concat(values,keys=key_index,names=key_index.names, axis=self.axis).unstack() + result.columns = index else: stacked_values = np.vstack([np.asarray(x) for x in values]) result = DataFrame(stacked_values.T,index=v.index,columns=key_index) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4eee1d3a212e0..53e093741b63c 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2057,6 +2057,41 @@ def test_groupby_series_with_name(self): self.assertIn('A', result2) self.assertIn('B', result2) + def test_seriesgroupby_name_attr(self): + # GH 6265 + result = self.df.groupby('A')['C'] + self.assertEquals(result.count().name, 'C') + self.assertEquals(result.mean().name, 'C') + + testFunc = lambda x: np.sum(x)*2 + self.assertEquals(result.agg(testFunc).name, 'C') + + def test_groupby_name_propagation(self): + # GH 6124 + def summarize(df, name=None): + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=df.iloc[0]['A']) + + metrics = self.df.groupby('A').apply(summarize) + self.assertEqual(metrics.columns.name, None) + metrics = self.df.groupby('A').apply(summarize, 'metrics') + self.assertEqual(metrics.columns.name, 'metrics') + metrics = self.df.groupby('A').apply(summarize_random_name) + self.assertEqual(metrics.columns.name, None) + def test_groupby_nonstring_columns(self): df = DataFrame([np.arange(10) for x in range(10)]) grouped = df.groupby(0)