From cc43503341cfee4b99f020bf780621c98d844acd Mon Sep 17 00:00:00 2001 From: Stephen Rauch Date: Thu, 16 Feb 2017 11:09:48 -0800 Subject: [PATCH 1/3] BUG: GH15429 transform result of timedelta from datetime The transform() operation needs to return a like-indexed result. To facilitate this, transform starts with a copy of the original series. Then, after the computation for each group, it sets the appropriate elements of the copied series equal to the result. At that point it does a type comparison, and discovers that the timedelta is not cast-able to a datetime. --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/groupby.py | 13 +++++++-- pandas/tests/groupby/test_transform.py | 39 +++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index be487e165c602..9298d4b058d98 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -626,6 +626,7 @@ Bug Fixes - Bug in ``.read_csv()`` with ``parse_dates`` when multiline headers are specified (:issue:`15376`) +- Bug in ``groupby.transform()`` that would coerce the resultant dtypes back to the original (:issue:`10972`) - Bug in ``DataFrame.boxplot`` where ``fontsize`` was not applied to the tick labels on both axes (:issue:`15108`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 831ca3886773e..e21b560654e1f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -31,7 +31,7 @@ _ensure_object, _ensure_categorical, _ensure_float) -from pandas.types.cast import _possibly_downcast_to_dtype +from pandas.types.cast import _possibly_downcast_to_dtype, _find_common_type from pandas.types.missing import isnull, notnull, _maybe_fill from pandas.core.common import (_values_from_object, AbstractMethodError, @@ -2906,8 +2906,15 @@ def transform(self, func, *args, **kwargs): common_type = np.common_type(np.array(res), result) if common_type != result.dtype: result = 
result.astype(common_type) - except: - pass + except Exception as exc: + # date math can cause type of result to change + if i == 0 and (is_datetime64_dtype(result.dtype) or + is_timedelta64_dtype(result.dtype)): + try: + dtype = res.dtype + except Exception as exc: + dtype = type(res) + result = np.empty_like(result, dtype) indexer = self._get_index(name) result[indexer] = res diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index cf5e9eb26ff13..51920ec642705 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd from pandas.util import testing as tm -from pandas import Series, DataFrame, Timestamp, MultiIndex, concat +from pandas import Series, DataFrame, Timestamp, MultiIndex, concat, date_range from pandas.types.common import _ensure_platform_int from .common import MixIn, assert_fp_equal @@ -190,6 +190,43 @@ def test_transform_bug(self): expected = Series(np.arange(5, 0, step=-1), name='B') assert_series_equal(result, expected) + def test_transform_datetime_to_timedelta(self): + # GH 15429 + # transforming a datetime to timedelta + df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) + expected = pd.Series([ + Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') + + # this does date math without changing result type in transform + base_time = df['A'][0] + result = df.groupby('A')['A'].transform( + lambda x: x.max() - x.min() + base_time) - base_time + assert_series_equal(result, expected) + + # this does date math and causes the transform to return timedelta + result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) + assert_series_equal(result, expected) + + def test_transform_datetime_to_numeric(self): + # GH 10972 + # convert dt to float + df = DataFrame({ + 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) + result = df.groupby('a').b.transform( + lambda x: x.dt.dayofweek - 
x.dt.dayofweek.mean()) + + expected = Series([-0.5, 0.5], name='b') + assert_series_equal(result, expected) + + # convert dt to int + df = DataFrame({ + 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) + result = df.groupby('a').b.transform( + lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) + + expected = Series([0, 1], name='b') + assert_series_equal(result, expected) + def test_transform_multiple(self): grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) From 2f485493ae0687da0bc6ef16c29298272aa43437 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Feb 2017 08:42:19 -0500 Subject: [PATCH 2/3] fixup slow transforms --- pandas/core/groupby.py | 41 ++++++++++++---------------- pandas/tests/groupby/test_filters.py | 1 + 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index e21b560654e1f..0f02357473a80 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2890,39 +2890,32 @@ def transform(self, func, *args, **kwargs): lambda: getattr(self, func)(*args, **kwargs)) # reg transform - dtype = self._selected_obj.dtype - result = self._selected_obj.values.copy() - + klass = self._selected_obj.__class__ + results = [] wrapper = lambda x: func(x, *args, **kwargs) - for i, (name, group) in enumerate(self): + for name, group in self: object.__setattr__(group, 'name', name) res = wrapper(group) if hasattr(res, 'values'): res = res.values - # may need to astype - try: - common_type = np.common_type(np.array(res), result) - if common_type != result.dtype: - result = result.astype(common_type) - except Exception as exc: - # date math can cause type of result to change - if i == 0 and (is_datetime64_dtype(result.dtype) or - is_timedelta64_dtype(result.dtype)): - try: - dtype = res.dtype - except Exception as exc: - dtype = type(res) - result = np.empty_like(result, dtype) - indexer = self._get_index(name) - result[indexer] = res + s = klass(res, indexer) + 
results.append(s) - result = _possibly_downcast_to_dtype(result, dtype) - return self._selected_obj.__class__(result, - index=self._selected_obj.index, - name=self._selected_obj.name) + from pandas.tools.concat import concat + result = concat(results).sort_index() + + # we will only try to coerce the result type if + # we have a numeric dtype + dtype = self._selected_obj.dtype + if is_numeric_dtype(dtype): + result = _possibly_downcast_to_dtype(result, dtype) + + result.name = self._selected_obj.name + result.index = self._selected_obj.index + return result def _transform_fast(self, func): """ diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 46ddb5a5318fb..de6757786a363 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -216,6 +216,7 @@ def test_filter_against_workaround(self): grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) From c3b0dd086548156e14b5c97706bec63889461962 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Feb 2017 10:37:29 -0500 Subject: [PATCH 3/3] PEP fix --- pandas/core/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0f02357473a80..2c61a73d6814e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -31,7 +31,7 @@ _ensure_object, _ensure_categorical, _ensure_float) -from pandas.types.cast import _possibly_downcast_to_dtype, _find_common_type +from pandas.types.cast import _possibly_downcast_to_dtype from pandas.types.missing import isnull, notnull, _maybe_fill from pandas.core.common import (_values_from_object, AbstractMethodError,