diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a5032c590300c..accbf4468405d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1722,6 +1722,82 @@ def map(self, mapper, na_action=None): """ return map_array(self, mapper, na_action=na_action) + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ) -> ArrayLike: + """ + Dispatch GroupBy reduction or transformation operation. + + This is an *experimental* API to allow ExtensionArray authors to implement + reductions and transformations. The API is subject to change. + + Parameters + ---------- + how : {'any', 'all', 'sum', 'prod', 'min', 'max', 'mean', 'median', + 'var', 'std', 'sem', 'skew', 'first', 'last', 'ohlc', + 'cumprod', 'cumsum', 'cummin', 'cummax', 'rank'} + has_dropped_na : bool + min_count : int + ngroups : int + ids : np.ndarray[np.intp] + ids[i] gives the integer label for the group that self[i] belongs to. 
+ **kwargs : operation-specific + 'any', 'all' -> ['skipna'] + 'var', 'std', 'sem' -> ['ddof'] + 'cumprod', 'cumsum', 'cummin', 'cummax' -> ['skipna'] + 'rank' -> ['ties_method', 'ascending', 'na_option', 'pct'] + + Returns + ------- + np.ndarray or ExtensionArray + """ + from pandas.core.arrays.string_ import StringDtype + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + # GH#43682 + if isinstance(self.dtype, StringDtype): + # StringArray + npvalues = self.to_numpy(object, na_value=np.nan) + else: + raise NotImplementedError( + f"function is not implemented for this dtype: {self.dtype}" + ) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=None, + **kwargs, + ) + + if op.how in op.cast_blocklist: + # i.e. how in ["rank"], since other cast_blocklist methods don't go + # through cython_operation + return res_values + + if isinstance(self.dtype, StringDtype): + dtype = self.dtype + string_array_cls = dtype.construct_array_type() + return string_array_cls._from_sequence(res_values, dtype=dtype) + + else: + raise NotImplementedError + class ExtensionArraySupportsAnyAll(ExtensionArray): def any(self, *, skipna: bool = True) -> bool: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0219d97f12e8f..e1c9410946164 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2386,6 +2386,65 @@ def _str_get_dummies(self, sep: str = "|"): return PandasArray(self.astype(str))._str_get_dummies(sep) + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + from pandas.core.groupby.ops import WrappedCythonOp + + kind = 
WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + dtype = self.dtype + if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: + raise TypeError(f"{dtype} type does not support {how} operations") + if how in ["min", "max", "rank"] and not dtype.ordered: + # raise TypeError instead of NotImplementedError to ensure we + # don't go down a group-by-group path, since in the empty-groups + # case that would fail to raise + raise TypeError(f"Cannot perform {how} with non-ordered Categorical") + if how not in ["rank", "any", "all", "first", "last", "min", "max"]: + if kind == "transform": + raise TypeError(f"{dtype} type does not support {how} operations") + raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") + + result_mask = None + mask = self.isna() + if how == "rank": + assert self.ordered # checked earlier + npvalues = self._ndarray + elif how in ["first", "last", "min", "max"]: + npvalues = self._ndarray + result_mask = np.zeros(ngroups, dtype=bool) + else: + # any/all + npvalues = self.astype(bool) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + if how in op.cast_blocklist: + return res_values + elif how in ["first", "last", "min", "max"]: + res_values[result_mask == 1] = -1 + return self._from_backing_data(res_values) + # The Series.cat accessor diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c036dc09948d8..3b903cca3ed0c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1549,6 +1549,88 @@ def _mode(self, dropna: bool = True): npmodes = cast(np.ndarray, npmodes) return self._from_backing_data(npmodes) + # ------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: 
int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + dtype = self.dtype + if dtype.kind == "M": + # Adding/multiplying datetimes is not valid + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + raise TypeError(f"datetime64 type does not support {how} operations") + if how in ["any", "all"]: + # GH#34479 + warnings.warn( + f"'{how}' with datetime64 dtypes is deprecated and will raise in a " + f"future version. Use (obj != pd.Timestamp(0)).{how}() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + elif isinstance(dtype, PeriodDtype): + # Adding/multiplying Periods is not valid + if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: + raise TypeError(f"Period type does not support {how} operations") + if how in ["any", "all"]: + # GH#34479 + warnings.warn( + f"'{how}' with PeriodDtype is deprecated and will raise in a " + f"future version. Use (obj != pd.Period(0, freq)).{how}() instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + # timedeltas we can add but not multiply + if how in ["prod", "cumprod", "skew"]: + raise TypeError(f"timedelta64 type does not support {how} operations") + + # All of the functions implemented here are ordinal, so we can + # operate on the tz-naive equivalents + npvalues = self._ndarray.view("M8[ns]") + + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + res_values = op._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=None, + **kwargs, + ) + + if op.how in op.cast_blocklist: + # i.e. 
how in ["rank"], since other cast_blocklist methods don't go + # through cython_operation + return res_values + + # We did a view to M8[ns] above, now we go the other direction + assert res_values.dtype == "M8[ns]" + if how in ["std", "sem"]: + from pandas.core.arrays import TimedeltaArray + + if isinstance(self.dtype, PeriodDtype): + raise TypeError("'std' and 'sem' are not valid for PeriodDtype") + self = cast("DatetimeArray | TimedeltaArray", self) + new_dtype = f"m8[{self.unit}]" + res_values = res_values.view(new_dtype) + return TimedeltaArray(res_values) + + res_values = res_values.view(self._ndarray.dtype) + return self._from_backing_data(res_values) + class DatelikeOps(DatetimeLikeArrayMixin): """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index aa3516c3ecb4f..c95e54ecbe5a3 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1382,3 +1382,46 @@ def _accumulate( data, mask = op(data, mask, skipna=skipna, **kwargs) return type(self)(data, mask, copy=False) + + # ------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + from pandas.core.groupby.ops import WrappedCythonOp + + kind = WrappedCythonOp.get_kind_from_how(how) + op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + + # libgroupby functions are responsible for NOT altering mask + mask = self._mask + if op.kind != "aggregate": + result_mask = mask.copy() + else: + result_mask = np.zeros(ngroups, dtype=bool) + + res_values = op._cython_op_ndim_compat( + self._data, + min_count=min_count, + ngroups=ngroups, + comp_ids=ids, + mask=mask, + result_mask=result_mask, + **kwargs, + ) + + if op.how == "ohlc": + arity = op._cython_arity.get(op.how, 1) + result_mask = np.tile(result_mask, (arity, 1)).T + + # res_values should already have the correct dtype, 
we just need to + # wrap in a MaskedArray + return self._maybe_mask_result(res_values, result_mask) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index f55fde9c75e4b..b628b750eb3c5 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1806,6 +1806,21 @@ def _formatter(self, boxed: bool = False): # This will infer the correct formatter from the dtype of the values. return None + # ------------------------------------------------------------------------ + # GroupBy Methods + + def _groupby_op( + self, + *, + how: str, + has_dropped_na: bool, + min_count: int, + ngroups: int, + ids: npt.NDArray[np.intp], + **kwargs, + ): + raise NotImplementedError(f"{self.dtype} dtype not supported") + def _make_sparse( arr: np.ndarray, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 55155ea41f430..e4149dbf3969a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -18,7 +18,6 @@ Sequence, final, ) -import warnings import numpy as np @@ -31,14 +30,12 @@ from pandas._typing import ( ArrayLike, AxisInt, - DtypeObj, NDFrameT, Shape, npt, ) from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, @@ -52,33 +49,16 @@ is_1d_only_ea_dtype, is_bool_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, is_numeric_dtype, - is_period_dtype, - is_sparse, - is_timedelta64_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, ) -from pandas.core.arrays import ( - Categorical, - DatetimeArray, - ExtensionArray, - PeriodArray, - TimedeltaArray, -) -from pandas.core.arrays.masked import ( - BaseMaskedArray, - BaseMaskedDtype, -) -from pandas.core.arrays.string_ import StringDtype from 
pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -155,6 +135,12 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: _cython_arity = {"ohlc": 4} # OHLC + @classmethod + def get_kind_from_how(cls, how: str) -> str: + if how in cls._CYTHON_FUNCTIONS["aggregate"]: + return "aggregate" + return "transform" + # Note: we make this a classmethod and pass kind+how so that caching # works at the class level and not the instance level @classmethod @@ -237,70 +223,6 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: return values - # TODO: general case implementation overridable by EAs. - def _disallow_invalid_ops(self, dtype: DtypeObj): - """ - Check if we can do this operation with our cython functions. - - Raises - ------ - TypeError - This is not a valid operation for this dtype. - NotImplementedError - This may be a valid operation, but does not have a cython implementation. - """ - how = self.how - - if is_numeric_dtype(dtype): - # never an invalid op for those dtypes, so return early as fastpath - return - - if isinstance(dtype, CategoricalDtype): - if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: - raise TypeError(f"{dtype} type does not support {how} operations") - if how in ["min", "max", "rank"] and not dtype.ordered: - # raise TypeError instead of NotImplementedError to ensure we - # don't go down a group-by-group path, since in the empty-groups - # case that would fail to raise - raise TypeError(f"Cannot perform {how} with non-ordered Categorical") - if how not in ["rank", "any", "all", "first", "last", "min", "max"]: - if self.kind == "transform": - raise TypeError(f"{dtype} type does not support {how} operations") - raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") - - elif is_sparse(dtype): - raise NotImplementedError(f"{dtype} dtype not supported") - elif is_datetime64_any_dtype(dtype): - # Adding/multiplying datetimes is not valid 
- if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: - raise TypeError(f"datetime64 type does not support {how} operations") - if how in ["any", "all"]: - # GH#34479 - warnings.warn( - f"'{how}' with datetime64 dtypes is deprecated and will raise in a " - f"future version. Use (obj != pd.Timestamp(0)).{how}() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - elif is_period_dtype(dtype): - # Adding/multiplying Periods is not valid - if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]: - raise TypeError(f"Period type does not support {how} operations") - if how in ["any", "all"]: - # GH#34479 - warnings.warn( - f"'{how}' with PeriodDtype is deprecated and will raise in a " - f"future version. Use (obj != pd.Period(0, freq)).{how}() instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - elif is_timedelta64_dtype(dtype): - # timedeltas we can add but not multiply - if how in ["prod", "cumprod", "skew"]: - raise TypeError(f"timedelta64 type does not support {how} operations") - def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: how = self.how kind = self.kind @@ -358,163 +280,6 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: return np.dtype(np.float64) return dtype - @final - def _ea_wrap_cython_operation( - self, - values: ExtensionArray, - min_count: int, - ngroups: int, - comp_ids: np.ndarray, - **kwargs, - ) -> ArrayLike: - """ - If we have an ExtensionArray, unwrap, call _cython_operation, and - re-wrap if appropriate. 
- """ - if isinstance(values, BaseMaskedArray): - return self._masked_ea_wrap_cython_operation( - values, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - **kwargs, - ) - - elif isinstance(values, Categorical): - assert self.how in ["rank", "any", "all", "first", "last", "min", "max"] - mask = values.isna() - if self.how == "rank": - assert values.ordered # checked earlier - npvalues = values._ndarray - elif self.how in ["first", "last", "min", "max"]: - if self.how in ["min", "max"]: - assert values.ordered # checked earlier - npvalues = values._ndarray - result_mask = np.zeros(ngroups, dtype=np.uint8) - kwargs["result_mask"] = result_mask - else: - npvalues = values.astype(bool) - - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=mask, - **kwargs, - ) - - if self.how in self.cast_blocklist: - return res_values - elif self.how in ["first", "last", "min", "max"]: - res_values[result_mask == 1] = -1 - return values._from_backing_data(res_values) - - npvalues = self._ea_to_cython_values(values) - - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=None, - **kwargs, - ) - - if self.how in self.cast_blocklist: - # i.e. how in ["rank"], since other cast_blocklist methods don't go - # through cython_operation - return res_values - - return self._reconstruct_ea_result(values, res_values) - - # TODO: general case implementation overridable by EAs. 
- def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray: - # GH#43682 - if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): - # All of the functions implemented here are ordinal, so we can - # operate on the tz-naive equivalents - npvalues = values._ndarray.view("M8[ns]") - elif isinstance(values.dtype, StringDtype): - # StringArray - npvalues = values.to_numpy(object, na_value=np.nan) - else: - raise NotImplementedError( - f"function is not implemented for this dtype: {values.dtype}" - ) - return npvalues - - # TODO: general case implementation overridable by EAs. - def _reconstruct_ea_result( - self, values: ExtensionArray, res_values: np.ndarray - ) -> ExtensionArray: - """ - Construct an ExtensionArray result from an ndarray result. - """ - dtype: BaseMaskedDtype | StringDtype - - if isinstance(values.dtype, StringDtype): - dtype = values.dtype - string_array_cls = dtype.construct_array_type() - return string_array_cls._from_sequence(res_values, dtype=dtype) - - elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)): - # In to_cython_values we took a view as M8[ns] - assert res_values.dtype == "M8[ns]" - if self.how in ["std", "sem"]: - if isinstance(values, PeriodArray): - raise TypeError("'std' and 'sem' are not valid for PeriodDtype") - new_dtype = f"m8[{values.unit}]" - res_values = res_values.view(new_dtype) - return TimedeltaArray(res_values) - - res_values = res_values.view(values._ndarray.dtype) - return values._from_backing_data(res_values) - - raise NotImplementedError - - @final - def _masked_ea_wrap_cython_operation( - self, - values: BaseMaskedArray, - min_count: int, - ngroups: int, - comp_ids: np.ndarray, - **kwargs, - ) -> BaseMaskedArray: - """ - Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's - and cython algorithms which accept a mask. 
- """ - orig_values = values - - # libgroupby functions are responsible for NOT altering mask - mask = values._mask - if self.kind != "aggregate": - result_mask = mask.copy() - else: - result_mask = np.zeros(ngroups, dtype=bool) - - arr = values._data - - res_values = self._cython_op_ndim_compat( - arr, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=mask, - result_mask=result_mask, - **kwargs, - ) - - if self.how == "ohlc": - arity = self._cython_arity.get(self.how, 1) - result_mask = np.tile(result_mask, (arity, 1)).T - - # res_values should already have the correct dtype, we just need to - # wrap in a MaskedArray - return orig_values._maybe_mask_result(res_values, result_mask) - @final def _cython_op_ndim_compat( self, @@ -707,6 +472,17 @@ def _call_cython_op( return op_result + @final + def _validate_axis(self, axis: AxisInt, values: ArrayLike) -> None: + if values.ndim > 2: + raise NotImplementedError("number of dimensions is currently limited to 2") + if values.ndim == 2: + assert axis == 1, axis + elif not is_1d_only_ea_dtype(values.dtype): + # Note: it is *not* the case that axis is always 0 for 1-dim values, + # as we can have 1D ExtensionArrays that we need to treat as 2D + assert axis == 0 + @final def cython_operation( self, @@ -721,26 +497,16 @@ def cython_operation( """ Call our cython function, with appropriate pre- and post- processing. """ - if values.ndim > 2: - raise NotImplementedError("number of dimensions is currently limited to 2") - if values.ndim == 2: - assert axis == 1, axis - elif not is_1d_only_ea_dtype(values.dtype): - # Note: it is *not* the case that axis is always 0 for 1-dim values, - # as we can have 1D ExtensionArrays that we need to treat as 2D - assert axis == 0 - - # can we do this operation with our cython functions - # if not raise NotImplementedError - self._disallow_invalid_ops(values.dtype) + self._validate_axis(axis, values) if not isinstance(values, np.ndarray): # i.e. 
ExtensionArray - return self._ea_wrap_cython_operation( - values, + return values._groupby_op( + how=self.how, + has_dropped_na=self.has_dropped_na, min_count=min_count, ngroups=ngroups, - comp_ids=comp_ids, + ids=comp_ids, **kwargs, )