Skip to content

Commit 39bd3d3

Browse files
authored
BUG/TST: non-numeric EA reductions (#59234)
* BUG/TST: non-numeric EA reductions
* whatsnew
* add keepdims keyword to StringArray._reduce
1 parent 2a9855b commit 39bd3d3

File tree

9 files changed

+81
-19
lines changed

9 files changed

+81
-19
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ ExtensionArray
616616
^^^^^^^^^^^^^^
617617
- Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
618618
- Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
619+
- Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`)
619620

620621
Styler
621622
^^^^^^

pandas/core/arrays/arrow/array.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,8 +1706,6 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
17061706
if name == "median":
17071707
# GH 52679: Use quantile instead of approximate_median; returns array
17081708
result = result[0]
1709-
if pc.is_null(result).as_py():
1710-
return result
17111709

17121710
if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
17131711
result = result.cast(pa_type)

pandas/core/arrays/base.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1986,7 +1986,10 @@ def _reduce(
19861986
)
19871987
result = meth(skipna=skipna, **kwargs)
19881988
if keepdims:
1989-
result = np.array([result])
1989+
if name in ["min", "max"]:
1990+
result = self._from_sequence([result], dtype=self.dtype)
1991+
else:
1992+
result = np.array([result])
19901993

19911994
return result
19921995

pandas/core/arrays/datetimes.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2275,6 +2275,19 @@ def to_julian_date(self) -> npt.NDArray[np.float64]:
22752275
# -----------------------------------------------------------------
22762276
# Reductions
22772277

2278+
def _reduce(
2279+
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
2280+
):
2281+
result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
2282+
if keepdims and isinstance(result, np.ndarray):
2283+
if name == "std":
2284+
from pandas.core.arrays import TimedeltaArray
2285+
2286+
return TimedeltaArray._from_sequence(result)
2287+
else:
2288+
return self._from_sequence(result, dtype=self.dtype)
2289+
return result
2290+
22782291
def std(
22792292
self,
22802293
axis=None,

pandas/core/arrays/period.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,17 @@ def _check_timedeltalike_freq_compat(self, other):
956956
delta = delta.view("i8")
957957
return lib.item_from_zerodim(delta)
958958

959+
# ------------------------------------------------------------------
960+
# Reductions
961+
962+
def _reduce(
963+
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
964+
):
965+
result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
966+
if keepdims and isinstance(result, np.ndarray):
967+
return self._from_sequence(result, dtype=self.dtype)
968+
return result
969+
959970

960971
def raise_on_incompatible(left, right) -> IncompatibleFrequency:
961972
"""

pandas/core/arrays/string_.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -522,10 +522,19 @@ def astype(self, dtype, copy: bool = True):
522522
return super().astype(dtype, copy)
523523

524524
def _reduce(
525-
self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
525+
self,
526+
name: str,
527+
*,
528+
skipna: bool = True,
529+
keepdims: bool = False,
530+
axis: AxisInt | None = 0,
531+
**kwargs,
526532
):
527533
if name in ["min", "max"]:
528-
return getattr(self, name)(skipna=skipna, axis=axis)
534+
result = getattr(self, name)(skipna=skipna, axis=axis)
535+
if keepdims:
536+
return self._from_sequence([result], dtype=self.dtype)
537+
return result
529538

530539
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
531540

pandas/tests/extension/base/reduce.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import pandas as pd
66
import pandas._testing as tm
7-
from pandas.api.types import is_numeric_dtype
87

98

109
class BaseReduceTests:
@@ -119,8 +118,6 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
119118
def test_reduce_frame(self, data, all_numeric_reductions, skipna):
120119
op_name = all_numeric_reductions
121120
ser = pd.Series(data)
122-
if not is_numeric_dtype(ser.dtype):
123-
pytest.skip(f"{ser.dtype} is not numeric dtype")
124121

125122
if op_name in ["count", "kurt", "sem"]:
126123
pytest.skip(f"{op_name} not an array method")

pandas/tests/extension/test_arrow.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,10 @@
6767

6868
pa = pytest.importorskip("pyarrow")
6969

70-
from pandas.core.arrays.arrow.array import ArrowExtensionArray
70+
from pandas.core.arrays.arrow.array import (
71+
ArrowExtensionArray,
72+
get_unit_from_pa_dtype,
73+
)
7174
from pandas.core.arrays.arrow.extension_types import ArrowPeriodType
7275

7376

@@ -505,6 +508,16 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
505508
# behavior which does not support this.
506509
return False
507510

511+
if pa.types.is_boolean(pa_dtype) and op_name in [
512+
"median",
513+
"std",
514+
"var",
515+
"skew",
516+
"kurt",
517+
"sem",
518+
]:
519+
return False
520+
508521
return True
509522

510523
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
@@ -540,18 +553,9 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, reque
540553
f"pyarrow={pa.__version__} for {pa_dtype}"
541554
),
542555
)
543-
if all_numeric_reductions in {"skew", "kurt"} and (
544-
dtype._is_numeric or dtype.kind == "b"
545-
):
556+
if all_numeric_reductions in {"skew", "kurt"} and dtype._is_numeric:
546557
request.applymarker(xfail_mark)
547558

548-
elif pa.types.is_boolean(pa_dtype) and all_numeric_reductions in {
549-
"sem",
550-
"std",
551-
"var",
552-
"median",
553-
}:
554-
request.applymarker(xfail_mark)
555559
super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
556560

557561
@pytest.mark.parametrize("skipna", [True, False])
@@ -574,15 +578,32 @@ def test_reduce_series_boolean(
574578
return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna)
575579

576580
def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
581+
pa_type = arr._pa_array.type
582+
577583
if op_name in ["max", "min"]:
578584
cmp_dtype = arr.dtype
585+
elif pa.types.is_temporal(pa_type):
586+
if op_name in ["std", "sem"]:
587+
if pa.types.is_duration(pa_type):
588+
cmp_dtype = arr.dtype
589+
elif pa.types.is_date(pa_type):
590+
cmp_dtype = ArrowDtype(pa.duration("s"))
591+
elif pa.types.is_time(pa_type):
592+
unit = get_unit_from_pa_dtype(pa_type)
593+
cmp_dtype = ArrowDtype(pa.duration(unit))
594+
else:
595+
cmp_dtype = ArrowDtype(pa.duration(pa_type.unit))
596+
else:
597+
cmp_dtype = arr.dtype
579598
elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
580599
if op_name not in ["median", "var", "std"]:
581600
cmp_dtype = arr.dtype
582601
else:
583602
cmp_dtype = "float64[pyarrow]"
584603
elif op_name in ["median", "var", "std", "mean", "skew"]:
585604
cmp_dtype = "float64[pyarrow]"
605+
elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type):
606+
cmp_dtype = "uint64[pyarrow]"
586607
else:
587608
cmp_dtype = {
588609
"i": "int64[pyarrow]",
@@ -598,6 +619,10 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
598619
if data.dtype._is_numeric:
599620
mark = pytest.mark.xfail(reason="skew not implemented")
600621
request.applymarker(mark)
622+
elif op_name == "std" and pa.types.is_date64(data._pa_array.type) and skipna:
623+
# overflow
624+
mark = pytest.mark.xfail(reason="Cannot cast")
625+
request.applymarker(mark)
601626
return super().test_reduce_frame(data, all_numeric_reductions, skipna)
602627

603628
@pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])

pandas/tests/extension/test_datetime.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,11 @@ def _get_expected_exception(self, op_name, obj, other):
9595
return None
9696
return super()._get_expected_exception(op_name, obj, other)
9797

98+
def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
99+
if op_name == "std":
100+
return "timedelta64[ns]"
101+
return arr.dtype
102+
98103
def _supports_accumulation(self, ser, op_name: str) -> bool:
99104
return op_name in ["cummin", "cummax"]
100105

0 commit comments

Comments
 (0)