Skip to content

Commit 5630d9d

Browse files
committed
ENH: Added a min_count keyword to stat funcs
The current default is 1, reproducing the behavior of pandas 0.21, so the current test suite should pass. Currently, only nansum and nanprod actually do anything with `min_count`. It would not be hard to adjust the other nan* methods to use it if we want; this was just the simplest approach for now. Additional tests for the new behavior have been added.
1 parent 8e33a71 commit 5630d9d

File tree

5 files changed

+97
-29
lines changed

5 files changed

+97
-29
lines changed

pandas/core/categorical.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1956,7 +1956,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
19561956
raise TypeError(msg.format(op=name))
19571957
return func(numeric_only=numeric_only, **kwds)
19581958

1959-
def min(self, numeric_only=None, **kwargs):
1959+
def min(self, numeric_only=None, min_count=1, **kwargs):
19601960
""" The minimum value of the object.
19611961
19621962
Only ordered `Categoricals` have a minimum!
@@ -1981,7 +1981,7 @@ def min(self, numeric_only=None, **kwargs):
19811981
else:
19821982
return self.categories[pointer]
19831983

1984-
def max(self, numeric_only=None, **kwargs):
1984+
def max(self, numeric_only=None, min_count=1, **kwargs):
19851985
""" The maximum value of the object.
19861986
19871987
Only ordered `Categoricals` have a maximum!

pandas/core/generic.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7424,7 +7424,7 @@ def _add_series_only_operations(cls):
74247424

74257425
axis_descr, name, name2 = _doc_parms(cls)
74267426

7427-
def nanptp(values, axis=0, skipna=True):
7427+
def nanptp(values, axis=0, min_count=1, skipna=True):
    """
    Peak-to-peak range: the NA-aware maximum minus the NA-aware minimum.

    ``min_count`` is accepted but not used here.
    """
    # NOTE(review): ``min_count`` is placed *before* ``skipna`` here, while
    # the nanops reductions put it after ``skipna`` -- confirm that no caller
    # passes ``skipna`` positionally.
    highest = nanops.nanmax(values, axis, skipna)
    lowest = nanops.nanmin(values, axis, skipna)
    return highest - lowest
@@ -7528,6 +7528,9 @@ def _doc_parms(cls):
75287528
numeric_only : boolean, default None
75297529
Include only float, int, boolean columns. If None, will attempt to use
75307530
everything, then use only numeric data. Not implemented for Series.
7531+
min_count : int, default 1
7532+
The required number of valid values to perform the operation. If fewer than
7533+
``min_count`` non-NA values are present the result will be NA.
75317534
75327535
Returns
75337536
-------
@@ -7604,6 +7607,7 @@ def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
76047607
axis_descr=axis_descr)
76057608
@Appender(_num_doc)
76067609
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
7610+
min_count=1,
76077611
**kwargs):
76087612
nv.validate_stat_func(tuple(), kwargs, fname=name)
76097613
if skipna is None:
@@ -7614,7 +7618,7 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
76147618
return self._agg_by_level(name, axis=axis, level=level,
76157619
skipna=skipna)
76167620
return self._reduce(f, name, axis=axis, skipna=skipna,
7617-
numeric_only=numeric_only)
7621+
numeric_only=numeric_only, min_count=min_count)
76187622

76197623
return set_function_name(stat_func, name, cls)
76207624

pandas/core/indexes/datetimelike.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ def tolist(self):
479479
"""
480480
return list(self.astype(object))
481481

482-
def min(self, axis=None, *args, **kwargs):
482+
def min(self, axis=None, min_count=1, *args, **kwargs):
483483
"""
484484
Return the minimum value of the Index or minimum along
485485
an axis.
@@ -527,7 +527,7 @@ def argmin(self, axis=None, *args, **kwargs):
527527
i8[mask] = np.iinfo('int64').max
528528
return i8.argmin()
529529

530-
def max(self, axis=None, *args, **kwargs):
530+
def max(self, axis=None, min_count=1, *args, **kwargs):
531531
"""
532532
Return the maximum value of the Index or maximum along
533533
an axis.

pandas/core/nanops.py

Lines changed: 45 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -101,14 +101,13 @@ def __call__(self, alt):
101101
bn_func = None
102102

103103
@functools.wraps(alt)
104-
def f(values, axis=None, skipna=True, **kwds):
104+
def f(values, axis=None, skipna=True, min_count=1, **kwds):
105105
if len(self.kwargs) > 0:
106106
for k, v in compat.iteritems(self.kwargs):
107107
if k not in kwds:
108108
kwds[k] = v
109109
try:
110-
if values.size == 0:
111-
110+
if values.size < min_count:
112111
# we either return np.nan or pd.NaT
113112
if is_numeric_dtype(values):
114113
values = values.astype('float64')
@@ -132,7 +131,8 @@ def f(values, axis=None, skipna=True, **kwds):
132131
if _has_infs(result):
133132
result = alt(values, axis=axis, skipna=skipna, **kwds)
134133
else:
135-
result = alt(values, axis=axis, skipna=skipna, **kwds)
134+
result = alt(values, axis=axis, skipna=skipna,
135+
min_count=min_count, **kwds)
136136
except Exception:
137137
try:
138138
result = alt(values, axis=axis, skipna=skipna, **kwds)
@@ -292,34 +292,53 @@ def _wrap_results(result, dtype):
292292
return result
293293

294294

295-
def nanany(values, axis=None, skipna=True):
295+
def _na_for_min_count(values, axis):
296+
# we either return np.nan or pd.NaT
297+
if is_numeric_dtype(values):
298+
values = values.astype('float64')
299+
fill_value = na_value_for_dtype(values.dtype)
300+
301+
if values.ndim == 1:
302+
return fill_value
303+
else:
304+
result_shape = (values.shape[:axis] +
305+
values.shape[axis + 1:])
306+
result = np.empty(result_shape, dtype=values.dtype)
307+
result.fill(fill_value)
308+
return result
309+
310+
311+
def nanany(values, axis=None, skipna=True, min_count=1):
    """
    Return whether any element along ``axis`` is truthy.

    ``min_count`` is accepted for signature parity with the other
    reductions and is not used here.
    """
    # presumably ``False`` is the fill value substituted for NA entries so
    # they cannot make the result True -- confirm against ``_get_values``
    prepared, _mask, _dtype, _dtype_max = _get_values(
        values, skipna, False, copy=skipna)
    return prepared.any(axis)
298314

299315

300-
def nanall(values, axis=None, skipna=True):
316+
def nanall(values, axis=None, skipna=True, min_count=1):
    """
    Return whether every element along ``axis`` is truthy.

    ``min_count`` is accepted for signature parity with the other
    reductions and is not used here.
    """
    # presumably ``True`` is the fill value substituted for NA entries so
    # they cannot make the result False -- confirm against ``_get_values``
    prepared, _mask, _dtype, _dtype_max = _get_values(
        values, skipna, True, copy=skipna)
    return prepared.all(axis)
303319

304320

305321
@disallow('M8')
306322
@bottleneck_switch()
307-
def nansum(values, axis=None, skipna=True):
323+
def nansum(values, axis=None, skipna=True, min_count=1):
    """
    Sum ``values`` along ``axis``, returning NA when too few values exist.

    Parameters
    ----------
    values : ndarray
    axis : int or None, default None
    skipna : bool, default True
    min_count : int, default 1
        Required number of values for a non-NA result.

    Returns
    -------
    scalar or ndarray
        The sum, passed through ``_wrap_results`` with the original dtype.
    """
    # Short-circuit on the number of elements *along the reduced axis*.
    # ``len(values)`` only looks at the first dimension, which is wrong for
    # n-d input reduced along another axis.  A full reduction of n-d input
    # (``axis=None``) is left to ``_maybe_null_out``, whose scalar branch
    # compares the non-NA count against ``min_count``.
    shape = np.shape(values)
    if len(shape) == 1 or axis is not None:
        n_reduced = shape[0] if len(shape) == 1 else shape[axis]
        if n_reduced < min_count:
            return _na_for_min_count(values, axis=axis)

    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)

    # accumulator dtype: floats keep their own dtype, timedeltas are summed
    # as float64, everything else uses the dtype chosen by ``_get_values``
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    else:
        dtype_sum = dtype_max

    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)

    return _wrap_results(the_sum, dtype)
318337

319338

320339
@disallow('M8')
321340
@bottleneck_switch()
322-
def nanmean(values, axis=None, skipna=True):
341+
def nanmean(values, axis=None, skipna=True, min_count=1):
323342
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
324343

325344
dtype_sum = dtype_max
@@ -345,7 +364,7 @@ def nanmean(values, axis=None, skipna=True):
345364

346365
@disallow('M8')
347366
@bottleneck_switch()
348-
def nanmedian(values, axis=None, skipna=True):
367+
def nanmedian(values, axis=None, skipna=True, min_count=1):
349368

350369
values, mask, dtype, dtype_max = _get_values(values, skipna)
351370

@@ -405,14 +424,14 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
405424

406425
@disallow('M8')
407426
@bottleneck_switch(ddof=1)
408-
def nanstd(values, axis=None, skipna=True, ddof=1):
427+
def nanstd(values, axis=None, skipna=True, ddof=1, min_count=1):
    """
    Standard deviation: the square root of ``nanvar`` with the same
    ``axis``, ``skipna`` and ``ddof``, wrapped back to ``values.dtype``.

    ``min_count`` is accepted but is not forwarded to ``nanvar``.
    """
    variance = nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
    return _wrap_results(np.sqrt(variance), values.dtype)
411430

412431

413432
@disallow('M8')
414433
@bottleneck_switch(ddof=1)
415-
def nanvar(values, axis=None, skipna=True, ddof=1):
434+
def nanvar(values, axis=None, skipna=True, ddof=1, min_count=1):
416435

417436
values = _values_from_object(values)
418437
dtype = values.dtype
@@ -452,7 +471,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1):
452471

453472

454473
@disallow('M8', 'm8')
455-
def nansem(values, axis=None, skipna=True, ddof=1):
474+
def nansem(values, axis=None, skipna=True, ddof=1, min_count=1):
456475
var = nanvar(values, axis, skipna, ddof=ddof)
457476

458477
mask = isna(values)
@@ -492,7 +511,7 @@ def reduction(values, axis=None, skipna=True):
492511

493512

494513
@disallow('O')
495-
def nanargmax(values, axis=None, skipna=True):
514+
def nanargmax(values, axis=None, skipna=True, min_count=1):
496515
"""
497516
Returns -1 in the NA case
498517
"""
@@ -503,7 +522,7 @@ def nanargmax(values, axis=None, skipna=True):
503522

504523

505524
@disallow('O')
506-
def nanargmin(values, axis=None, skipna=True):
525+
def nanargmin(values, axis=None, skipna=True, min_count=1):
507526
"""
508527
Returns -1 in the NA case
509528
"""
@@ -514,7 +533,7 @@ def nanargmin(values, axis=None, skipna=True):
514533

515534

516535
@disallow('M8', 'm8')
517-
def nanskew(values, axis=None, skipna=True):
536+
def nanskew(values, axis=None, skipna=True, min_count=1):
518537
""" Compute the sample skewness.
519538
520539
The statistic computed here is the adjusted Fisher-Pearson standardized
@@ -573,7 +592,7 @@ def nanskew(values, axis=None, skipna=True):
573592

574593

575594
@disallow('M8', 'm8')
576-
def nankurt(values, axis=None, skipna=True):
595+
def nankurt(values, axis=None, skipna=True, min_count=1):
577596
""" Compute the sample excess kurtosis.
578597
579598
The statistic computed here is the adjusted Fisher-Pearson standardized
@@ -641,13 +660,16 @@ def nankurt(values, axis=None, skipna=True):
641660

642661

643662
@disallow('M8', 'm8')
644-
def nanprod(values, axis=None, skipna=True):
663+
def nanprod(values, axis=None, skipna=True, min_count=1):
    """
    Multiply ``values`` along ``axis``, returning NA when too few values
    exist.

    Parameters
    ----------
    values : ndarray
    axis : int or None, default None
    skipna : bool, default True
        When True (and the dtype is not an integer dtype) NA entries are
        replaced by 1 on a copy so they do not affect the product.
    min_count : int, default 1
        Required number of values for a non-NA result.

    Returns
    -------
    scalar or ndarray
    """
    # Short-circuit on the number of elements *along the reduced axis*.
    # ``len(values)`` only looks at the first dimension, which is wrong for
    # n-d input reduced along another axis.  A full reduction of n-d input
    # (``axis=None``) is left to ``_maybe_null_out``, whose scalar branch
    # compares the non-NA count against ``min_count``.
    if values.ndim == 1 or axis is not None:
        n_reduced = len(values) if values.ndim == 1 else values.shape[axis]
        if n_reduced < min_count:
            return _na_for_min_count(values, axis=axis)

    mask = isna(values)
    if skipna and not is_any_int_dtype(values):
        # work on a copy so the caller's array is not modified
        values = values.copy()
        values[mask] = 1
    result = values.prod(axis)
    return _maybe_null_out(result, axis, mask, min_count=min_count)
651673

652674

653675
def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -683,7 +705,7 @@ def _get_counts(mask, axis, dtype=float):
683705
return np.array(count, dtype=dtype)
684706

685707

686-
def _maybe_null_out(result, axis, mask):
708+
def _maybe_null_out(result, axis, mask, min_count=1):
687709
if axis is not None and getattr(result, 'ndim', False):
688710
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
689711
if np.any(null_mask):
@@ -698,7 +720,7 @@ def _maybe_null_out(result, axis, mask):
698720
result[null_mask] = None
699721
elif result is not tslib.NaT:
700722
null_mask = mask.size - mask.sum()
701-
if null_mask == 0:
723+
if null_mask < min_count:
702724
result = np.nan
703725

704726
return result
@@ -714,7 +736,7 @@ def _zero_out_fperr(arg):
714736

715737

716738
@disallow('M8', 'm8')
717-
def nancorr(a, b, method='pearson', min_periods=None):
739+
def nancorr(a, b, method='pearson', min_periods=None, min_count=1):
718740
"""
719741
a, b: ndarrays
720742
"""
@@ -761,7 +783,7 @@ def _spearman(a, b):
761783

762784

763785
@disallow('M8', 'm8')
764-
def nancov(a, b, min_periods=None):
786+
def nancov(a, b, min_periods=None, min_count=1):
765787
if len(a) != len(b):
766788
raise AssertionError('Operands to nancov must have same size')
767789

pandas/tests/series/test_analytics.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1759,6 +1759,48 @@ def test_value_counts_categorical_not_ordered(self):
17591759
tm.assert_series_equal(idx.value_counts(normalize=True), exp)
17601760

17611761

1762+
class TestMinCount(object):
    """
    Checks of the ``min_count`` keyword on ``Series.sum`` / ``Series.prod``.

    Each test is parametrized over the reduction (paired with its identity
    element: 0 for sum, 1 for prod) and over the ``compute.use_bottleneck``
    option.
    """

    @pytest.mark.parametrize("use_bottleneck", [True, False])
    @pytest.mark.parametrize("method", [("sum", 0), ("prod", 1)])
    def test_min_count_empty(self, method, use_bottleneck):
        method, unit = method

        # empty Series: identity for min_count=0, NA once a value is required
        s = pd.Series()
        with pd.option_context("compute.use_bottleneck", use_bottleneck):
            result = getattr(s, method)(min_count=0)
            assert result == unit

            result = getattr(s, method)(min_count=1)
            assert np.isnan(result)

        # single value: NA only when more than one value is required
        s = pd.Series([1])
        with pd.option_context("compute.use_bottleneck", use_bottleneck):
            result = getattr(s, method)(min_count=0)
            assert result == 1

            result = getattr(s, method)(min_count=1)
            assert result == 1

            result = getattr(s, method)(min_count=2)
            assert np.isnan(result)

    @pytest.mark.parametrize("use_bottleneck", [True, False])
    @pytest.mark.parametrize("method", [("sum", 0), ("prod", 1)])
    def test_min_count_with_na(self, method, use_bottleneck):
        method, unit = method

        # all-NA Series: NA values do not count towards min_count
        s = pd.Series([np.nan])
        with pd.option_context("compute.use_bottleneck", use_bottleneck):
            result = getattr(s, method)(min_count=0)
            assert result == unit

            result = getattr(s, method)(min_count=1)
            assert np.isnan(result)

        # one NA plus one valid value; kept inside the option context so the
        # ``use_bottleneck`` parametrization applies to this case too
        s = pd.Series([np.nan, unit])
        with pd.option_context("compute.use_bottleneck", use_bottleneck):
            result = getattr(s, method)(min_count=1)
            assert result == unit
1803+
17621804
@pytest.fixture
17631805
def s_main_dtypes():
17641806
df = pd.DataFrame(

0 commit comments

Comments
 (0)