Skip to content

Commit 8631741

Browse files
author
Joris Vankerschaver
committed
BUG: Use stable algorithm for _nanvar.
1 parent 57ba8dc commit 8631741

File tree

5 files changed

+167
-39
lines changed

5 files changed

+167
-39
lines changed

doc/source/whatsnew/v0.17.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -923,3 +923,5 @@ Bug Fixes
923923
- Bug in plotting functions may raise ``IndexError`` when plotted on ``GridSpec`` (:issue:`10819`)
924924
- Bug in plot result may show unnecessary minor ticklabels (:issue:`10657`)
925925
- Bug in ``groupby`` incorrect computation for aggregation on ``DataFrame`` with ``NaT`` (E.g ``first``, ``last``, ``min``). (:issue:`10590`)
926+
- Bug when constructing ``DataFrame`` where passing a dictionary with only scalar values and specifying columns did not raise an error (:issue:`10856`)
927+
- Bug in ``.var()`` causing roundoff errors for highly similar values (:issue:`10242`)

pandas/core/nanops.py

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -346,11 +346,22 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
346346
return count, d
347347

348348

349-
def _nanvar(values, axis=None, skipna=True, ddof=1):
350-
# private nanvar calculator
349+
@disallow('M8')
350+
@bottleneck_switch(ddof=1)
351+
def nanstd(values, axis=None, skipna=True, ddof=1):
352+
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
353+
return _wrap_results(result, values.dtype)
354+
355+
356+
@disallow('M8')
357+
@bottleneck_switch(ddof=1)
358+
def nanvar(values, axis=None, skipna=True, ddof=1):
359+
360+
dtype = values.dtype
351361
mask = isnull(values)
352362
if is_any_int_dtype(values):
353363
values = values.astype('f8')
364+
values[mask] = np.nan
354365

355366
if is_float_dtype(values):
356367
count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
@@ -361,36 +372,35 @@ def _nanvar(values, axis=None, skipna=True, ddof=1):
361372
values = values.copy()
362373
np.putmask(values, mask, 0)
363374

364-
X = _ensure_numeric(values.sum(axis))
365-
XX = _ensure_numeric((values ** 2).sum(axis))
366-
result = np.fabs((XX - X * X / count) / d)
367-
return result
368-
369-
@disallow('M8')
370-
@bottleneck_switch(ddof=1)
371-
def nanstd(values, axis=None, skipna=True, ddof=1):
372-
373-
result = np.sqrt(_nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
375+
# Compute variance via two-pass algorithm, which is stable against
376+
# cancellation errors and relatively accurate for small numbers of
377+
# observations.
378+
#
379+
# See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
380+
avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
381+
if axis is not None:
382+
avg = np.expand_dims(avg, axis)
383+
sqr = _ensure_numeric((avg - values) ** 2)
384+
np.putmask(sqr, mask, 0)
385+
result = sqr.sum(axis=axis, dtype=np.float64) / d
386+
387+
# Return variance as np.float64 (the datatype used in the accumulator),
388+
# unless we were dealing with a float array, in which case use the same
389+
# precision as the original values array.
390+
if is_float_dtype(dtype):
391+
result = result.astype(dtype)
374392
return _wrap_results(result, values.dtype)
375393

376-
@disallow('M8','m8')
377-
@bottleneck_switch(ddof=1)
378-
def nanvar(values, axis=None, skipna=True, ddof=1):
379394

380-
# we are going to allow timedelta64[ns] here
381-
# but NOT going to coerce them to the Timedelta type
382-
# as this could cause overflow
383-
# so var cannot be computed (but std can!)
384-
return _nanvar(values, axis=axis, skipna=skipna, ddof=ddof)
385-
386-
@disallow('M8','m8')
395+
@disallow('M8', 'm8')
387396
def nansem(values, axis=None, skipna=True, ddof=1):
388397
var = nanvar(values, axis, skipna, ddof=ddof)
389398

390399
mask = isnull(values)
391400
if not is_float_dtype(values.dtype):
392401
values = values.astype('f8')
393402
count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
403+
var = nanvar(values, axis, skipna, ddof=ddof)
394404

395405
return np.sqrt(var) / np.sqrt(count)
396406

pandas/tests/test_frame.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12589,11 +12589,11 @@ def test_numeric_only_flag(self):
1258912589
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
1259012590
assert_series_equal(expected, result)
1259112591

12592-
assertRaisesRegexp(TypeError, 'float',
12593-
getattr(df1, meth), axis=1, numeric_only=False)
12594-
12595-
assertRaisesRegexp(TypeError, 'float',
12596-
getattr(df2, meth), axis=1, numeric_only=False)
12592+
try:
12593+
getattr(df1, meth)(axis=1, numeric_only=False)
12594+
getattr(df2, meth)(axis=1, numeric_only=False)
12595+
except (TypeError, ValueError) as e:
12596+
self.assertIn('float', str(e))
1259712597

1259812598
def test_sem(self):
1259912599
alt = lambda x: np.std(x, ddof=1)/np.sqrt(len(x))

pandas/tests/test_nanops.py

Lines changed: 126 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def check_fun_data(self, testfunc, targfunc,
190190
if skipna and axis is None:
191191
res = testfunc(testarval, **kwargs)
192192
self.check_results(targ, res, axis)
193-
except AssertionError as exc:
193+
except BaseException as exc:
194194
exc.args += ('axis: %s of %s' % (axis, testarval.ndim-1),
195195
'skipna: %s' % skipna,
196196
'kwargs: %s' % kwargs)
@@ -222,7 +222,7 @@ def check_fun(self, testfunc, targfunc,
222222
try:
223223
self.check_fun_data(testfunc, targfunc,
224224
testarval, targarval, targarnanval, **kwargs)
225-
except AssertionError as exc:
225+
except BaseException as exc:
226226
exc.args += ('testar: %s' % testar,
227227
'targar: %s' % targar,
228228
'targarnan: %s' % targarnan)
@@ -295,7 +295,7 @@ def check_funs_ddof(self, testfunc, targfunc,
295295
allow_complex, allow_all_nan, allow_str,
296296
allow_date, allow_tdelta, allow_obj,
297297
ddof=ddof)
298-
except AssertionError as exc:
298+
except BaseException as exc:
299299
exc.args += ('ddof %s' % ddof,)
300300
raise
301301

@@ -370,7 +370,7 @@ def test_nanvar(self):
370370
allow_complex=False,
371371
allow_str=False,
372372
allow_date=False,
373-
allow_tdelta=False,
373+
allow_tdelta=True,
374374
allow_obj='convert')
375375

376376
def test_nanstd(self):
@@ -388,7 +388,8 @@ def test_nansem(self):
388388
allow_complex=False,
389389
allow_str=False,
390390
allow_date=False,
391-
allow_tdelta=False)
391+
allow_tdelta=True,
392+
allow_obj='convert')
392393

393394
def _minmax_wrap(self, value, axis=None, func=None):
394395
res = func(value, axis)
@@ -676,7 +677,7 @@ def check_bool(self, func, value, correct, *args, **kwargs):
676677
self.assertTrue(res0)
677678
else:
678679
self.assertFalse(res0)
679-
except AssertionError as exc:
680+
except BaseException as exc:
680681
exc.args += ('dim: %s' % getattr(value, 'ndim', value),)
681682
raise
682683
if not hasattr(value, 'ndim'):
@@ -713,7 +714,7 @@ def test__has_infs(self):
713714
val = getattr(self, arr)
714715
try:
715716
self.check_bool(nanops._has_infs, val, correct)
716-
except AssertionError as exc:
717+
except BaseException as exc:
717718
exc.args += (arr,)
718719
raise
719720

@@ -723,7 +724,7 @@ def test__has_infs(self):
723724
self.check_bool(nanops._has_infs, val, correct)
724725
self.check_bool(nanops._has_infs, val.astype('f4'), correct)
725726
self.check_bool(nanops._has_infs, val.astype('f2'), correct)
726-
except AssertionError as exc:
727+
except BaseException as exc:
727728
exc.args += (arr,)
728729
raise
729730

@@ -756,7 +757,7 @@ def test__isfinite(self):
756757
val = getattr(self, arr)
757758
try:
758759
self.check_bool(func1, val, correct)
759-
except AssertionError as exc:
760+
except BaseException as exc:
760761
exc.args += (arr,)
761762
raise
762763

@@ -766,7 +767,7 @@ def test__isfinite(self):
766767
self.check_bool(func1, val, correct)
767768
self.check_bool(func1, val.astype('f4'), correct)
768769
self.check_bool(func1, val.astype('f2'), correct)
769-
except AssertionError as exc:
770+
except BaseException as exc:
770771
exc.args += (arr,)
771772
raise
772773

@@ -830,6 +831,121 @@ def test_non_convertable_values(self):
830831
lambda: nanops._ensure_numeric([]))
831832

832833

834+
class TestNanvarFixedValues(tm.TestCase):
835+
836+
def setUp(self):
837+
# Samples from a normal distribution.
838+
self.variance = variance = 3.0
839+
self.samples = self.prng.normal(scale=variance ** 0.5, size=100000)
840+
841+
def test_nanvar_all_finite(self):
842+
samples = self.samples
843+
actual_variance = nanops.nanvar(samples)
844+
np.testing.assert_almost_equal(
845+
actual_variance, self.variance, decimal=2)
846+
847+
def test_nanvar_nans(self):
848+
samples = np.nan * np.ones(2 * self.samples.shape[0])
849+
samples[::2] = self.samples
850+
851+
actual_variance = nanops.nanvar(samples, skipna=True)
852+
np.testing.assert_almost_equal(
853+
actual_variance, self.variance, decimal=2)
854+
855+
actual_variance = nanops.nanvar(samples, skipna=False)
856+
np.testing.assert_almost_equal(
857+
actual_variance, np.nan, decimal=2)
858+
859+
def test_nanstd_nans(self):
860+
samples = np.nan * np.ones(2 * self.samples.shape[0])
861+
samples[::2] = self.samples
862+
863+
actual_std = nanops.nanstd(samples, skipna=True)
864+
np.testing.assert_almost_equal(
865+
actual_std, self.variance ** 0.5, decimal=2)
866+
867+
actual_std = nanops.nanvar(samples, skipna=False)
868+
np.testing.assert_almost_equal(
869+
actual_std, np.nan, decimal=2)
870+
871+
def test_nanvar_axis(self):
872+
# Generate some sample data.
873+
samples_norm = self.samples
874+
samples_unif = self.prng.uniform(size=samples_norm.shape[0])
875+
samples = np.vstack([samples_norm, samples_unif])
876+
877+
actual_variance = nanops.nanvar(samples, axis=1)
878+
np.testing.assert_array_almost_equal(
879+
actual_variance, np.array([self.variance, 1.0 / 12]), decimal=2)
880+
881+
def test_nanvar_ddof(self):
882+
n = 5
883+
samples = self.prng.uniform(size=(10000, n+1))
884+
samples[:, -1] = np.nan # Force use of our own algorithm.
885+
886+
variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
887+
variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
888+
variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
889+
890+
# The unbiased estimate.
891+
var = 1.0 / 12
892+
np.testing.assert_almost_equal(variance_1, var, decimal=2)
893+
# The underestimated variance.
894+
np.testing.assert_almost_equal(
895+
variance_0, (n - 1.0) / n * var, decimal=2)
896+
# The overestimated variance.
897+
np.testing.assert_almost_equal(
898+
variance_2, (n - 1.0) / (n - 2.0) * var, decimal=2)
899+
900+
def test_ground_truth(self):
901+
# Test against values that were precomputed with Numpy.
902+
samples = np.empty((4, 4))
903+
samples[:3, :3] = np.array([[0.97303362, 0.21869576, 0.55560287],
904+
[0.72980153, 0.03109364, 0.99155171],
905+
[0.09317602, 0.60078248, 0.15871292]])
906+
samples[3] = samples[:, 3] = np.nan
907+
908+
# Actual variances along axis=0, 1 for ddof=0, 1, 2
909+
variance = np.array(
910+
[[[0.13762259, 0.05619224, 0.11568816],
911+
[0.20643388, 0.08428837, 0.17353224],
912+
[0.41286776, 0.16857673, 0.34706449]],
913+
[[0.09519783, 0.16435395, 0.05082054],
914+
[0.14279674, 0.24653093, 0.07623082],
915+
[0.28559348, 0.49306186, 0.15246163]]]
916+
)
917+
918+
# Test nanvar.
919+
for axis in range(2):
920+
for ddof in range(3):
921+
var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
922+
np.testing.assert_array_almost_equal(
923+
var[:3], variance[axis, ddof]
924+
)
925+
np.testing.assert_equal(var[3], np.nan)
926+
927+
# Test nanstd.
928+
for axis in range(2):
929+
for ddof in range(3):
930+
std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
931+
np.testing.assert_array_almost_equal(
932+
std[:3], variance[axis, ddof] ** 0.5
933+
)
934+
np.testing.assert_equal(std[3], np.nan)
935+
936+
def test_nanstd_roundoff(self):
937+
# Regression test for GH 10242 (test data taken from GH 10489). Ensure
938+
# that variance is stable.
939+
data = Series(766897346 * np.ones(10))
940+
for ddof in range(3):
941+
result = data.std(ddof=ddof)
942+
self.assertEqual(result, 0.0)
943+
944+
@property
945+
def prng(self):
946+
return np.random.RandomState(1234)
947+
948+
833949
if __name__ == '__main__':
834950
import nose
835951
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',

pandas/tseries/tests/test_timedeltas.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -684,8 +684,8 @@ def test_timedelta_ops(self):
684684
self.assertEqual(result[0], expected)
685685

686686
# invalid ops
687-
for op in ['skew','kurt','sem','var','prod']:
688-
self.assertRaises(TypeError, lambda : getattr(td,op)())
687+
for op in ['skew','kurt','sem','prod']:
688+
self.assertRaises(TypeError, getattr(td,op))
689689

690690
# GH 10040
691691
# make sure NaT is properly handled by median()

0 commit comments

Comments
 (0)