Skip to content

BUG: pivot_table mean of integer input casted back to int #54263

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 2, 2023
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,7 @@ Other
- Bug in :func:`api.interchange.from_dataframe` when converting an empty DataFrame object (:issue:`53155`)
- Bug in :func:`assert_almost_equal` now throwing assertion error for two unequal sets (:issue:`51727`)
- Bug in :func:`assert_frame_equal` checks category dtypes even when asked not to check index type (:issue:`52126`)
- Bug in :meth:`DataFrame.pivot_table` with casting the mean of ints back to an int (:issue:`16676`)
- Bug in :meth:`DataFrame.reindex` with a ``fill_value`` that should be inferred with a :class:`ExtensionDtype` incorrectly inferring ``object`` dtype (:issue:`52586`)
- Bug in :meth:`DataFrame.shift` and :meth:`Series.shift` and :meth:`DataFrameGroupBy.shift` when passing both "freq" and "fill_value" silently ignoring "fill_value" instead of raising ``ValueError`` (:issue:`53832`)
- Bug in :meth:`DataFrame.shift` with ``axis=1`` on a :class:`DataFrame` with a single :class:`ExtensionDtype` column giving incorrect results (:issue:`53832`)
Expand All @@ -650,7 +651,6 @@ Other
- Bug in :meth:`Series.memory_usage` with ``deep=True`` throwing an error for a Series of objects and returning an incorrect value, as it did not take GC corrections into account (:issue:`51858`)
- Bug in :meth:`period_range` where the default behavior when ``freq`` was not passed as an argument was incorrect (:issue:`53687`)
- Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`)
-

.. ***DO NOT USE THIS SECTION***

Expand Down
23 changes: 0 additions & 23 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
is_integer_dtype,
is_list_like,
is_nested_list_like,
is_scalar,
Expand Down Expand Up @@ -172,28 +171,6 @@ def __internal_pivot_table(
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
agged = agged.dropna(how="all")

# gh-21133
# we want to down cast if
# the original values are ints
# as we grouped with a NaN value
# and then dropped, coercing to floats
for v in values:
if (
v in data
and is_integer_dtype(data[v])
and v in agged
and not is_integer_dtype(agged[v])
):
if not isinstance(agged[v], ABCDataFrame) and isinstance(
data[v].dtype, np.dtype
):
# exclude DataFrame case bc maybe_downcast_to_dtype expects
# ArrayLike
# e.g. test_pivot_table_multiindex_columns_doctest_case
# agged.columns is a MultiIndex and 'v' is indexing only
# on its first level.
agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)

table = agged

# GH17038, this check should only happen if index is defined (not None)
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/frame/methods/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,7 @@ def test_drop_multiindex_not_lexsorted(self):
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns._is_lexsorted()

# compare the results
tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

expected = lexsorted_df.drop("a", axis=1)
expected = lexsorted_df.drop("a", axis=1).astype(float)
with tm.assert_produces_warning(PerformanceWarning):
result = not_lexsorted_df.drop("a", axis=1)

Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1789,9 +1789,6 @@ def test_groupby_multiindex_not_lexsorted():
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns._is_lexsorted()

# compare the results
tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

expected = lexsorted_df.groupby("a").mean()
with tm.assert_produces_warning(PerformanceWarning):
result = not_lexsorted_df.groupby("a").mean()
Expand Down
71 changes: 40 additions & 31 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def test_pivot_table_categorical(self):
result = pivot_table(df, values="values", index=["A", "B"], dropna=True)

exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index)
tm.assert_frame_equal(result, expected)

def test_pivot_table_dropna_categoricals(self, dropna):
Expand All @@ -225,7 +225,7 @@ def test_pivot_table_dropna_categoricals(self, dropna):
expected_columns = expected_columns.astype(CDT(categories, ordered=False))
expected_index = Series([1, 2, 3], name="B")
expected = DataFrame(
[[0, 3, 6], [1, 4, 7], [2, 5, 8]],
[[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]],
index=expected_index,
columns=expected_columns,
)
Expand Down Expand Up @@ -283,7 +283,7 @@ def test_pivot_with_non_observable_dropna_multi_cat(self, dropna):

result = df.pivot_table(index="A", values="B", dropna=dropna)
expected = DataFrame(
{"B": [2, 3, 0]},
{"B": [2.0, 3.0, 0.0]},
index=Index(
Categorical.from_codes(
[0, 1, 2], categories=["low", "high", "left"], ordered=True
Expand All @@ -300,7 +300,9 @@ def test_pivot_with_interval_index(self, interval_values, dropna):
# GH 25814
df = DataFrame({"A": interval_values, "B": 1})
result = df.pivot_table(index="A", values="B", dropna=dropna)
expected = DataFrame({"B": 1}, index=Index(interval_values.unique(), name="A"))
expected = DataFrame(
{"B": 1.0}, index=Index(interval_values.unique(), name="A")
)
if not dropna:
expected = expected.astype(float)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -444,7 +446,7 @@ def test_pivot_no_values(self):
index=Grouper(freq="A"), columns=Grouper(key="dt", freq="M")
)
exp = DataFrame(
[3], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns
[3.0], index=pd.DatetimeIndex(["2011-12-31"], freq="A"), columns=exp_columns
)
tm.assert_frame_equal(res, exp)

Expand Down Expand Up @@ -1059,7 +1061,7 @@ def test_pivot_table_multiindex_only(self, cols):

result = df2.pivot_table(values="v", columns=cols)
expected = DataFrame(
[[4, 5, 6]],
[[4.0, 5.0, 6.0]],
columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols),
index=Index(["v"]),
)
Expand Down Expand Up @@ -1558,7 +1560,9 @@ def test_pivot_datetime_tz(self):
exp_col1 = Index(["value1", "value1"])
exp_col2 = Index(["a", "b"], name="label")
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
expected = DataFrame([[0, 3], [1, 4], [2, 5]], index=exp_idx, columns=exp_col)
expected = DataFrame(
[[0.0, 3.0], [1.0, 4.0], [2.0, 5.0]], index=exp_idx, columns=exp_col
)
result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"])
tm.assert_frame_equal(result, expected)

Expand All @@ -1570,18 +1574,35 @@ def test_pivot_datetime_tz(self):
name="dt2",
)
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
expected = DataFrame(
expected1 = DataFrame(
np.array(
[
[0, 3, 1, 2, 0, 3, 1, 2],
[1, 4, 2, 1, 1, 4, 2, 1],
[2, 5, 1, 2, 2, 5, 1, 2],
[
0,
3,
1,
2,
],
[1, 4, 2, 1],
[2, 5, 1, 2],
],
dtype="int64",
),
index=exp_idx,
columns=exp_col,
columns=exp_col[:4],
)
expected2 = DataFrame(
np.array(
[
[0.0, 3.0, 1.0, 2.0],
[1.0, 4.0, 2.0, 1.0],
[2.0, 5.0, 1.0, 2.0],
],
),
index=exp_idx,
columns=exp_col[4:],
)
expected = concat([expected1, expected2], axis=1)

result = pivot_table(
df,
Expand Down Expand Up @@ -1628,7 +1649,7 @@ def test_pivot_dtaccessor(self):

exp_idx = Index(["a", "b"], name="label")
expected = DataFrame(
{7: [0, 3], 8: [1, 4], 9: [2, 5]},
{7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]},
index=exp_idx,
columns=Index([7, 8, 9], dtype=np.int32, name="dt1"),
)
Expand All @@ -1639,7 +1660,7 @@ def test_pivot_dtaccessor(self):
)

expected = DataFrame(
{7: [0, 3], 8: [1, 4], 9: [2, 5]},
{7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]},
index=Index([1, 2], dtype=np.int32, name="dt2"),
columns=Index([7, 8, 9], dtype=np.int32, name="dt1"),
)
Expand All @@ -1660,7 +1681,7 @@ def test_pivot_dtaccessor(self):
names=["dt1", "dt2"],
)
expected = DataFrame(
np.array([[0, 3, 1, 4, 2, 5]], dtype="int64"),
np.array([[0.0, 3.0, 1.0, 4.0, 2.0, 5.0]]),
index=Index([2013], dtype=np.int32),
columns=exp_col,
)
Expand Down Expand Up @@ -1764,13 +1785,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self):
expected = DataFrame(table.values, index=ix, columns=cols)
tm.assert_frame_equal(table, expected)

def test_categorical_margins(self, observed, request):
if observed:
request.node.add_marker(
pytest.mark.xfail(
reason="GH#17035 (np.mean of ints is casted back to ints)"
)
)
def test_categorical_margins(self, observed):
# GH 10989
df = DataFrame(
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
Expand All @@ -1783,13 +1798,7 @@ def test_categorical_margins(self, observed, request):
table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
tm.assert_frame_equal(table, expected)

def test_categorical_margins_category(self, observed, request):
if observed:
request.node.add_marker(
pytest.mark.xfail(
reason="GH#17035 (np.mean of ints is casted back to ints)"
)
)
def test_categorical_margins_category(self, observed):
df = DataFrame(
{"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
)
Expand All @@ -1816,7 +1825,7 @@ def test_margins_casted_to_float(self):

result = pivot_table(df, index="D", margins=True)
expected = DataFrame(
{"A": [3, 7, 5], "B": [2.5, 6.5, 4.5], "C": [2, 5, 3.5]},
{"A": [3.0, 7.0, 5], "B": [2.5, 6.5, 4.5], "C": [2.0, 5.0, 3.5]},
index=Index(["X", "Y", "All"], name="D"),
)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -2249,7 +2258,7 @@ def test_pivot_table_sort_false_with_multiple_values(self):
index=["lastname", "firstname"], values=["height", "age"], sort=False
)
expected = DataFrame(
[[173, 47], [182, 33]],
[[173.0, 47.0], [182.0, 33.0]],
columns=["height", "age"],
index=MultiIndex.from_tuples(
[("Foo", "John"), ("Bar", "Michael")],
Expand Down