BUG: groupby with dropna=False drops nulls from categorical groupers #49652


Merged: 22 commits, Dec 2, 2022

Commits:
6ae5ef9  BUG: groupby(..., dropna=False) drops null values with categorical gr… (rhshadrach, Oct 5, 2022)
20e17ab  Use intp (rhshadrach, Nov 12, 2022)
29ee263  Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 12, 2022)
8254619  Fixups (rhshadrach, Nov 13, 2022)
f679fd7  Use intp (rhshadrach, Nov 13, 2022)
9a3fb30  Merge branch 'main' into groupby_dropna_filtering (rhshadrach, Nov 15, 2022)
34760ca  int64 (rhshadrach, Nov 16, 2022)
93f306c  dtype fix (rhshadrach, Nov 19, 2022)
d6100b4  Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 19, 2022)
f3a3ebb  Breakup op to debug on CI (rhshadrach, Nov 20, 2022)
4bfeaa1  Trying with intp (rhshadrach, Nov 20, 2022)
af9d90c  Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Nov 20, 2022)
45f3947  Merge branch 'main' into groupby_dropna_filtering (rhshadrach, Nov 28, 2022)
4d72402  Restore cache decorator (rhshadrach, Nov 29, 2022)
6f2f51d  Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 29, 2022)
9ddc2d0  Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Nov 29, 2022)
1e3bff3  Add bincount comment (rhshadrach, Nov 29, 2022)
5a42eeb  Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Nov 29, 2022)
c8ba7ad  Rework recoding logic (rhshadrach, Nov 30, 2022)
a548506  Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Nov 30, 2022)
9bec396  Merge branch 'main' of https://github.com/pandas-dev/pandas into grou… (rhshadrach, Dec 2, 2022)
867e97a  Merge branch 'groupby_dropna_filtering' of https://github.com/rhshadr… (rhshadrach, Dec 2, 2022)

doc/source/whatsnew/v2.0.0.rst (1 addition, 0 deletions)

@@ -751,6 +751,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.apply` and :class:`SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`)
- Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`)
- Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`)
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)

Reshaping
^^^^^^^^^
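
For context, a minimal reproduction of the bug this PR fixes (GH#36327); this is a sketch, and exact output formatting may vary by version:

>>> import pandas as pd
>>> df = pd.DataFrame({"key": pd.Categorical(["a", "a", None]), "val": [1, 2, 3]})
>>> df.groupby("key", dropna=False)["val"].sum()
key
a      3
NaN    3
Name: val, dtype: int64

Before this change, the NaN group was dropped from the result even though dropna=False was passed, because the categorical codes for nulls (-1) were never remapped to a real group.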

pandas/core/algorithms.py (23 additions, 0 deletions)

@@ -406,6 +406,29 @@ def unique(values):
return unique_with_mask(values)


def nunique_ints(values: ArrayLike) -> int:
"""
Return the number of unique values for integer array-likes.

Significantly faster than pandas.unique for long enough sequences.
No checks are done to ensure input is integral.

Parameters
----------
values : 1d array-like

Returns
-------
int : The number of unique values in ``values``
"""
if len(values) == 0:
return 0
values = _ensure_data(values)
# bincount requires intp
result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
return result


def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
"""See algorithms.unique for docs. Takes a mask for masked arrays."""
values = _ensure_arraylike(values)

pandas/core/groupby/groupby.py (17 additions, 8 deletions)

@@ -42,6 +42,7 @@ class providing the base-class of operations.
Timestamp,
lib,
)
from pandas._libs.algos import rank_1d
import pandas._libs.groupby as libgroupby
from pandas._typing import (
AnyArrayLike,
@@ -2268,12 +2269,15 @@ def size(self) -> DataFrame | Series:
else:
result = self._obj_1d_constructor(result)

with com.temp_setattr(self, "as_index", True):
# size already has the desired behavior in GH#49519, but this makes the
# as_index=False path of _reindex_output fail on categorical groupers.
result = self._reindex_output(result, fill_value=0)
if not self.as_index:
# error: Incompatible types in assignment (expression has
# type "DataFrame", variable has type "Series")
result = result.rename("size").reset_index() # type: ignore[assignment]

return self._reindex_output(result, fill_value=0)
return result

@final
@doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
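
The temp_setattr block above exists because the as_index=False path of _reindex_output cannot handle categorical groupers; size reindexes as if as_index=True and resets the index afterwards. A hedged sketch of the resulting behavior (exact formatting may differ):

>>> import pandas as pd
>>> df = pd.DataFrame(
...     {"key": pd.Categorical(["a", "a", None], categories=["a", "b"]), "val": [1, 2, 3]}
... )
>>> df.groupby("key", dropna=False, observed=False).size()
key
a      2
b      0
NaN    1
dtype: int64

The unobserved category "b" is filled with 0 by _reindex_output, while the null group is kept because dropna=False.
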
@@ -3269,6 +3273,10 @@ def ngroup(self, ascending: bool = True):
else:
dtype = np.int64

if any(ping._passed_categorical for ping in self.grouper.groupings):
# comp_ids reflect non-observed groups; we need only observed
comp_ids = rank_1d(comp_ids, ties_method="dense") - 1

result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
if not ascending:
result = self.ngroups - 1 - result
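
rank_1d is internal; an equivalent formulation with public API shows what the dense rank does to comp_ids containing gaps left by unobserved categories (values are illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> comp_ids = np.array([0, 3, 3, 5])  # group ids with gaps
>>> (pd.Series(comp_ids).rank(method="dense") - 1).to_numpy()
array([0., 1., 1., 2.])

The result is then wrapped in a Series with the dtype chosen above.
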
@@ -3950,7 +3958,7 @@ def _reindex_output(
names = names + [None]
index = MultiIndex.from_product(levels_list, names=names)
if self.sort:
index = index.sortlevel()[0]
index = index.sort_values()

if self.as_index:
# Always holds for SeriesGroupBy unless GH#36507 is implemented
@@ -3972,12 +3980,12 @@
# reindex `output`, and then reset the in-axis grouper columns.

# Select in-axis groupers
in_axis_grps = (
in_axis_grps = list(
(i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
)
g_nums, g_names = zip(*in_axis_grps)

output = output.drop(labels=list(g_names), axis=1)
if len(in_axis_grps) > 0:
g_nums, g_names = zip(*in_axis_grps)
output = output.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
output = output.set_index(self.grouper.result_index).reindex(
@@ -3986,7 +3994,8 @@

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
output = output.reset_index(level=g_nums)
if len(in_axis_grps) > 0:
output = output.reset_index(level=g_nums)

return output.reset_index(drop=True)


pandas/core/groupby/grouper.py (43 additions, 4 deletions)

@@ -612,6 +612,9 @@ def group_arraylike(self) -> ArrayLike:
# retain dtype for categories, including unobserved ones
return self.result_index._values

elif self._passed_categorical:
return self.group_index._values

return self._codes_and_uniques[1]

@cache_readonly
@@ -621,14 +624,31 @@ def result_index(self) -> Index:
if self._all_grouper is not None:
group_idx = self.group_index
assert isinstance(group_idx, CategoricalIndex)
categories = self._all_grouper.categories
cats = self._orig_cats
# set_categories is dynamically added
return group_idx.set_categories(categories) # type: ignore[attr-defined]
return group_idx.set_categories(cats) # type: ignore[attr-defined]
return self.group_index

@cache_readonly
def group_index(self) -> Index:
uniques = self._codes_and_uniques[1]
codes, uniques = self._codes_and_uniques
if not self._dropna and self._passed_categorical:
assert isinstance(uniques, Categorical)
if self._sort and (codes == len(uniques)).any():
# Add NA value at the end when sorting
uniques = Categorical.from_codes(
np.append(uniques.codes, [-1]), uniques.categories
)
else:
# Need to determine proper placement of NA value when not sorting
cat = self.grouping_vector
na_idx = (cat.codes < 0).argmax()
if cat.codes[na_idx] < 0:
# count the number of unique codes that come before the NA value
na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
uniques = Categorical.from_codes(
np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
)
return Index._with_infer(uniques, name=self.name)

@cache_readonly
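
Hedged illustration of the sorted branch in group_index above: appending a -1 code to a Categorical is how the NA label is materialized at the end of the index, since Categorical.from_codes treats -1 as NaN:

>>> import numpy as np
>>> import pandas as pd
>>> uniques = pd.Categorical.from_codes([0, 1], ["a", "b"])
>>> pd.Categorical.from_codes(np.append(uniques.codes, [-1]), uniques.categories)
['a', 'b', NaN]
Categories (2, object): ['a', 'b']
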
@@ -651,9 +671,28 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques = Categorical.from_codes(
codes=ucodes, categories=categories, ordered=cat.ordered
)

codes = cat.codes
if not self._dropna:
na_mask = codes < 0
if np.any(na_mask):
if self._sort:
# Replace NA codes with `largest code + 1`
na_code = len(categories)
codes = np.where(na_mask, na_code, codes)
else:
# Insert NA code into the codes based on first appearance
# A negative code must exist, no need to check codes[na_idx] < 0
na_idx = na_mask.argmax()
# count the number of unique codes that come before the NA value
na_code = algorithms.nunique_ints(codes[:na_idx])
codes = np.where(codes >= na_code, codes + 1, codes)
codes = np.where(na_mask, na_code, codes)

if not self._observed:
uniques = uniques.reorder_categories(self._orig_cats)
return cat.codes, uniques

return codes, uniques

elif isinstance(self.grouping_vector, ops.BaseGrouper):
# we have a list of groupers
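
The unsorted branch of _codes_and_uniques can be traced on plain numpy arrays; this sketch mirrors the logic above, using len(np.unique(...)) in place of the faster algorithms.nunique_ints:

import numpy as np

codes = np.array([1, 0, -1, 2, 0])        # -1 marks NA; first NA at position 2
na_mask = codes < 0
na_idx = na_mask.argmax()                 # 2
na_code = len(np.unique(codes[:na_idx]))  # 2 distinct codes appear before the NA
codes = np.where(codes >= na_code, codes + 1, codes)  # shift codes to make room
codes = np.where(na_mask, na_code, codes)
print(codes)                              # [1 0 2 3 0]

The NA group receives code 2 and therefore sorts after the two groups that appeared before it, which matches the np.insert of -1 into uniques.codes in group_index.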

pandas/tests/groupby/test_categorical.py (2 additions, 1 deletion)

@@ -831,6 +831,7 @@ def test_preserve_categories():
df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
# GH#48749 - don't change order of categories
# GH#42482 - don't sort result when sort=False, even when ordered=True
nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
tm.assert_index_equal(
df.groupby("A", sort=True, observed=False).first().index, sort_index
@@ -1218,7 +1219,7 @@ def test_seriesgroupby_observed_true(df_cat, operation):
lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
index = MultiIndex.from_arrays([lev_a, lev_b])
expected = Series(data=[2, 4, 1, 3], index=index, name="C")
expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()

grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
result = getattr(grouped, operation)(sum)