diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 02747911d2226..84d4a28d675d5 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -42,7 +42,8 @@ def time_regular(self): pd.Categorical(self.values, self.categories) def time_fastpath(self): - pd.Categorical(self.codes, self.cat_idx, fastpath=True) + dtype = pd.CategoricalDtype(categories=self.cat_idx) + pd.Categorical._simple_new(self.codes, dtype) def time_datetimes(self): pd.Categorical(self.datetimes) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 743bb78c70c36..3de94697511d1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -167,12 +167,12 @@ Deprecations - Deprecated 'method', 'limit', and 'fill_axis' keywords in :meth:`DataFrame.align` and :meth:`Series.align`, explicitly call ``fillna`` on the alignment results instead (:issue:`51856`) - Deprecated 'broadcast_axis' keyword in :meth:`Series.align` and :meth:`DataFrame.align`, upcast before calling ``align`` with ``left = DataFrame({col: left for col in right.columns}, index=right.index)`` (:issue:`51856`) - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying :class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) +- Deprecated the 'fastpath' keyword in the :class:`Categorical` constructor; use :meth:`Categorical.from_codes` instead (:issue:`20110`) - Deprecated passing a dictionary to :meth:`.SeriesGroupBy.agg`; pass a list of aggregations instead (:issue:`50684`) - Deprecated logical operations (``|``, ``&``, ``^``) between 
pandas objects and dtype-less sequences (e.g. ``list``, ``tuple``), wrap a sequence in a :class:`Series` or numpy array before operating instead (:issue:`51521`) - Deprecated the methods :meth:`Series.bool` and :meth:`DataFrame.bool` (:issue:`51749`) - Deprecated :meth:`DataFrame.swapaxes` and :meth:`Series.swapaxes`, use :meth:`DataFrame.transpose` or :meth:`Series.transpose` instead (:issue:`51946`) - Deprecated parameter ``convert_type`` in :meth:`Series.apply` (:issue:`52140`) -- .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 18dde1a3d22a4..f41327a4a16ca 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -355,15 +355,38 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi _dtype: CategoricalDtype + @classmethod + # error: Argument 2 of "_simple_new" is incompatible with supertype + # "NDArrayBacked"; supertype defines the argument type as + # "Union[dtype[Any], ExtensionDtype]" + def _simple_new( # type: ignore[override] + cls, codes: np.ndarray, dtype: CategoricalDtype + ) -> Self: + # NB: This is not _quite_ as simple as the "usual" _simple_new + codes = coerce_indexer_dtype(codes, dtype.categories) + dtype = CategoricalDtype(ordered=False).update_dtype(dtype) + return super()._simple_new(codes, dtype) + def __init__( self, values, categories=None, ordered=None, dtype: Dtype | None = None, - fastpath: bool = False, + fastpath: bool | lib.NoDefault = lib.no_default, copy: bool = True, ) -> None: + if fastpath is not lib.no_default: + # GH#20110 + warnings.warn( + "The 'fastpath' keyword in Categorical is deprecated and will " + "be removed in a future version. 
Use Categorical.from_codes instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + fastpath = False + dtype = CategoricalDtype._from_values_or_dtype( values, categories, ordered, dtype ) @@ -626,7 +649,7 @@ def _from_inferred_categories( dtype = CategoricalDtype(cats, ordered=False) codes = inferred_codes - return cls(codes, dtype=dtype, fastpath=True) + return cls._simple_new(codes, dtype=dtype) @classmethod def from_codes( @@ -693,7 +716,7 @@ def from_codes( if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and len(categories)-1") - return cls(codes, dtype=dtype, fastpath=True) + return cls._simple_new(codes, dtype=dtype) # ------------------------------------------------------------------ # Categories/Codes/Ordered @@ -805,7 +828,7 @@ def _set_dtype(self, dtype: CategoricalDtype) -> Self: a (valid) instance of `CategoricalDtype`. """ codes = recode_for_categories(self.codes, self.categories, dtype.categories) - return type(self)(codes, dtype=dtype, fastpath=True) + return type(self)._simple_new(codes, dtype=dtype) def set_ordered(self, value: bool) -> Self: """ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 087b1488010bf..b55c8cd31c110 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -20,6 +20,7 @@ ) from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.dtypes import ( + CategoricalDtype, DatetimeTZDtype, ExtensionDtype, ) @@ -323,7 +324,8 @@ def _maybe_unwrap(x): if ignore_order: ordered = False - return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) + dtype = CategoricalDtype(categories=categories, ordered=ordered) + return Categorical._simple_new(new_codes, dtype=dtype) def _concatenate_2d(to_concat: Sequence[np.ndarray], axis: AxisInt) -> np.ndarray: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 
ac52e43472cbe..e28c587f00e3e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -260,7 +260,7 @@ def _from_values_or_dtype( CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object) >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True) >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False) - >>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True) + >>> c = pd.Categorical([0, 1], dtype=dtype1) >>> pd.CategoricalDtype._from_values_or_dtype( ... c, ['x', 'y'], ordered=True, dtype=dtype2 ... ) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 20248cd69bfb9..6ab98cf4fe55e 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -63,7 +63,7 @@ def recode_for_groupby( # return a new categorical that maps our new codes # and categories dtype = CategoricalDtype(categories, ordered=c.ordered) - return Categorical(codes, dtype=dtype, fastpath=True), c + return Categorical._simple_new(codes, dtype=dtype), c # Already sorted according to c.categories; all is fine if sort: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 15490ad853d84..88dbee0808533 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -16,6 +16,7 @@ from pandas._libs.hashing import hash_object_array from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -203,7 +204,10 @@ def hash_tuples( # create a list-of-Categoricals cat_vals = [ - Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True) + Categorical._simple_new( + mi.codes[level], + CategoricalDtype(categories=mi.levels[level], ordered=False), + ) for level in range(mi.nlevels) ] @@ -296,7 +300,8 @@ def _hash_ndarray( ) codes, categories = factorize(vals, sort=False) - cat = Categorical(codes, Index(categories), 
ordered=False, fastpath=True) + dtype = CategoricalDtype(categories=Index(categories), ordered=False) + cat = Categorical._simple_new(codes, dtype) return cat._hash_pandas_object( encoding=encoding, hash_key=hash_key, categorize=False ) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6cb0e31eb0a5d..46ff46fe84dd1 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -32,6 +32,13 @@ class TestCategoricalConstructors: + def test_fastpath_deprecated(self): + codes = np.array([1, 2, 3]) + dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False) + msg = "The 'fastpath' keyword in Categorical is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + Categorical(codes, dtype=dtype, fastpath=True) + def test_categorical_from_cat_and_dtype_str_preserve_ordered(self): # GH#49309 we should preserve orderedness in `res` cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 95748f619172b..d7830248cb73c 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -31,7 +31,7 @@ def test_na_flags_int_categories(self): labels = np.random.randint(0, 10, 20) labels[::5] = -1 - cat = Categorical(labels, categories, fastpath=True) + cat = Categorical(labels, categories) repr(cat) tm.assert_numpy_array_equal(isna(cat), labels == -1) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d31489aba5a6f..ffc44b30a3870 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -2,6 +2,7 @@ from pandas import ( Categorical, + CategoricalDtype, CategoricalIndex, Series, date_range, @@ -24,7 +25,9 @@ def test_print(self, 
factor): class TestCategoricalRepr: def test_big_print(self): - factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True) + codes = np.array([0, 1, 2, 0, 1, 2] * 100) + dtype = CategoricalDtype(categories=["a", "b", "c"]) + factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", "Length: 600", diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 768a1551a8d58..2d353bab3ebe8 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -123,7 +123,7 @@ def test_constructor_invalid(self): dtype1 = CategoricalDtype(["a", "b"], ordered=True) dtype2 = CategoricalDtype(["x", "y"], ordered=False) - c = Categorical([0, 1], dtype=dtype1, fastpath=True) + c = Categorical([0, 1], dtype=dtype1) @pytest.mark.parametrize( "values, categories, ordered, dtype, expected", diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index d3cc3239da482..a09ab1f8657d2 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -385,7 +385,7 @@ def test_constructor_map(self): tm.assert_series_equal(result, exp) def test_constructor_categorical(self): - cat = Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True) + cat = Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"]) res = Series(cat) tm.assert_categorical_equal(res.values, cat)