fix na_value handling for categorical case + update tests for expected categorical behaviour

jorisvandenbossche · jorisvandenbossche · commit adf2b9953e03 · 2024-09-06T19:41:19.000+02:00
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2678,8 +2678,12 @@ def _str_map(
         codes = self.codes
         if categories.dtype == "string":
             result = categories.array._str_map(f, na_value, dtype)
-            if categories.dtype.na_value is np.nan:
-                # NaN propagates as False
+            if (
+                categories.dtype.na_value is np.nan
+                and is_bool_dtype(dtype)
+                and (na_value is lib.no_default or isna(na_value))
+            ):
+                # NaN propagates as False for functions with boolean return type
                 na_value = False
         else:
             from pandas.core.arrays import NumpyExtensionArray
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
@@ -122,6 +122,7 @@ def test_api_per_method(
     any_allowed_skipna_inferred_dtype,
     any_string_method,
     request,
+    using_infer_string,
 ):
     # this test does not check correctness of the different methods,
     # just that the methods work on the specified (inferred) dtypes,
@@ -160,6 +161,10 @@ def test_api_per_method(
     t = box(values, dtype=dtype)  # explicit dtype to avoid casting
     method = getattr(t.str, method_name)
 
+    if using_infer_string and dtype == "category":
+        string_allowed = method_name not in ["decode"]
+    else:
+        string_allowed = True
     bytes_allowed = method_name in ["decode", "get", "len", "slice"]
     # as of v0.23.4, all methods except 'cat' are very lenient with the
     # allowed data types, just returning NaN for entries that error.
@@ -168,7 +173,8 @@ def test_api_per_method(
     mixed_allowed = method_name not in ["cat"]
 
     allowed_types = (
-        ["string", "unicode", "empty"]
+        ["empty"]
+        + ["string", "unicode"] * string_allowed
         + ["bytes"] * bytes_allowed
         + ["mixed", "mixed-integer"] * mixed_allowed
     )
@@ -182,6 +188,7 @@ def test_api_per_method(
         msg = (
             f"Cannot use .str.{method_name} with values of "
             f"inferred dtype {inferred_dtype!r}."
+            "|a bytes-like object is required, not 'str'"
         )
         with pytest.raises(TypeError, match=msg):
             method(*args, **kwargs)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
@@ -320,7 +320,7 @@ def test_startswith_endswith_validate_na(any_string_dtype):
 @pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
-def test_startswith(pat, dtype, null_value, na):
+def test_startswith(pat, dtype, null_value, na, using_infer_string):
     # add category dtype parametrizations for GH-36241
     values = Series(
         ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
@@ -334,6 +334,8 @@ def test_startswith(pat, dtype, null_value, na):
         exp = exp.fillna(null_value)
     elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
+    elif using_infer_string and dtype == "category":
+        exp = exp.fillna(False).astype(bool)
     tm.assert_series_equal(result, exp)
 
     result = values.str.startswith(pat, na=na)
@@ -389,7 +391,7 @@ def test_startswith_string_dtype(any_string_dtype, na):
 @pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
-def test_endswith(pat, dtype, null_value, na):
+def test_endswith(pat, dtype, null_value, na, using_infer_string):
     # add category dtype parametrizations for GH-36241
     values = Series(
         ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
@@ -403,6 +405,8 @@ def test_endswith(pat, dtype, null_value, na):
         exp = exp.fillna(null_value)
     elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
+    elif using_infer_string and dtype == "category":
+        exp = exp.fillna(False).astype(bool)
     tm.assert_series_equal(result, exp)
 
     result = values.str.endswith(pat, na=na)