From 9620e00c7c64bd909a18235439ab9f289d697f2d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 26 Aug 2024 17:53:12 +0200
Subject: [PATCH 01/13] String dtype: propagate NaNs as False in predicate
 methods (e.g. .str.startswith)

---
 pandas/core/arrays/arrow/array.py         |  23 ++--
 pandas/core/arrays/string_.py             |  19 ++-
 pandas/core/arrays/string_arrow.py        |  72 +++++++----
 pandas/core/strings/accessor.py           |  15 ++-
 pandas/core/strings/object_array.py       |  19 ++-
 pandas/tests/strings/test_find_replace.py | 142 ++++++++++++++--------
 pandas/tests/strings/test_string_array.py |   4 +-
 pandas/tests/strings/test_strings.py      |  12 +-
 8 files changed, 206 insertions(+), 100 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index e95fa441e18fb..399af4076c60f 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2291,7 +2291,12 @@ def _str_count(self, pat: str, flags: int = 0) -> Self:
         return type(self)(pc.count_substring_regex(self._pa_array, pat))
 
     def _str_contains(
-        self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na=lib.no_default,
+        regex: bool = True,
     ) -> Self:
         if flags:
             raise NotImplementedError(f"contains not implemented with {flags=}")
@@ -2301,11 +2306,11 @@ def _str_contains(
         else:
             pa_contains = pc.match_substring
         result = pa_contains(self._pa_array, pat, ignore_case=not case)
-        if not isna(na):
+        if na is not lib.no_default and not isna(na):
             result = result.fill_null(na)
         return type(self)(result)
 
-    def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self:
+    def _str_startswith(self, pat: str | tuple[str, ...], na=lib.no_default) -> Self:
         if isinstance(pat, str):
             result = pc.starts_with(self._pa_array, pattern=pat)
         else:
@@ -2318,7 +2323,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self:
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
-        if not isna(na):
+        if na is not lib.no_default and not isna(na):
             result = result.fill_null(na)
         return type(self)(result)
 
@@ -2335,7 +2340,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self:
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
-        if not isna(na):
+        if na is not lib.no_default and not isna(na):
             result = result.fill_null(na)
         return type(self)(result)
 
@@ -2374,14 +2379,18 @@ def _str_repeat(self, repeats: int | Sequence[int]) -> Self:
         return type(self)(pc.binary_repeat(self._pa_array, repeats))
 
     def _str_match(
-        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self,
+        pat: str,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | None = lib.no_default,
     ) -> Self:
         if not pat.startswith("^"):
             pat = f"^{pat}"
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
-        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default
     ) -> Self:
         if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2e7f9314c4f09..e4364b1ef507f 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -351,7 +351,11 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
         return cls._from_sequence(scalars, dtype=dtype)
 
     def _str_map(
-        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+        self,
+        f,
+        na_value=lib.no_default,
+        dtype: Dtype | None = None,
+        convert: bool = True,
     ):
         if self.dtype.na_value is np.nan:
             return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype)
@@ -360,7 +364,7 @@ def _str_map(
 
         if dtype is None:
             dtype = self.dtype
-        if na_value is None:
+        if na_value is lib.no_default:
             na_value = self.dtype.na_value
 
         mask = isna(self)
@@ -429,11 +433,16 @@ def _str_map_str_or_object(
             # -> We don't know the result type. E.g. `.get` can return anything.
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
 
-    def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None):
+    def _str_map_nan_semantics(
+        self, f, na_value=lib.no_default, dtype: Dtype | None = None
+    ):
         if dtype is None:
             dtype = self.dtype
-        if na_value is None:
-            na_value = self.dtype.na_value
+        if na_value is lib.no_default:
+            if is_bool_dtype(dtype):
+                na_value = False
+            else:
+                na_value = self.dtype.na_value
 
         mask = isna(self)
         arr = np.asarray(self)
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 67114815341b6..a1164392941f3 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -221,11 +221,16 @@ def insert(self, loc: int, item) -> ArrowStringArray:
             raise TypeError("Scalar must be NA or str")
         return super().insert(loc, item)
 
-    def _result_converter(self, values, na=None):
+    def _predicate_result_converter(self, values, na=lib.no_default):
         if self.dtype.na_value is np.nan:
-            if not isna(na):
+            if na is lib.no_default:
+                na_value = False
+            elif not isna(na):
                 values = values.fill_null(bool(na))
-            return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
+                na_value = lib.no_default
+            else:
+                na_value = np.nan
+            return ArrowExtensionArray(values).to_numpy(na_value=na_value)
         return BooleanDtype().__from_arrow__(values)
 
     def _maybe_convert_setitem_value(self, value):
@@ -282,7 +287,12 @@ def astype(self, dtype, copy: bool = True):
     _str_map = BaseStringArray._str_map
 
     def _str_contains(
-        self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na=lib.no_default,
+        regex: bool = True,
     ):
         if flags:
             if get_option("mode.performance_warnings"):
@@ -293,12 +303,18 @@ def _str_contains(
             result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
         else:
             result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
-        result = self._result_converter(result, na=na)
-        if not isna(na):
+        result = self._predicate_result_converter(result, na=na)
+        if (
+            self.dtype.na_value is libmissing.NA
+            and na is not lib.no_default
+            and not isna(na)
+        ):
             result[isna(result)] = bool(na)
         return result
 
-    def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
+    def _str_startswith(
+        self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default
+    ):
         if isinstance(pat, str):
             result = pc.starts_with(self._pa_array, pattern=pat)
         else:
@@ -313,9 +329,13 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
-        if not isna(na):
+        if (
+            self.dtype.na_value is libmissing.NA
+            and na is not lib.no_default
+            and not isna(na)
+        ):
             result = result.fill_null(na)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
         if isinstance(pat, str):
@@ -332,9 +352,13 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
-        if not isna(na):
+        if (
+            self.dtype.na_value is libmissing.NA
+            and na is not lib.no_default
+            and not isna(na)
+        ):
             result = result.fill_null(na)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_replace(
         self,
@@ -361,14 +385,18 @@ def _str_repeat(self, repeats: int | Sequence[int]):
             return type(self)(pc.binary_repeat(self._pa_array, repeats))
 
     def _str_match(
-        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self,
+        pat: str,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | None = lib.no_default,
     ):
         if not pat.startswith("^"):
             pat = f"^{pat}"
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
-        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default
     ):
         if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"
@@ -389,39 +417,39 @@ def _str_slice(
 
     def _str_isalnum(self):
         result = pc.utf8_is_alnum(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_isalpha(self):
         result = pc.utf8_is_alpha(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_isdecimal(self):
         result = pc.utf8_is_decimal(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_isdigit(self):
         result = pc.utf8_is_digit(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_islower(self):
         result = pc.utf8_is_lower(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_isnumeric(self):
         result = pc.utf8_is_numeric(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_isspace(self):
         result = pc.utf8_is_space(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_istitle(self):
         result = pc.utf8_is_title(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_isupper(self):
         result = pc.utf8_is_upper(self._pa_array)
-        return self._result_converter(result)
+        return self._predicate_result_converter(result)
 
     def _str_len(self):
         result = pc.utf8_length(self._pa_array)
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index c88270b2a2f16..cc9821026fcda 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -1222,7 +1222,12 @@ def join(self, sep: str):
 
     @forbid_nonstring_types(["bytes"])
     def contains(
-        self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na=lib.no_default,
+        regex: bool = True,
     ):
         r"""
         Test if pattern or regex is contained within a string of a Series or Index.
@@ -1359,7 +1364,7 @@ def contains(
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
-    def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
+    def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
         """
         Determine if each string starts with a match of a regular expression.
 
@@ -1403,7 +1408,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
-    def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
+    def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
         """
         Determine if each string entirely matches a regular expression.
 
@@ -2581,7 +2586,7 @@ def count(self, pat, flags: int = 0):
 
     @forbid_nonstring_types(["bytes"])
     def startswith(
-        self, pat: str | tuple[str, ...], na: Scalar | None = None
+        self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default
     ) -> Series | Index:
         """
         Test if the start of each string element matches a pattern.
@@ -2651,7 +2656,7 @@ def startswith(
 
     @forbid_nonstring_types(["bytes"])
     def endswith(
-        self, pat: str | tuple[str, ...], na: Scalar | None = None
+        self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default
     ) -> Series | Index:
         """
         Test if the end of each string element matches a pattern.
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index 100afa956bd24..c150a652d19c3 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -42,7 +42,11 @@ def __len__(self) -> int:
         raise NotImplementedError
 
     def _str_map(
-        self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
+        self,
+        f,
+        na_value=lib.no_default,
+        dtype: NpDtype | None = None,
+        convert: bool = True,
     ):
         """
         Map a callable over valid elements of the array.
@@ -63,7 +67,7 @@ def _str_map(
         """
         if dtype is None:
             dtype = np.dtype("object")
-        if na_value is None:
+        if na_value is lib.no_default:
             na_value = self.dtype.na_value  # type: ignore[attr-defined]
 
         if not len(self):
@@ -127,7 +131,12 @@ def _str_pad(
         return self._str_map(f)
 
     def _str_contains(
-        self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na=lib.no_default,
+        regex: bool = True,
     ):
         if regex:
             if not case:
@@ -144,11 +153,11 @@ def _str_contains(
                 f = lambda x: upper_pat in x.upper()
         return self._str_map(f, na, dtype=np.dtype("bool"))
 
-    def _str_startswith(self, pat, na=None):
+    def _str_startswith(self, pat, na=lib.no_default):
         f = lambda x: x.startswith(pat)
         return self._str_map(f, na_value=na, dtype=np.dtype(bool))
 
-    def _str_endswith(self, pat, na=None):
+    def _str_endswith(self, pat, na=lib.no_default):
         f = lambda x: x.endswith(pat)
         return self._str_map(f, na_value=na, dtype=np.dtype(bool))
 
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 00677ef4fcfe9..7c30d5db09607 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -33,20 +33,28 @@ def test_contains(any_string_dtype):
     pat = "mmm[_]+"
 
     result = values.str.contains(pat)
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series(
-        np.array([False, np.nan, True, True, False], dtype=np.object_),
-        dtype=expected_dtype,
-    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([False, False, True, True, False], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series(
+            np.array([False, np.nan, True, True, False], dtype=np.object_),
+            dtype=expected_dtype,
+        )
+
     tm.assert_series_equal(result, expected)
 
     result = values.str.contains(pat, regex=False)
-    expected = Series(
-        np.array([False, np.nan, False, False, True], dtype=np.object_),
-        dtype=expected_dtype,
-    )
+    if any_string_dtype == "str":
+        expected = Series([False, False, False, False, True], dtype=bool)
+    else:
+        expected = Series(
+            np.array([False, np.nan, False, False, True], dtype=np.object_),
+            dtype=expected_dtype,
+        )
     tm.assert_series_equal(result, expected)
 
     values = Series(
@@ -83,12 +91,16 @@ def test_contains(any_string_dtype):
     pat = "mmm[_]+"
 
     result = values.str.contains(pat)
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series(
-        np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype
-    )
+    if any_string_dtype == "str":
+        expected = Series([False, False, True, True], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series(
+            np.array([False, np.nan, True, True], dtype=np.object_),
+            dtype=expected_dtype,
+        )
     tm.assert_series_equal(result, expected)
 
     result = values.str.contains(pat, na=False)
@@ -179,39 +191,45 @@ def test_contains_moar(any_string_dtype):
     )
 
     result = s.str.contains("a")
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        na_value = False
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        na_value = np.nan
     expected = Series(
-        [False, False, False, True, True, False, np.nan, False, False, True],
+        [False, False, False, True, True, False, na_value, False, False, True],
         dtype=expected_dtype,
     )
     tm.assert_series_equal(result, expected)
 
     result = s.str.contains("a", case=False)
     expected = Series(
-        [True, False, False, True, True, False, np.nan, True, False, True],
+        [True, False, False, True, True, False, na_value, True, False, True],
         dtype=expected_dtype,
     )
     tm.assert_series_equal(result, expected)
 
     result = s.str.contains("Aa")
     expected = Series(
-        [False, False, False, True, False, False, np.nan, False, False, False],
+        [False, False, False, True, False, False, na_value, False, False, False],
         dtype=expected_dtype,
     )
     tm.assert_series_equal(result, expected)
 
     result = s.str.contains("ba")
     expected = Series(
-        [False, False, False, True, False, False, np.nan, False, False, False],
+        [False, False, False, True, False, False, na_value, False, False, False],
         dtype=expected_dtype,
     )
     tm.assert_series_equal(result, expected)
 
     result = s.str.contains("ba", case=False)
     expected = Series(
-        [False, False, False, True, True, False, np.nan, True, False, False],
+        [False, False, False, True, True, False, na_value, True, False, False],
         dtype=expected_dtype,
     )
     tm.assert_series_equal(result, expected)
@@ -242,10 +260,14 @@ def test_contains_nan(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
     result = s.str.contains("foo")
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([False, False, False], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
 
@@ -688,36 +710,41 @@ def test_replace_regex_single_character(regex, any_string_dtype):
 
 
 def test_match(any_string_dtype):
-    # New match behavior introduced in 0.13
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        na_value = False
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        na_value = np.nan
 
     values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
     result = values.str.match(".*(BAD[_]+).*(BAD)")
-    expected = Series([True, np.nan, False], dtype=expected_dtype)
+    expected = Series([True, na_value, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
     values = Series(
         ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
     )
     result = values.str.match(".*BAD[_]+.*BAD")
-    expected = Series([True, True, np.nan, False], dtype=expected_dtype)
+    expected = Series([True, True, na_value, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
     result = values.str.match("BAD[_]+.*BAD")
-    expected = Series([False, True, np.nan, False], dtype=expected_dtype)
+    expected = Series([False, True, na_value, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
     values = Series(
         ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
     )
     result = values.str.match("^BAD[_]+.*BAD")
-    expected = Series([False, False, np.nan, False], dtype=expected_dtype)
+    expected = Series([False, False, na_value, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
     result = values.str.match("\\^BAD[_]+.*BAD")
-    expected = Series([False, True, np.nan, False], dtype=expected_dtype)
+    expected = Series([False, True, na_value, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
 
@@ -753,10 +780,17 @@ def test_match_na_kwarg(any_string_dtype):
     tm.assert_series_equal(result, expected)
 
     result = s.str.match("a")
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series([True, False, np.nan], dtype=expected_dtype)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        na_value = False
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        na_value = np.nan
+
+    expected = Series([True, False, na_value], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
 
@@ -781,10 +815,14 @@ def test_fullmatch(any_string_dtype):
         ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
     )
     result = ser.str.fullmatch(".*BAD[_]+.*BAD")
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series([True, False, np.nan, False], dtype=expected_dtype)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([True, False, False, False], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series([True, False, np.nan, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
 
@@ -792,10 +830,14 @@ def test_fullmatch_dollar_literal(any_string_dtype):
     # GH 56652
     ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype)
     result = ser.str.fullmatch("foo\\$")
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series([False, False, np.nan, True], dtype=expected_dtype)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([False, False, False, True], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series([False, False, np.nan, True], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
 
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
index 0b3f368afea5e..03b3a4f266c7e 100644
--- a/pandas/tests/strings/test_string_array.py
+++ b/pandas/tests/strings/test_string_array.py
@@ -13,7 +13,7 @@
 
 
 @pytest.mark.filterwarnings("ignore:Falling back")
-def test_string_array(nullable_string_dtype, any_string_method):
+def test_string_array(nullable_string_dtype, any_string_method, using_infer_string):
     method_name, args, kwargs = any_string_method
 
     data = ["a", "bb", np.nan, "ccc"]
@@ -39,7 +39,7 @@ def test_string_array(nullable_string_dtype, any_string_method):
             expected.values, skipna=True
         ):
             assert result.dtype == "boolean"
-            result = result.astype(object)
+            expected = expected.astype("boolean")
 
         elif expected.dtype == "bool":
             assert result.dtype == "boolean"
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index 1ce46497c3c22..a8aa72ebc3351 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -259,10 +259,14 @@ def test_isnumeric_unicode(method, expected, any_string_dtype):
 def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
     values = ["A", np.nan, "¼", "★", np.nan, "３", "four"]  # noqa: RUF001
     ser = Series(values, dtype=any_string_dtype)
-    expected_dtype = (
-        "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
-    )
-    expected = Series(expected, dtype=expected_dtype)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series(expected, dtype=object).fillna(False).astype(bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series(expected, dtype=expected_dtype)
     result = getattr(ser.str, method)()
     tm.assert_series_equal(result, expected)
 

From b06764e828cfc0c94662f73365bba0eee07e1811 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 26 Aug 2024 18:02:04 +0200
Subject: [PATCH 02/13] use no_default for ArrowEA._str_endswith as well

---
 pandas/core/arrays/arrow/array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 399af4076c60f..3cd89d6518003 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2327,7 +2327,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na=lib.no_default) -> Self
             result = result.fill_null(na)
         return type(self)(result)
 
-    def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self:
+    def _str_endswith(self, pat: str | tuple[str, ...], na=lib.no_default) -> Self:
         if isinstance(pat, str):
             result = pc.ends_with(self._pa_array, pattern=pat)
         else:

From b2357351297c7c90ea6831e84c515484cedce122 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 26 Aug 2024 20:43:32 +0200
Subject: [PATCH 03/13] update type annotations

---
 pandas/core/arrays/arrow/array.py  |  6 +++++-
 pandas/core/arrays/string_arrow.py | 14 ++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 3cd89d6518003..358ad7091839a 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2390,7 +2390,11 @@ def _str_match(
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
-        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ) -> Self:
         if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a1164392941f3..b0bf49415f16e 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -313,7 +313,7 @@ def _str_contains(
         return result
 
     def _str_startswith(
-        self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default
+        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
     ):
         if isinstance(pat, str):
             result = pc.starts_with(self._pa_array, pattern=pat)
@@ -337,7 +337,9 @@ def _str_startswith(
             result = result.fill_null(na)
         return self._predicate_result_converter(result)
 
-    def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
+    def _str_endswith(
+        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
+    ):
         if isinstance(pat, str):
             result = pc.ends_with(self._pa_array, pattern=pat)
         else:
@@ -389,14 +391,18 @@ def _str_match(
         pat: str,
         case: bool = True,
         flags: int = 0,
-        na: Scalar | None = lib.no_default,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not pat.startswith("^"):
             pat = f"^{pat}"
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
-        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default
+        self,
+        pat,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"

From 562118e2a2e2b3f44a136792fcbfdc6d03be9085 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 26 Aug 2024 20:48:34 +0200
Subject: [PATCH 04/13] update docstrings

---
 pandas/core/strings/accessor.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
index cc9821026fcda..e7bf5bbd2b3f4 100644
--- a/pandas/core/strings/accessor.py
+++ b/pandas/core/strings/accessor.py
@@ -1245,8 +1245,9 @@ def contains(
             Flags to pass through to the re module, e.g. re.IGNORECASE.
         na : scalar, optional
             Fill value for missing values. The default depends on dtype of the
-            array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
-            ``pandas.NA`` is used.
+            array. For object-dtype, ``numpy.nan`` is used. For the nullable
+            ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
+            ``False`` is used.
         regex : bool, default True
             If True, assumes the pat is a regular expression.
 
@@ -1378,8 +1379,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default):
             Regex module flags, e.g. re.IGNORECASE.
         na : scalar, optional
             Fill value for missing values. The default depends on dtype of the
-            array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
-            ``pandas.NA`` is used.
+            array. For object-dtype, ``numpy.nan`` is used. For the nullable
+            ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
+            ``False`` is used.
 
         Returns
         -------
@@ -1422,8 +1424,9 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default):
             Regex module flags, e.g. re.IGNORECASE.
         na : scalar, optional
             Fill value for missing values. The default depends on dtype of the
-            array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
-            ``pandas.NA`` is used.
+            array. For object-dtype, ``numpy.nan`` is used. For the nullable
+            ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype,
+            ``False`` is used.
 
         Returns
         -------
@@ -2586,7 +2589,7 @@ def count(self, pat, flags: int = 0):
 
     @forbid_nonstring_types(["bytes"])
     def startswith(
-        self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default
+        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
     ) -> Series | Index:
         """
         Test if the start of each string element matches a pattern.
@@ -2598,10 +2601,11 @@ def startswith(
         pat : str or tuple[str, ...]
             Character sequence or tuple of strings. Regular expressions are not
             accepted.
-        na : object, default NaN
+        na : scalar, optional
             Object shown if element tested is not a string. The default depends
             on dtype of the array. For object-dtype, ``numpy.nan`` is used.
-            For ``StringDtype``, ``pandas.NA`` is used.
+            For the nullable ``StringDtype``, ``pandas.NA`` is used.
+            For the ``"str"`` dtype, ``False`` is used.
 
         Returns
         -------
@@ -2656,7 +2660,7 @@ def startswith(
 
     @forbid_nonstring_types(["bytes"])
     def endswith(
-        self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default
+        self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
     ) -> Series | Index:
         """
         Test if the end of each string element matches a pattern.
@@ -2668,10 +2672,11 @@ def endswith(
         pat : str or tuple[str, ...]
             Character sequence or tuple of strings. Regular expressions are not
             accepted.
-        na : object, default NaN
+        na : scalar, optional
             Object shown if element tested is not a string. The default depends
             on dtype of the array. For object-dtype, ``numpy.nan`` is used.
-            For ``StringDtype``, ``pandas.NA`` is used.
+            For the nullable ``StringDtype``, ``pandas.NA`` is used.
+            For the ``"str"`` dtype, ``False`` is used.
 
         Returns
         -------

From ef05ade2a2338e806476dfdb2d78d2944a368f4a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 27 Aug 2024 08:28:11 +0200
Subject: [PATCH 05/13] more type annotations

---
 pandas/core/arrays/arrow/array.py   |  2 +-
 pandas/core/arrays/string_arrow.py  |  1 +
 pandas/core/strings/base.py         | 10 +++++++---
 pandas/core/strings/object_array.py |  8 ++++++--
 4 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 358ad7091839a..243c76321a841 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2383,7 +2383,7 @@ def _str_match(
         pat: str,
         case: bool = True,
         flags: int = 0,
-        na: Scalar | None = lib.no_default,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ) -> Self:
         if not pat.startswith("^"):
             pat = f"^{pat}"
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index b0bf49415f16e..fac9972d3cccd 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -223,6 +223,7 @@ def insert(self, loc: int, item) -> ArrowStringArray:
 
     def _predicate_result_converter(self, values, na=lib.no_default):
         if self.dtype.na_value is np.nan:
+            na_value: bool | float | lib.NoDefault
             if na is lib.no_default:
                 na_value = False
             elif not isna(na):
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
index 1281a03e297f9..85bb716f7c3e4 100644
--- a/pandas/core/strings/base.py
+++ b/pandas/core/strings/base.py
@@ -6,7 +6,7 @@
     Literal,
 )
 
-import numpy as np
+from pandas._libs import lib
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -88,7 +88,11 @@ def _str_repeat(self, repeats: int | Sequence[int]):
 
     @abc.abstractmethod
     def _str_match(
-        self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan
+        self,
+        pat: str,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         pass
 
@@ -98,7 +102,7 @@ def _str_fullmatch(
         pat: str | re.Pattern,
         case: bool = True,
         flags: int = 0,
-        na: Scalar = np.nan,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         pass
 
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
index c150a652d19c3..2cd837c2e8534 100644
--- a/pandas/core/strings/object_array.py
+++ b/pandas/core/strings/object_array.py
@@ -220,7 +220,11 @@ def rep(x, r):
             return type(self)._from_sequence(result, dtype=self.dtype)
 
     def _str_match(
-        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
+        self,
+        pat: str,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not case:
             flags |= re.IGNORECASE
@@ -235,7 +239,7 @@ def _str_fullmatch(
         pat: str | re.Pattern,
         case: bool = True,
         flags: int = 0,
-        na: Scalar | None = None,
+        na: Scalar | lib.NoDefault = lib.no_default,
     ):
         if not case:
             flags |= re.IGNORECASE

From b9612fcc1060db1f47ba8f0c3f03ee14607b39dc Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 27 Aug 2024 11:20:38 +0200
Subject: [PATCH 06/13] test and fix startswith/endswith

---
 pandas/core/arrays/string_.py             | 11 +++----
 pandas/core/arrays/string_arrow.py        | 14 +++++----
 pandas/tests/strings/test_find_replace.py | 37 ++++++++++++++++++-----
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index e4364b1ef507f..599373823bab5 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -453,7 +453,8 @@ def _str_map_nan_semantics(
                 if is_integer_dtype(dtype):
                     na_value = 0
                 else:
-                    na_value = True
+                    # NaN propagates as False
+                    na_value = False
 
             result = lib.map_infer_mask(
                 arr,
@@ -463,15 +464,13 @@ def _str_map_nan_semantics(
                 na_value=na_value,
                 dtype=np.dtype(cast(type, dtype)),
             )
-            if na_value_is_na and mask.any():
+            if na_value_is_na and is_integer_dtype(dtype) and mask.any():
                 # TODO: we could alternatively do this check before map_infer_mask
                 #  and adjust the dtype/na_value we pass there. Which is more
                 #  performant?
-                if is_integer_dtype(dtype):
-                    result = result.astype("float64")
-                else:
-                    result = result.astype("object")
+                result = result.astype("float64")
                 result[mask] = np.nan
+
             return result
 
         else:
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index fac9972d3cccd..27b40ec2686da 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -223,14 +223,16 @@ def insert(self, loc: int, item) -> ArrowStringArray:
 
     def _predicate_result_converter(self, values, na=lib.no_default):
         if self.dtype.na_value is np.nan:
-            na_value: bool | float | lib.NoDefault
+            na_value: bool | lib.NoDefault
             if na is lib.no_default:
                 na_value = False
-            elif not isna(na):
-                values = values.fill_null(bool(na))
+            elif isna(na):
+                # NaN propagates as False
+                values = values.fill_null(False)
                 na_value = lib.no_default
             else:
-                na_value = np.nan
+                values = values.fill_null(bool(na))
+                na_value = lib.no_default
             return ArrowExtensionArray(values).to_numpy(na_value=na_value)
         return BooleanDtype().__from_arrow__(values)
 
@@ -336,7 +338,7 @@ def _str_startswith(
             and not isna(na)
         ):
             result = result.fill_null(na)
-        return self._predicate_result_converter(result)
+        return self._predicate_result_converter(result, na=na)
 
     def _str_endswith(
         self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
@@ -361,7 +363,7 @@ def _str_endswith(
             and not isna(na)
         ):
             result = result.fill_null(na)
-        return self._predicate_result_converter(result)
+        return self._predicate_result_converter(result, na=na)
 
     def _str_replace(
         self,
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 7c30d5db09607..1b128120e2965 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -311,20 +311,31 @@ def test_startswith(pat, dtype, null_value, na):
 
 
 @pytest.mark.parametrize("na", [None, True, False])
-def test_startswith_nullable_string_dtype(nullable_string_dtype, na):
+def test_startswith_string_dtype(any_string_dtype, na):
     values = Series(
         ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
-        dtype=nullable_string_dtype,
+        dtype=any_string_dtype,
     )
     result = values.str.startswith("foo", na=na)
+
+    expected_dtype = (
+        (object if na is None else bool)
+        if is_object_or_nan_string_dtype(any_string_dtype)
+        else "boolean"
+    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        if na is None:
+            na = False
     exp = Series(
-        [False, na, True, False, False, na, True, False, False], dtype="boolean"
+        [False, na, True, False, False, na, True, False, False], dtype=expected_dtype
     )
     tm.assert_series_equal(result, exp)
 
     result = values.str.startswith("rege.", na=na)
     exp = Series(
-        [False, na, False, False, False, na, False, False, True], dtype="boolean"
+        [False, na, False, False, False, na, False, False, True], dtype=expected_dtype
     )
     tm.assert_series_equal(result, exp)
 
@@ -369,20 +380,30 @@ def test_endswith(pat, dtype, null_value, na):
 
 
 @pytest.mark.parametrize("na", [None, True, False])
-def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
+def test_endswith_string_dtype(any_string_dtype, na):
     values = Series(
         ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
-        dtype=nullable_string_dtype,
+        dtype=any_string_dtype,
     )
     result = values.str.endswith("foo", na=na)
+    expected_dtype = (
+        (object if na is None else bool)
+        if is_object_or_nan_string_dtype(any_string_dtype)
+        else "boolean"
+    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        if na is None:
+            na = False
     exp = Series(
-        [False, na, False, False, True, na, True, False, False], dtype="boolean"
+        [False, na, False, False, True, na, True, False, False], dtype=expected_dtype
     )
     tm.assert_series_equal(result, exp)
 
     result = values.str.endswith("rege.", na=na)
     exp = Series(
-        [False, na, False, False, False, na, False, False, True], dtype="boolean"
+        [False, na, False, False, False, na, False, False, True], dtype=expected_dtype
     )
     tm.assert_series_equal(result, exp)
 

From cf242a2b5bef277a4e45cccc57d5299b4c2aaa2c Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 27 Aug 2024 11:29:59 +0200
Subject: [PATCH 07/13] test ismethods

---
 pandas/tests/strings/test_strings.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
index a8aa72ebc3351..d24fddbc5905c 100644
--- a/pandas/tests/strings/test_strings.py
+++ b/pandas/tests/strings/test_strings.py
@@ -217,8 +217,21 @@ def test_ismethods(method, expected, any_string_dtype):
     tm.assert_series_equal(result, expected)
 
     # compare with standard library
-    expected = [getattr(item, method)() for item in ser]
-    assert list(result) == expected
+    expected_stdlib = [getattr(item, method)() for item in ser]
+    assert list(result) == expected_stdlib
+
+    # with missing value
+    ser.iloc[[1, 2, 3, 4]] = np.nan
+    result = getattr(ser.str, method)()
+    if ser.dtype == "object":
+        expected = expected.astype(object)
+        expected.iloc[[1, 2, 3, 4]] = np.nan
+    elif ser.dtype == "str":
+        # NaN propagates as False
+        expected.iloc[[1, 2, 3, 4]] = False
+    else:
+        # nullable dtypes propagate the missing value (as pd.NA)
+        expected.iloc[[1, 2, 3, 4]] = np.nan
 
 
 @pytest.mark.parametrize(

From ad0d6e1e15f46d7f477cedbf5c1b2de07809f85d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Sat, 31 Aug 2024 20:00:45 +0200
Subject: [PATCH 08/13] fix warnings

---
 pandas/core/arrays/_arrow_string_mixins.py |  6 +++---
 pandas/core/arrays/arrow/array.py          |  2 +-
 pandas/core/arrays/string_arrow.py         | 12 ++++++++++--
 pandas/tests/strings/test_find_replace.py  |  3 ++-
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index 0597c2b9dc4d7..0791faa07ebbf 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -34,7 +34,7 @@ class ArrowStringArrayMixin:
     def __init__(self, *args, **kwargs) -> None:
         raise NotImplementedError
 
-    def _convert_bool_result(self, result, na=lib.no_default):
+    def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
         # Convert a bool-dtype result to the appropriate result type
         raise NotImplementedError
 
@@ -130,7 +130,7 @@ def _str_startswith(
             and not isna(na)
         ):  # pyright: ignore [reportGeneralTypeIssues]
             result = result.fill_null(na)
-        return self._convert_bool_result(result, na=na)
+        return self._convert_bool_result(result, na=na, method_name="startswith")
 
     def _str_endswith(
         self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default
@@ -153,4 +153,4 @@ def _str_endswith(
             and not isna(na)
         ):  # pyright: ignore [reportGeneralTypeIssues]
             result = result.fill_null(na)
-        return self._convert_bool_result(result, na=na)
+        return self._convert_bool_result(result, na=na, method_name="endswith")
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 7eff347823b20..c5447953809ba 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2311,7 +2311,7 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
             for chunk in self._pa_array.iterchunks()
         ]
 
-    def _convert_bool_result(self, result, na=lib.no_default):
+    def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
         return type(self)(result)
 
     def _convert_int_result(self, result):
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 6fc507e8a5444..501e90448251b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -223,7 +223,7 @@ def insert(self, loc: int, item) -> ArrowStringArray:
             raise TypeError("Scalar must be NA or str")
         return super().insert(loc, item)
 
-    def _convert_bool_result(self, values, na=lib.no_default):
+    def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
         if self.dtype.na_value is np.nan:
             na_value: bool | lib.NoDefault
             if na is lib.no_default:
@@ -233,6 +233,14 @@ def _convert_bool_result(self, values, na=lib.no_default):
                 values = values.fill_null(False)
                 na_value = lib.no_default
             else:
+                if not isinstance(na, bool):
+                    # GH#59561
+                    warnings.warn(
+                        f"Allowing a non-bool 'na' in obj.str.{method_name} is "
+                        "deprecated and will raise in a future version.",
+                        FutureWarning,
+                        stacklevel=find_stack_level(),
+                    )
                 values = values.fill_null(bool(na))
                 na_value = lib.no_default
             return ArrowExtensionArray(values).to_numpy(na_value=na_value)
@@ -310,7 +318,7 @@ def _str_contains(
             result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case)
         else:
             result = pc.match_substring(self._pa_array, pat, ignore_case=not case)
-        result = self._convert_bool_result(result, na=na)
+        result = self._convert_bool_result(result, na=na, method_name="contains")
         if (
             self.dtype.na_value is libmissing.NA
             and na is not lib.no_default
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index a63b23ed40f9e..65cf19ab5da01 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -299,7 +299,8 @@ def test_startswith_endswith_validate_na(any_string_dtype):
 
     dtype = ser.dtype
     if (
-        isinstance(dtype, pd.StringDtype) and dtype.storage == "python"
+        isinstance(dtype, pd.StringDtype)
+        and (dtype.storage == "python" or dtype.na_value is np.nan)
     ) or dtype == np.dtype("object"):
         msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
         with tm.assert_produces_warning(FutureWarning, match=msg):

From bf02000a5aecb736dbd675df1828ea700e62fae6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 2 Sep 2024 08:54:02 +0200
Subject: [PATCH 09/13] try fix typing

---
 pandas/core/arrays/_arrow_string_mixins.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index 0791faa07ebbf..071240b049e09 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -27,9 +27,12 @@
         Self,
     )
 
+    from pandas.core.dtypes.dtypes import ExtensionDtype
+
 
 class ArrowStringArrayMixin:
     _pa_array: Sized
+    dtype: ExtensionDtype
 
     def __init__(self, *args, **kwargs) -> None:
         raise NotImplementedError

From 377ff3aaf605f39198a7fd1e81cfb0d2080fac2c Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 6 Sep 2024 18:21:55 +0200
Subject: [PATCH 10/13] follow same behaviour for categorical[str]

---
 pandas/core/arrays/_arrow_string_mixins.py |  2 +-
 pandas/core/arrays/categorical.py          | 16 ++++++++++++----
 pandas/core/arrays/string_.py              |  1 +
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index a9cea627e988a..f8c2b4a78b06a 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -223,7 +223,7 @@ def _str_contains(
         pat,
         case: bool = True,
         flags: int = 0,
-        na=lib.no_default,
+        na: Scalar | lib.NoDefault = lib.no_default,
         regex: bool = True,
     ):
         if flags:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index c613a345686cc..48ba608243e0b 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2669,16 +2669,24 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
     # ------------------------------------------------------------------------
     # String methods interface
     def _str_map(
-        self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
+        self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True
     ):
         # Optimization to apply the callable `f` to the categories once
         # and rebuild the result by `take`ing from the result with the codes.
         # Returns the same type as the object-dtype implementation though.
-        from pandas.core.arrays import NumpyExtensionArray
-
         categories = self.categories
         codes = self.codes
-        result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
+        if categories.dtype == "string":
+            result = categories.array._str_map(f, na_value, dtype)
+            if categories.dtype.na_value is np.nan:
+                # NaN propagates as False
+                na_value = False
+        else:
+            from pandas.core.arrays import NumpyExtensionArray
+
+            result = NumpyExtensionArray(categories.to_numpy())._str_map(
+                f, na_value, dtype
+            )
         return take_nd(result, codes, fill_value=na_value)
 
     def _str_get_dummies(self, sep: str = "|"):
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2a0498932ce21..1e9b07d37cf2e 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -440,6 +440,7 @@ def _str_map_nan_semantics(
             dtype = self.dtype
         if na_value is lib.no_default:
             if is_bool_dtype(dtype):
+                # NaN propagates as False
                 na_value = False
             else:
                 na_value = self.dtype.na_value

From 2dfd50bb52081466512a23624f66a1bd6a254691 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 6 Sep 2024 19:11:53 +0200
Subject: [PATCH 11/13] simplify fill_null calls for string[pyarrow] case

---
 pandas/core/arrays/_arrow_string_mixins.py | 28 +------------
 pandas/core/arrays/arrow/array.py          |  2 +
 pandas/core/arrays/string_arrow.py         | 47 ++++++++--------------
 pandas/tests/strings/test_find_replace.py  |  5 +--
 4 files changed, 20 insertions(+), 62 deletions(-)

diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
index f8c2b4a78b06a..22b1d84e2373e 100644
--- a/pandas/core/arrays/_arrow_string_mixins.py
+++ b/pandas/core/arrays/_arrow_string_mixins.py
@@ -9,18 +9,13 @@
 
 import numpy as np
 
-from pandas._libs import (
-    lib,
-    missing as libmissing,
-)
+from pandas._libs import lib
 from pandas.compat import (
     pa_version_under10p1,
     pa_version_under13p0,
     pa_version_under17p0,
 )
 
-from pandas.core.dtypes.missing import isna
-
 if not pa_version_under10p1:
     import pyarrow as pa
     import pyarrow.compute as pc
@@ -36,12 +31,9 @@
         Self,
     )
 
-    from pandas.core.dtypes.dtypes import ExtensionDtype
-
 
 class ArrowStringArrayMixin:
     _pa_array: Sized
-    dtype: ExtensionDtype
 
     def __init__(self, *args, **kwargs) -> None:
         raise NotImplementedError
@@ -151,12 +143,6 @@ def _str_startswith(
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
-        if (
-            self.dtype.na_value is libmissing.NA
-            and na is not lib.no_default
-            and not isna(na)
-        ):  # pyright: ignore [reportGeneralTypeIssues]
-            result = result.fill_null(na)
         return self._convert_bool_result(result, na=na, method_name="startswith")
 
     def _str_endswith(
@@ -174,12 +160,6 @@ def _str_endswith(
 
                 for p in pat[1:]:
                     result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
-        if (
-            self.dtype.na_value is libmissing.NA
-            and na is not lib.no_default
-            and not isna(na)
-        ):  # pyright: ignore [reportGeneralTypeIssues]
-            result = result.fill_null(na)
         return self._convert_bool_result(result, na=na, method_name="endswith")
 
     def _str_isalnum(self):
@@ -234,12 +214,6 @@ def _str_contains(
         else:
             pa_contains = pc.match_substring
         result = pa_contains(self._pa_array, pat, ignore_case=not case)
-        if (
-            self.dtype.na_value is libmissing.NA
-            and na is not lib.no_default
-            and not isna(na)
-        ):  # pyright: ignore [reportGeneralTypeIssues]
-            result = result.fill_null(na)
         return self._convert_bool_result(result, na=na, method_name="contains")
 
     def _str_find(self, sub: str, start: int = 0, end: int | None = None):
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index c08c84cbdfeda..2309ebbe2b25e 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2312,6 +2312,8 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
         ]
 
     def _convert_bool_result(self, result, na=lib.no_default, method_name=None):
+        if na is not lib.no_default and not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
+            result = result.fill_null(na)
         return type(self)(result)
 
     def _convert_int_result(self, result):
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index e10c0134b6348..f3763f2a75aa8 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -224,26 +224,26 @@ def insert(self, loc: int, item) -> ArrowStringArray:
         return super().insert(loc, item)
 
     def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
+        if na is not lib.no_default and not isna(na) and not isinstance(na, bool):
+            # GH#59561
+            warnings.warn(
+                f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated "
+                "and will raise in a future version.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+            na = bool(na)
+
         if self.dtype.na_value is np.nan:
-            na_value: bool | lib.NoDefault
-            if na is lib.no_default:
-                na_value = False
-            elif isna(na):
+            if na is lib.no_default or isna(na):
                 # NaN propagates as False
                 values = values.fill_null(False)
-                na_value = lib.no_default
             else:
-                if not isinstance(na, bool):
-                    # GH#59561
-                    warnings.warn(
-                        f"Allowing a non-bool 'na' in obj.str.{method_name} is "
-                        "deprecated and will raise in a future version.",
-                        FutureWarning,
-                        stacklevel=find_stack_level(),
-                    )
-                values = values.fill_null(bool(na))
-                na_value = lib.no_default
-            return ArrowExtensionArray(values).to_numpy(na_value=na_value)
+                values = values.fill_null(na)
+            return values.to_numpy(zero_copy_only=False)
+        else:
+            if na is not lib.no_default and not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
+                values = values.fill_null(na)
         return BooleanDtype().__from_arrow__(values)
 
     def _maybe_convert_setitem_value(self, value):
@@ -325,21 +325,6 @@ def _str_contains(
                 fallback_performancewarning()
             return super()._str_contains(pat, case, flags, na, regex)
 
-        if (
-            self.dtype.na_value is libmissing.NA
-            and na is not lib.no_default
-            and not isna(na)
-        ):
-            if not isinstance(na, bool):
-                # GH#59561
-                warnings.warn(
-                    "Allowing a non-bool 'na' in obj.str.contains is deprecated "
-                    "and will raise in a future version.",
-                    FutureWarning,
-                    stacklevel=find_stack_level(),
-                )
-                na = bool(na)
-
         return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
 
     def _str_replace(
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 65cf19ab5da01..79c1c0c182493 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -298,10 +298,7 @@ def test_startswith_endswith_validate_na(any_string_dtype):
     )
 
     dtype = ser.dtype
-    if (
-        isinstance(dtype, pd.StringDtype)
-        and (dtype.storage == "python" or dtype.na_value is np.nan)
-    ) or dtype == np.dtype("object"):
+    if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"):
         msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated"
         with tm.assert_produces_warning(FutureWarning, match=msg):
             ser.str.startswith("kapow", na="baz")

From adf2b9953e039f0fe2ea88a393d188f15d221ebd Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 6 Sep 2024 19:41:19 +0200
Subject: [PATCH 12/13] fix na_value handling for categorical case + update
 tests for expected categorical behaviour

---
 pandas/core/arrays/categorical.py         | 8 ++++++--
 pandas/tests/strings/test_api.py          | 9 ++++++++-
 pandas/tests/strings/test_find_replace.py | 8 ++++++--
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 48ba608243e0b..4776d4a04a939 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2678,8 +2678,12 @@ def _str_map(
         codes = self.codes
         if categories.dtype == "string":
             result = categories.array._str_map(f, na_value, dtype)
-            if categories.dtype.na_value is np.nan:
-                # NaN propagates as False
+            if (
+                categories.dtype.na_value is np.nan
+                and is_bool_dtype(dtype)
+                and (na_value is lib.no_default or isna(na_value))
+            ):
+                # NaN propagates as False for functions with boolean return type
                 na_value = False
         else:
             from pandas.core.arrays import NumpyExtensionArray
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
index 2511474e03ff7..4a1b97606db2b 100644
--- a/pandas/tests/strings/test_api.py
+++ b/pandas/tests/strings/test_api.py
@@ -122,6 +122,7 @@ def test_api_per_method(
     any_allowed_skipna_inferred_dtype,
     any_string_method,
     request,
+    using_infer_string,
 ):
     # this test does not check correctness of the different methods,
     # just that the methods work on the specified (inferred) dtypes,
@@ -160,6 +161,10 @@ def test_api_per_method(
     t = box(values, dtype=dtype)  # explicit dtype to avoid casting
     method = getattr(t.str, method_name)
 
+    if using_infer_string and dtype == "category":
+        string_allowed = method_name not in ["decode"]
+    else:
+        string_allowed = True
     bytes_allowed = method_name in ["decode", "get", "len", "slice"]
     # as of v0.23.4, all methods except 'cat' are very lenient with the
     # allowed data types, just returning NaN for entries that error.
@@ -168,7 +173,8 @@ def test_api_per_method(
     mixed_allowed = method_name not in ["cat"]
 
     allowed_types = (
-        ["string", "unicode", "empty"]
+        ["empty"]
+        + ["string", "unicode"] * string_allowed
         + ["bytes"] * bytes_allowed
         + ["mixed", "mixed-integer"] * mixed_allowed
     )
@@ -182,6 +188,7 @@ def test_api_per_method(
         msg = (
             f"Cannot use .str.{method_name} with values of "
             f"inferred dtype {inferred_dtype!r}."
+            "|a bytes-like object is required, not 'str'"
         )
         with pytest.raises(TypeError, match=msg):
             method(*args, **kwargs)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
index 79c1c0c182493..408fc2028b10a 100644
--- a/pandas/tests/strings/test_find_replace.py
+++ b/pandas/tests/strings/test_find_replace.py
@@ -320,7 +320,7 @@ def test_startswith_endswith_validate_na(any_string_dtype):
 @pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
-def test_startswith(pat, dtype, null_value, na):
+def test_startswith(pat, dtype, null_value, na, using_infer_string):
     # add category dtype parametrizations for GH-36241
     values = Series(
         ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
@@ -334,6 +334,8 @@ def test_startswith(pat, dtype, null_value, na):
         exp = exp.fillna(null_value)
     elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
+    elif using_infer_string and dtype == "category":
+        exp = exp.fillna(False).astype(bool)
     tm.assert_series_equal(result, exp)
 
     result = values.str.startswith(pat, na=na)
@@ -389,7 +391,7 @@ def test_startswith_string_dtype(any_string_dtype, na):
 @pytest.mark.parametrize("dtype", ["object", "category"])
 @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
 @pytest.mark.parametrize("na", [True, False])
-def test_endswith(pat, dtype, null_value, na):
+def test_endswith(pat, dtype, null_value, na, using_infer_string):
     # add category dtype parametrizations for GH-36241
     values = Series(
         ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
@@ -403,6 +405,8 @@ def test_endswith(pat, dtype, null_value, na):
         exp = exp.fillna(null_value)
     elif dtype == "object" and null_value is None:
         exp[exp.isna()] = None
+    elif using_infer_string and dtype == "category":
+        exp = exp.fillna(False).astype(bool)
     tm.assert_series_equal(result, exp)
 
     result = values.str.endswith(pat, na=na)

From ddd531a978b14ec72fdb8b0c0e728de43e907ef6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 6 Sep 2024 20:16:29 +0200
Subject: [PATCH 13/13] fix typing + fix conversion for old pyarrow

---
 pandas/core/arrays/categorical.py  | 4 ++--
 pandas/core/arrays/string_arrow.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 4776d4a04a939..269d88baa6422 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -2677,9 +2677,9 @@ def _str_map(
         categories = self.categories
         codes = self.codes
         if categories.dtype == "string":
-            result = categories.array._str_map(f, na_value, dtype)
+            result = categories.array._str_map(f, na_value, dtype)  # type: ignore[attr-defined]
             if (
-                categories.dtype.na_value is np.nan
+                categories.dtype.na_value is np.nan  # type: ignore[union-attr]
                 and is_bool_dtype(dtype)
                 and (na_value is lib.no_default or isna(na_value))
             ):
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index f3763f2a75aa8..129272ecb64f7 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -240,7 +240,7 @@ def _convert_bool_result(self, values, na=lib.no_default, method_name=None):
                 values = values.fill_null(False)
             else:
                 values = values.fill_null(na)
-            return values.to_numpy(zero_copy_only=False)
+            return values.to_numpy()
         else:
             if na is not lib.no_default and not isna(na):  # pyright: ignore [reportGeneralTypeIssues]
                 values = values.fill_null(na)