From 9620e00c7c64bd909a18235439ab9f289d697f2d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 26 Aug 2024 17:53:12 +0200 Subject: [PATCH 01/13] String dtype: propagate NaNs as False in predicate methods (eg .str.startswith) --- pandas/core/arrays/arrow/array.py | 23 ++-- pandas/core/arrays/string_.py | 19 ++- pandas/core/arrays/string_arrow.py | 72 +++++++---- pandas/core/strings/accessor.py | 15 ++- pandas/core/strings/object_array.py | 19 ++- pandas/tests/strings/test_find_replace.py | 142 ++++++++++++++-------- pandas/tests/strings/test_string_array.py | 4 +- pandas/tests/strings/test_strings.py | 12 +- 8 files changed, 206 insertions(+), 100 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e95fa441e18fb..399af4076c60f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2291,7 +2291,12 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: return type(self)(pc.count_substring_regex(self._pa_array, pat)) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ) -> Self: if flags: raise NotImplementedError(f"contains not implemented with {flags=}") @@ -2301,11 +2306,11 @@ def _str_contains( else: pa_contains = pc.match_substring result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): + if na is not lib.no_default and not isna(na): result = result.fill_null(na) return type(self)(result) - def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: + def _str_startswith(self, pat: str | tuple[str, ...], na=lib.no_default) -> Self: if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) else: @@ -2318,7 +2323,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: for p in pat[1:]: result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): + if na is not lib.no_default and not isna(na): result = result.fill_null(na) return type(self)(result) @@ -2335,7 +2340,7 @@ def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: for p in pat[1:]: result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): + if na is not lib.no_default and not isna(na): result = result.fill_null(na) return type(self)(result) @@ -2374,14 +2379,18 @@ def _str_repeat(self, repeats: int | Sequence[int]) -> Self: return type(self)(pc.binary_repeat(self._pa_array, repeats)) def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | None = lib.no_default, ) -> Self: if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default ) -> Self: if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2e7f9314c4f09..e4364b1ef507f 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -351,7 +351,11 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: return cls._from_sequence(scalars, dtype=dtype) def _str_map( - self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: Dtype | None = None, + convert: bool = True, ): if self.dtype.na_value is np.nan: return self._str_map_nan_semantics(f, na_value=na_value, dtype=dtype) @@ -360,7 +364,7 @@ def _str_map( if dtype is None: dtype = self.dtype - if na_value is None: + if na_value is lib.no_default: na_value = self.dtype.na_value mask = isna(self) @@ -429,11 +433,16 @@ def _str_map_str_or_object( # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8")) - def _str_map_nan_semantics(self, f, na_value=None, dtype: Dtype | None = None): + def _str_map_nan_semantics( + self, f, na_value=lib.no_default, dtype: Dtype | None = None + ): if dtype is None: dtype = self.dtype - if na_value is None: - na_value = self.dtype.na_value + if na_value is lib.no_default: + if is_bool_dtype(dtype): + na_value = False + else: + na_value = self.dtype.na_value mask = isna(self) arr = np.asarray(self) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 67114815341b6..a1164392941f3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -221,11 +221,16 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _result_converter(self, values, na=None): + def _predicate_result_converter(self, values, na=lib.no_default): if self.dtype.na_value is np.nan: - if not isna(na): + if na is lib.no_default: + na_value = False + elif not isna(na): values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) + na_value = lib.no_default + else: + na_value = np.nan + return ArrowExtensionArray(values).to_numpy(na_value=na_value) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -282,7 +287,12 @@ def astype(self, dtype, copy: bool = True): _str_map = BaseStringArray._str_map def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if flags: if get_option("mode.performance_warnings"): @@ -293,12 +303,18 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._result_converter(result, na=na) - if not isna(na): + result = self._predicate_result_converter(result, na=na) + if ( + self.dtype.na_value is libmissing.NA + and na is not lib.no_default + and not isna(na) + ): result[isna(result)] = bool(na) return result - def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + def _str_startswith( + self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default + ): if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) else: @@ -313,9 +329,13 @@ def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if not isna(na): + if ( + self.dtype.na_value is libmissing.NA + and na is not lib.no_default + and not isna(na) + ): result = result.fill_null(na) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): if isinstance(pat, str): @@ -332,9 +352,13 @@ def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): for p in pat[1:]: result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if not isna(na): + if ( + self.dtype.na_value is libmissing.NA + and na is not lib.no_default + and not isna(na) + ): result = result.fill_null(na) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_replace( self, @@ -361,14 +385,18 @@ def _str_repeat(self, repeats: int | Sequence[int]): return type(self)(pc.binary_repeat(self._pa_array, repeats)) def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | None = lib.no_default, ): if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default ): if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" @@ -389,39 +417,39 @@ def _str_slice( def _str_isalnum(self): result = pc.utf8_is_alnum(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_isalpha(self): result = pc.utf8_is_alpha(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_isdecimal(self): result = pc.utf8_is_decimal(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_isdigit(self): result = pc.utf8_is_digit(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_islower(self): result = pc.utf8_is_lower(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_isnumeric(self): result = pc.utf8_is_numeric(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_isspace(self): result = pc.utf8_is_space(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_istitle(self): result = pc.utf8_is_title(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) - return self._result_converter(result) + return self._predicate_result_converter(result) def _str_len(self): result = pc.utf8_length(self._pa_array) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c88270b2a2f16..cc9821026fcda 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1222,7 +1222,12 @@ def join(self, sep: str): @forbid_nonstring_types(["bytes"]) def contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -1359,7 +1364,7 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat: str, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string starts with a match of a regular expression. @@ -1403,7 +1408,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=None): return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): + def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): """ Determine if each string entirely matches a regular expression. @@ -2581,7 +2586,7 @@ def count(self, pat, flags: int = 0): @forbid_nonstring_types(["bytes"]) def startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2651,7 +2656,7 @@ def startswith( @forbid_nonstring_types(["bytes"]) def endswith( - self, pat: str | tuple[str, ...], na: Scalar | None = None + self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default ) -> Series | Index: """ Test if the end of each string element matches a pattern. diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 100afa956bd24..c150a652d19c3 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -42,7 +42,11 @@ def __len__(self) -> int: raise NotImplementedError def _str_map( - self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True + self, + f, + na_value=lib.no_default, + dtype: NpDtype | None = None, + convert: bool = True, ): """ Map a callable over valid elements of the array. @@ -63,7 +67,7 @@ def _str_map( """ if dtype is None: dtype = np.dtype("object") - if na_value is None: + if na_value is lib.no_default: na_value = self.dtype.na_value # type: ignore[attr-defined] if not len(self): @@ -127,7 +131,12 @@ def _str_pad( return self._str_map(f) def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True + self, + pat, + case: bool = True, + flags: int = 0, + na=lib.no_default, + regex: bool = True, ): if regex: if not case: @@ -144,11 +153,11 @@ def _str_contains( f = lambda x: upper_pat in x.upper() return self._str_map(f, na, dtype=np.dtype("bool")) - def _str_startswith(self, pat, na=None): + def _str_startswith(self, pat, na=lib.no_default): f = lambda x: x.startswith(pat) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - def _str_endswith(self, pat, na=None): + def _str_endswith(self, pat, na=lib.no_default): f = lambda x: x.endswith(pat) return self._str_map(f, na_value=na, dtype=np.dtype(bool)) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 00677ef4fcfe9..7c30d5db09607 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -33,20 +33,28 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series( - np.array([False, np.nan, True, True, False], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, True, True, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_), + dtype=expected_dtype, + ) + tm.assert_series_equal(result, expected) result = values.str.contains(pat, regex=False) - expected = Series( - np.array([False, np.nan, False, False, True], dtype=np.object_), - dtype=expected_dtype, - ) + if any_string_dtype == "str": + expected = Series([False, False, False, False, True], dtype=bool) + else: + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) values = Series( @@ -83,12 +91,16 @@ def test_contains(any_string_dtype): pat = "mmm[_]+" result = values.str.contains(pat) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series( - np.array([False, np.nan, True, True], dtype=np.object_), dtype=expected_dtype - ) + if any_string_dtype == "str": + expected = Series([False, False, True, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series( + np.array([False, np.nan, True, True], dtype=np.object_), + dtype=expected_dtype, + ) tm.assert_series_equal(result, expected) result = values.str.contains(pat, na=False) @@ -179,39 +191,45 @@ def test_contains_moar(any_string_dtype): ) result = s.str.contains("a") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan expected = Series( - [False, False, False, True, True, False, np.nan, False, False, True], + [False, False, False, True, True, False, na_value, False, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("a", case=False) expected = Series( - [True, False, False, True, True, False, np.nan, True, False, True], + [True, False, False, True, True, False, na_value, True, False, True], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("Aa") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba") expected = Series( - [False, False, False, True, False, False, np.nan, False, False, False], + [False, False, False, True, False, False, na_value, False, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) result = s.str.contains("ba", case=False) expected = Series( - [False, False, False, True, True, False, np.nan, True, False, False], + [False, False, False, True, True, False, na_value, True, False, False], dtype=expected_dtype, ) tm.assert_series_equal(result, expected) @@ -242,10 +260,14 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.contains("foo") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -688,36 +710,41 @@ def test_replace_regex_single_character(regex, any_string_dtype): def test_match(any_string_dtype): - # New match behavior introduced in 0.13 - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype) result = values.str.match(".*(BAD[_]+).*(BAD)") - expected = Series([True, np.nan, False], dtype=expected_dtype) + expected = Series([True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match(".*BAD[_]+.*BAD") - expected = Series([True, True, np.nan, False], dtype=expected_dtype) + expected = Series([True, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) values = Series( ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = values.str.match("^BAD[_]+.*BAD") - expected = Series([False, False, np.nan, False], dtype=expected_dtype) + expected = Series([False, False, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) result = values.str.match("\\^BAD[_]+.*BAD") - expected = Series([False, True, np.nan, False], dtype=expected_dtype) + expected = Series([False, True, na_value, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -753,10 +780,17 @@ def test_match_na_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) result = s.str.match("a") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([True, False, np.nan], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + na_value = False + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + na_value = np.nan + + expected = Series([True, False, na_value], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -781,10 +815,14 @@ def test_fullmatch(any_string_dtype): ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype ) result = ser.str.fullmatch(".*BAD[_]+.*BAD") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([True, False, np.nan, False], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([True, False, False, False], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, False, np.nan, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) @@ -792,10 +830,14 @@ def test_fullmatch_dollar_literal(any_string_dtype): # GH 56652 ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype) result = ser.str.fullmatch("foo\\$") - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series([False, False, np.nan, True], dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series([False, False, False, True], dtype=bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([False, False, np.nan, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..03b3a4f266c7e 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -13,7 +13,7 @@ @pytest.mark.filterwarnings("ignore:Falling back") -def test_string_array(nullable_string_dtype, any_string_method): +def test_string_array(nullable_string_dtype, any_string_method, using_infer_string): method_name, args, kwargs = any_string_method data = ["a", "bb", np.nan, "ccc"] @@ -39,7 +39,7 @@ def test_string_array(nullable_string_dtype, any_string_method): expected.values, skipna=True ): assert result.dtype == "boolean" - result = result.astype(object) + expected = expected.astype("boolean") elif expected.dtype == "bool": assert result.dtype == "boolean" diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..a8aa72ebc3351 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -259,10 +259,14 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): def test_isnumeric_unicode_missing(method, expected, any_string_dtype): values = ["A", np.nan, "¼", "★", np.nan, "3", "four"] # noqa: RUF001 ser = Series(values, dtype=any_string_dtype) - expected_dtype = ( - "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" - ) - expected = Series(expected, dtype=expected_dtype) + if any_string_dtype == "str": + # NaN propagates as False + expected = Series(expected, dtype=object).fillna(False).astype(bool) + else: + expected_dtype = ( + "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series(expected, dtype=expected_dtype) result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) From b06764e828cfc0c94662f73365bba0eee07e1811 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 26 Aug 2024 18:02:04 +0200 Subject: [PATCH 02/13] use no_default for ArrowEA._str_endswith as well --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 399af4076c60f..3cd89d6518003 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2327,7 +2327,7 @@ def _str_startswith(self, pat: str | tuple[str, ...], na=lib.no_default) -> Self result = result.fill_null(na) return type(self)(result) - def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: + def _str_endswith(self, pat: str | tuple[str, ...], na=lib.no_default) -> Self: if isinstance(pat, str): result = pc.ends_with(self._pa_array, pattern=pat) else: From b2357351297c7c90ea6831e84c515484cedce122 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 26 Aug 2024 20:43:32 +0200 Subject: [PATCH 03/13] update type annotations --- pandas/core/arrays/arrow/array.py | 6 +++++- pandas/core/arrays/string_arrow.py | 14 ++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3cd89d6518003..358ad7091839a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2390,7 +2390,11 @@ def _str_match( return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ) -> Self: if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a1164392941f3..b0bf49415f16e 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -313,7 +313,7 @@ def _str_contains( return result def _str_startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ): if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) @@ -337,7 +337,9 @@ def _str_startswith( result = result.fill_null(na) return self._predicate_result_converter(result) - def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None): + def _str_endswith( + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default + ): if isinstance(pat, str): result = pc.ends_with(self._pa_array, pattern=pat) else: @@ -389,14 +391,18 @@ def _str_match( pat: str, case: bool = True, flags: int = 0, - na: Scalar | None = lib.no_default, + na: Scalar | lib.NoDefault = lib.no_default, ): if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = lib.no_default + self, + pat, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" From 562118e2a2e2b3f44a136792fcbfdc6d03be9085 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 26 Aug 2024 20:48:34 +0200 Subject: [PATCH 04/13] update docstrings --- pandas/core/strings/accessor.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index cc9821026fcda..e7bf5bbd2b3f4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1245,8 +1245,9 @@ def contains( Flags to pass through to the re module, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. regex : bool, default True If True, assumes the pat is a regular expression. @@ -1378,8 +1379,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -1422,8 +1424,9 @@ def fullmatch(self, pat, case: bool = True, flags: int = 0, na=lib.no_default): Regex module flags, e.g. re.IGNORECASE. na : scalar, optional Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. + array. For object-dtype, ``numpy.nan`` is used. For the nullable + ``StringDtype``, ``pandas.NA`` is used. For the ``"str"`` dtype, + ``False`` is used. Returns ------- @@ -2586,7 +2589,7 @@ def count(self, pat, flags: int = 0): @forbid_nonstring_types(["bytes"]) def startswith( - self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the start of each string element matches a pattern. @@ -2598,10 +2601,11 @@ def startswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- @@ -2656,7 +2660,7 @@ def startswith( @forbid_nonstring_types(["bytes"]) def endswith( - self, pat: str | tuple[str, ...], na: Scalar | None = lib.no_default + self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default ) -> Series | Index: """ Test if the end of each string element matches a pattern. @@ -2668,10 +2672,11 @@ def endswith( pat : str or tuple[str, ...] Character sequence or tuple of strings. Regular expressions are not accepted. - na : object, default NaN + na : scalar, optional Object shown if element tested is not a string. The default depends on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. + For the nullable ``StringDtype``, ``pandas.NA`` is used. + For the ``"str"`` dtype, ``False`` is used. Returns ------- From ef05ade2a2338e806476dfdb2d78d2944a368f4a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 08:28:11 +0200 Subject: [PATCH 05/13] more type annotations --- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/string_arrow.py | 1 + pandas/core/strings/base.py | 10 +++++++--- pandas/core/strings/object_array.py | 8 ++++++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 358ad7091839a..243c76321a841 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2383,7 +2383,7 @@ def _str_match( pat: str, case: bool = True, flags: int = 0, - na: Scalar | None = lib.no_default, + na: Scalar | lib.NoDefault = lib.no_default, ) -> Self: if not pat.startswith("^"): pat = f"^{pat}" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b0bf49415f16e..fac9972d3cccd 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -223,6 +223,7 @@ def insert(self, loc: int, item) -> ArrowStringArray: def _predicate_result_converter(self, values, na=lib.no_default): if self.dtype.na_value is np.nan: + na_value: bool | float | lib.NoDefault if na is lib.no_default: na_value = False elif not isna(na): diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 1281a03e297f9..85bb716f7c3e4 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -6,7 +6,7 @@ Literal, ) -import numpy as np +from pandas._libs import lib if TYPE_CHECKING: from collections.abc import ( @@ -88,7 +88,11 @@ def _str_repeat(self, repeats: int | Sequence[int]): @abc.abstractmethod def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): pass @@ -98,7 +102,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar = np.nan, + na: Scalar | lib.NoDefault = lib.no_default, ): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c150a652d19c3..2cd837c2e8534 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -220,7 +220,11 @@ def rep(x, r): return type(self)._from_sequence(result, dtype=self.dtype) def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + self, + pat: str, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE @@ -235,7 +239,7 @@ def _str_fullmatch( pat: str | re.Pattern, case: bool = True, flags: int = 0, - na: Scalar | None = None, + na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE From b9612fcc1060db1f47ba8f0c3f03ee14607b39dc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 11:20:38 +0200 Subject: [PATCH 06/13] test and fix startswith/endswith --- pandas/core/arrays/string_.py | 11 +++---- pandas/core/arrays/string_arrow.py | 14 +++++---- pandas/tests/strings/test_find_replace.py | 37 ++++++++++++++++++----- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index e4364b1ef507f..599373823bab5 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -453,7 +453,8 @@ def _str_map_nan_semantics( if is_integer_dtype(dtype): na_value = 0 else: - na_value = True + # NaN propagates as False + na_value = False result = lib.map_infer_mask( arr, @@ -463,15 +464,13 @@ def _str_map_nan_semantics( na_value=na_value, dtype=np.dtype(cast(type, dtype)), ) - if na_value_is_na and mask.any(): + if na_value_is_na and is_integer_dtype(dtype) and mask.any(): # TODO: we could alternatively do this check before map_infer_mask # and adjust the dtype/na_value we pass there. Which is more # performant? - if is_integer_dtype(dtype): - result = result.astype("float64") - else: - result = result.astype("object") + result = result.astype("float64") result[mask] = np.nan + return result else: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fac9972d3cccd..27b40ec2686da 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -223,14 +223,16 @@ def insert(self, loc: int, item) -> ArrowStringArray: def _predicate_result_converter(self, values, na=lib.no_default): if self.dtype.na_value is np.nan: - na_value: bool | float | lib.NoDefault + na_value: bool | lib.NoDefault if na is lib.no_default: na_value = False - elif not isna(na): - values = values.fill_null(bool(na)) + elif isna(na): + # NaN propagates as False + values = values.fill_null(False) na_value = lib.no_default else: - na_value = np.nan + values = values.fill_null(bool(na)) + na_value = lib.no_default return ArrowExtensionArray(values).to_numpy(na_value=na_value) return BooleanDtype().__from_arrow__(values) @@ -336,7 +338,7 @@ def _str_startswith( and not isna(na) ): result = result.fill_null(na) - return self._predicate_result_converter(result) + return self._predicate_result_converter(result, na=na) def _str_endswith( self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default @@ -361,7 +363,7 @@ def _str_endswith( and not isna(na) ): result = result.fill_null(na) - return self._predicate_result_converter(result) + return self._predicate_result_converter(result, na=na) def _str_replace( self, diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 7c30d5db09607..1b128120e2965 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -311,20 +311,31 @@ def test_startswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_startswith_nullable_string_dtype(nullable_string_dtype, na): +def test_startswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.startswith("foo", na=na) + + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, True, False, False, na, True, False, False], dtype="boolean" + [False, na, True, False, False, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.startswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) @@ -369,20 +380,30 @@ def test_endswith(pat, dtype, null_value, na): @pytest.mark.parametrize("na", [None, True, False]) -def test_endswith_nullable_string_dtype(nullable_string_dtype, na): +def test_endswith_string_dtype(any_string_dtype, na): values = Series( ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."], - dtype=nullable_string_dtype, + dtype=any_string_dtype, ) result = values.str.endswith("foo", na=na) + expected_dtype = ( + (object if na is None else bool) + if is_object_or_nan_string_dtype(any_string_dtype) + else "boolean" + ) + if any_string_dtype == "str": + # NaN propagates as False + expected_dtype = bool + if na is None: + na = False exp = Series( - [False, na, False, False, True, na, True, False, False], dtype="boolean" + [False, na, False, False, True, na, True, False, False], dtype=expected_dtype ) tm.assert_series_equal(result, exp) result = values.str.endswith("rege.", na=na) exp = Series( - [False, na, False, False, False, na, False, False, True], dtype="boolean" + [False, na, False, False, False, na, False, False, True], dtype=expected_dtype ) tm.assert_series_equal(result, exp) From cf242a2b5bef277a4e45cccc57d5299b4c2aaa2c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 27 Aug 2024 11:29:59 +0200 Subject: [PATCH 07/13] test ismethods --- pandas/tests/strings/test_strings.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index a8aa72ebc3351..d24fddbc5905c 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -217,8 +217,21 @@ def test_ismethods(method, expected, any_string_dtype): tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + expected_stdlib = [getattr(item, method)() for item in ser] + assert list(result) == expected_stdlib + + # with missing value + ser.iloc[[1, 2, 3, 4]] = np.nan + result = getattr(ser.str, method)() + if ser.dtype == "object": + expected = expected.astype(object) + expected.iloc[[1, 2, 3, 4]] = np.nan + elif ser.dtype == "str": + # NaN propagates as False + expected.iloc[[1, 2, 3, 4]] = False + else: + # nullable dtypes propagate NaN + expected.iloc[[1, 2, 3, 4]] = np.nan @pytest.mark.parametrize( From ad0d6e1e15f46d7f477cedbf5c1b2de07809f85d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 31 Aug 2024 20:00:45 +0200 Subject: [PATCH 08/13] fix warnings --- pandas/core/arrays/_arrow_string_mixins.py | 6 +++--- pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/string_arrow.py | 12 ++++++++++-- pandas/tests/strings/test_find_replace.py | 3 ++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 0597c2b9dc4d7..0791faa07ebbf 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -34,7 +34,7 @@ class ArrowStringArrayMixin: def __init__(self, *args, **kwargs) -> None: raise NotImplementedError - def _convert_bool_result(self, result, na=lib.no_default): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): # Convert a bool-dtype result to the appropriate result type raise NotImplementedError @@ -130,7 +130,7 @@ def _str_startswith( and not isna(na) ): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) - return self._convert_bool_result(result, na=na) + return self._convert_bool_result(result, na=na, method_name="startswith") def _str_endswith( self, pat: str | tuple[str, ...], na: Scalar | lib.NoDefault = lib.no_default @@ -153,4 +153,4 @@ def _str_endswith( and not isna(na) ): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) - return self._convert_bool_result(result, na=na) + return self._convert_bool_result(result, na=na, method_name="endswith") diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 7eff347823b20..c5447953809ba 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2311,7 +2311,7 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: for chunk in self._pa_array.iterchunks() ] - def _convert_bool_result(self, result, na=lib.no_default): + def _convert_bool_result(self, result, na=lib.no_default, method_name=None): return type(self)(result) def _convert_int_result(self, result): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6fc507e8a5444..501e90448251b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -223,7 +223,7 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values, na=lib.no_default): + def _convert_bool_result(self, values, na=lib.no_default, method_name=None): if self.dtype.na_value is np.nan: na_value: bool | lib.NoDefault if na is lib.no_default: @@ -233,6 +233,14 @@ def _convert_bool_result(self, values, na=lib.no_default): values = values.fill_null(False) na_value = lib.no_default else: + if not isinstance(na, bool): + # GH#59561 + warnings.warn( + f"Allowing a non-bool 'na' in obj.str.{method_name} is " + "deprecated and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) values = values.fill_null(bool(na)) na_value = lib.no_default return ArrowExtensionArray(values).to_numpy(na_value=na_value) @@ -310,7 +318,7 @@ def _str_contains( result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._convert_bool_result(result, na=na) + result = self._convert_bool_result(result, na=na, method_name="contains") if ( self.dtype.na_value is libmissing.NA and na is not lib.no_default diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index a63b23ed40f9e..65cf19ab5da01 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -299,7 +299,8 @@ def test_startswith_endswith_validate_na(any_string_dtype): dtype = ser.dtype if ( - isinstance(dtype, pd.StringDtype) and dtype.storage == "python" + isinstance(dtype, pd.StringDtype) + and (dtype.storage == "python" or dtype.na_value is np.nan) ) or dtype == np.dtype("object"): msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): From bf02000a5aecb736dbd675df1828ea700e62fae6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Sep 2024 08:54:02 +0200 Subject: [PATCH 09/13] try fix typing --- pandas/core/arrays/_arrow_string_mixins.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 0791faa07ebbf..071240b049e09 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -27,9 +27,12 @@ Self, ) + from pandas.core.dtypes.dtypes import ExtensionDtype + class ArrowStringArrayMixin: _pa_array: Sized + dtype: ExtensionDtype def __init__(self, *args, **kwargs) -> None: raise NotImplementedError From 377ff3aaf605f39198a7fd1e81cfb0d2080fac2c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 18:21:55 +0200 Subject: [PATCH 10/13] follow same behaviour for categorical[str] --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- pandas/core/arrays/categorical.py | 16 ++++++++++++---- pandas/core/arrays/string_.py | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index a9cea627e988a..f8c2b4a78b06a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -223,7 +223,7 @@ def _str_contains( pat, case: bool = True, flags: int = 0, - na=lib.no_default, + na: Scalar | lib.NoDefault = lib.no_default, regex: bool = True, ): if flags: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c613a345686cc..48ba608243e0b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2669,16 +2669,24 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: # ------------------------------------------------------------------------ # String methods interface def _str_map( - self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + self, f, na_value=lib.no_default, dtype=np.dtype("object"), convert: bool = True ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. - from pandas.core.arrays import NumpyExtensionArray - categories = self.categories codes = self.codes - result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) + if categories.dtype == "string": + result = categories.array._str_map(f, na_value, dtype) + if categories.dtype.na_value is np.nan: + # NaN propagates as False + na_value = False + else: + from pandas.core.arrays import NumpyExtensionArray + + result = NumpyExtensionArray(categories.to_numpy())._str_map( + f, na_value, dtype + ) return take_nd(result, codes, fill_value=na_value) def _str_get_dummies(self, sep: str = "|"): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2a0498932ce21..1e9b07d37cf2e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -440,6 +440,7 @@ def _str_map_nan_semantics( dtype = self.dtype if na_value is lib.no_default: if is_bool_dtype(dtype): + # NaN propagates as False na_value = False else: na_value = self.dtype.na_value From 2dfd50bb52081466512a23624f66a1bd6a254691 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 19:11:53 +0200 Subject: [PATCH 11/13] simplify fill_null calls for string[pyarrow] case --- pandas/core/arrays/_arrow_string_mixins.py | 28 +------------ pandas/core/arrays/arrow/array.py | 2 + pandas/core/arrays/string_arrow.py | 47 ++++++++-------------- pandas/tests/strings/test_find_replace.py | 5 +-- 4 files changed, 20 insertions(+), 62 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index f8c2b4a78b06a..22b1d84e2373e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -9,18 +9,13 @@ import numpy as np -from pandas._libs import ( - lib, - missing as libmissing, -) +from pandas._libs import lib from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, pa_version_under17p0, ) -from pandas.core.dtypes.missing import isna - if not pa_version_under10p1: import pyarrow as pa import pyarrow.compute as pc @@ -36,12 +31,9 @@ Self, ) - from pandas.core.dtypes.dtypes import ExtensionDtype - class ArrowStringArrayMixin: _pa_array: Sized - dtype: ExtensionDtype def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -151,12 +143,6 @@ def _str_startswith( for p in pat[1:]: result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) - if ( - self.dtype.na_value is libmissing.NA - and na is not lib.no_default - and not isna(na) - ): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) return self._convert_bool_result(result, na=na, method_name="startswith") def _str_endswith( @@ -174,12 +160,6 @@ def _str_endswith( for p in pat[1:]: result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) - if ( - self.dtype.na_value is libmissing.NA - and na is not lib.no_default - and not isna(na) - ): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) return self._convert_bool_result(result, na=na, method_name="endswith") def _str_isalnum(self): @@ -234,12 +214,6 @@ def _str_contains( else: pa_contains = pc.match_substring result = pa_contains(self._pa_array, pat, ignore_case=not case) - if ( - self.dtype.na_value is libmissing.NA - and na is not lib.no_default - and not isna(na) - ): # pyright: ignore [reportGeneralTypeIssues] - result = result.fill_null(na) return self._convert_bool_result(result, na=na, method_name="contains") def _str_find(self, sub: str, start: int = 0, end: int | None = None): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c08c84cbdfeda..2309ebbe2b25e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2312,6 +2312,8 @@ def _apply_elementwise(self, func: Callable) -> list[list[Any]]: ] def _convert_bool_result(self, result, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) return type(self)(result) def _convert_int_result(self, result): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e10c0134b6348..f3763f2a75aa8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -224,26 +224,26 @@ def insert(self, loc: int, item) -> ArrowStringArray: return super().insert(loc, item) def _convert_bool_result(self, values, na=lib.no_default, method_name=None): + if na is not lib.no_default and not isna(na) and not isinstance(na, bool): + # GH#59561 + warnings.warn( + f"Allowing a non-bool 'na' in obj.str.{method_name} is deprecated " + "and will raise in a future version.", + FutureWarning, + stacklevel=find_stack_level(), + ) + na = bool(na) + if self.dtype.na_value is np.nan: - na_value: bool | lib.NoDefault - if na is lib.no_default: - na_value = False - elif isna(na): + if na is lib.no_default or isna(na): # NaN propagates as False values = values.fill_null(False) - na_value = lib.no_default else: - if not isinstance(na, bool): - # GH#59561 - warnings.warn( - f"Allowing a non-bool 'na' in obj.str.{method_name} is " - "deprecated and will raise in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - values = values.fill_null(bool(na)) - na_value = lib.no_default - return ArrowExtensionArray(values).to_numpy(na_value=na_value) + values = values.fill_null(na) + return values.to_numpy(zero_copy_only=False) + else: + if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues] + values = values.fill_null(na) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -325,21 +325,6 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if ( - self.dtype.na_value is libmissing.NA - and na is not lib.no_default - and not isna(na) - ): - if not isinstance(na, bool): - # GH#59561 - warnings.warn( - "Allowing a non-bool 'na' in obj.str.contains is deprecated " - "and will raise in a future version.", - FutureWarning, - stacklevel=find_stack_level(), - ) - na = bool(na) - return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 65cf19ab5da01..79c1c0c182493 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -298,10 +298,7 @@ def test_startswith_endswith_validate_na(any_string_dtype): ) dtype = ser.dtype - if ( - isinstance(dtype, pd.StringDtype) - and (dtype.storage == "python" or dtype.na_value is np.nan) - ) or dtype == np.dtype("object"): + if (isinstance(dtype, pd.StringDtype)) or dtype == np.dtype("object"): msg = "Allowing a non-bool 'na' in obj.str.startswith is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): ser.str.startswith("kapow", na="baz") From adf2b9953e039f0fe2ea88a393d188f15d221ebd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 19:41:19 +0200 Subject: [PATCH 12/13] fix na_value handling for categorical case + update tests for expected categorical behaviour --- pandas/core/arrays/categorical.py | 8 ++++++-- pandas/tests/strings/test_api.py | 9 ++++++++- pandas/tests/strings/test_find_replace.py | 8 ++++++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 48ba608243e0b..4776d4a04a939 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2678,8 +2678,12 @@ def _str_map( codes = self.codes if categories.dtype == "string": result = categories.array._str_map(f, na_value, dtype) - if categories.dtype.na_value is np.nan: - # NaN propagates as False + if ( + categories.dtype.na_value is np.nan + and is_bool_dtype(dtype) + and (na_value is lib.no_default or isna(na_value)) + ): + # NaN propagates as False for functions with boolean return type na_value = False else: from pandas.core.arrays import NumpyExtensionArray diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py index 2511474e03ff7..4a1b97606db2b 100644 --- a/pandas/tests/strings/test_api.py +++ b/pandas/tests/strings/test_api.py @@ -122,6 +122,7 @@ def test_api_per_method( any_allowed_skipna_inferred_dtype, any_string_method, request, + using_infer_string, ): # this test does not check correctness of the different methods, # just that the methods work on the specified (inferred) dtypes, @@ -160,6 +161,10 @@ def test_api_per_method( t = box(values, dtype=dtype) # explicit dtype to avoid casting method = getattr(t.str, method_name) + if using_infer_string and dtype == "category": + string_allowed = method_name not in ["decode"] + else: + string_allowed = True bytes_allowed = method_name in ["decode", "get", "len", "slice"] # as of v0.23.4, all methods except 'cat' are very lenient with the # allowed data types, just returning NaN for entries that error. @@ -168,7 +173,8 @@ def test_api_per_method( mixed_allowed = method_name not in ["cat"] allowed_types = ( - ["string", "unicode", "empty"] + ["empty"] + + ["string", "unicode"] * string_allowed + ["bytes"] * bytes_allowed + ["mixed", "mixed-integer"] * mixed_allowed ) @@ -182,6 +188,7 @@ def test_api_per_method( msg = ( f"Cannot use .str.{method_name} with values of " f"inferred dtype {inferred_dtype!r}." + "|a bytes-like object is required, not 'str'" ) with pytest.raises(TypeError, match=msg): method(*args, **kwargs) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 79c1c0c182493..408fc2028b10a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -320,7 +320,7 @@ def test_startswith_endswith_validate_na(any_string_dtype): @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_startswith(pat, dtype, null_value, na): +def test_startswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -334,6 +334,8 @@ def test_startswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.startswith(pat, na=na) @@ -389,7 +391,7 @@ def test_startswith_string_dtype(any_string_dtype, na): @pytest.mark.parametrize("dtype", ["object", "category"]) @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) @pytest.mark.parametrize("na", [True, False]) -def test_endswith(pat, dtype, null_value, na): +def test_endswith(pat, dtype, null_value, na, using_infer_string): # add category dtype parametrizations for GH-36241 values = Series( ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], @@ -403,6 +405,8 @@ def test_endswith(pat, dtype, null_value, na): exp = exp.fillna(null_value) elif dtype == "object" and null_value is None: exp[exp.isna()] = None + elif using_infer_string and dtype == "category": + exp = exp.fillna(False).astype(bool) tm.assert_series_equal(result, exp) result = values.str.endswith(pat, na=na) From ddd531a978b14ec72fdb8b0c0e728de43e907ef6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 20:16:29 +0200 Subject: [PATCH 13/13] fix typing + fix conversion for old pyarrow --- pandas/core/arrays/categorical.py | 4 ++-- pandas/core/arrays/string_arrow.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4776d4a04a939..269d88baa6422 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2677,9 +2677,9 @@ def _str_map( categories = self.categories codes = self.codes if categories.dtype == "string": - result = categories.array._str_map(f, na_value, dtype) + result = categories.array._str_map(f, na_value, dtype) # type: ignore[attr-defined] if ( - categories.dtype.na_value is np.nan + categories.dtype.na_value is np.nan # type: ignore[union-attr] and is_bool_dtype(dtype) and (na_value is lib.no_default or isna(na_value)) ): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f3763f2a75aa8..129272ecb64f7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -240,7 +240,7 @@ def _convert_bool_result(self, values, na=lib.no_default, method_name=None): values = values.fill_null(False) else: values = values.fill_null(na) - return values.to_numpy(zero_copy_only=False) + return values.to_numpy() else: if na is not lib.no_default and not isna(na): # pyright: ignore [reportGeneralTypeIssues] values = values.fill_null(na)