diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 32fbf4e6c7de3..3dbc74b1941d2 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -3,10 +3,12 @@
 import numpy as np
 
 from pandas import (
+    NA,
     Categorical,
     DataFrame,
     Series,
 )
+from pandas.core.arrays import StringArray
 
 from .pandas_vb_common import tm
 
@@ -285,3 +287,18 @@ class Iter(Dtypes):
     def time_iter(self, dtype):
         for i in self.s:
             pass
+
+
+class StringArrayConstruction:
+    def setup(self):
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
+
+    def time_string_array_construction(self):
+        StringArray(self.series_arr)
+
+    def time_string_array_with_nan_construction(self):
+        StringArray(self.series_arr_nan)
+
+    def peakmem_stringarray_construction(self):
+        StringArray(self.series_arr)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index ccad93d83eb5b..d015c7fa39e83 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -459,6 +459,7 @@ Other API changes
 - Change in the position of the ``min_rows`` argument in :meth:`DataFrame.to_string` due to change in the docstring (:issue:`44304`)
 - Reduction operations for :class:`DataFrame` or :class:`Series` now raising a ``ValueError`` when ``None`` is passed for ``skipna`` (:issue:`44178`)
 - :func:`read_csv` and :func:`read_html` no longer raising an error when one of the header rows consists only of ``Unnamed:`` columns (:issue:`13054`)
+- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
 - Changed the ``name`` attribute of several holidays in
   ``USFederalHolidayCalendar`` to match `official federal holiday
   names <https://www.opm.gov/policy-data-oversight/pay-leave/federal-holidays/>`_
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index a7ebd9d0c77ad..5f6d8c8071f4a 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -150,7 +150,7 @@ def maybe_convert_numeric(
 def ensure_string_array(
     arr,
     na_value: object = ...,
-    convert_na_value: bool = ...,
+    coerce: str | None = ...,
     copy: bool = ...,
     skipna: bool = ...,
 ) -> npt.NDArray[np.object_]: ...
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 8f9016e726f1e..e373c8a584913 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -99,6 +99,7 @@ from pandas._libs.missing cimport (
     is_null_timedelta64,
     isnaobj,
 )
+from pandas._libs.missing import checknull
 from pandas._libs.tslibs.conversion cimport convert_to_tsobject
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -670,12 +671,25 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
     return result
 
 
+ctypedef enum coerce_options:
+    all = 0
+    strict_null = 1
+    null = 2
+    non_null = 3
+    none = 4
+
+
+def strict_check_null(x):
+    # Cython doesn't let me define this in ensure_string_array :(
+    return x is None or x is C_NA or util.is_nan(x)
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
-        bint convert_na_value=True,
+        coerce="all",
         bint copy=True,
         bint skipna=True,
 ):
@@ -688,8 +702,16 @@ cpdef ndarray[object] ensure_string_array(
         The values to be converted to str, if needed.
     na_value : Any, default np.nan
         The value to use for na. For example, np.nan or pd.NA.
-    convert_na_value : bool, default True
-        If False, existing na values will be used unchanged in the new array.
+    coerce : {'all', 'strict-null', 'null', 'non-null', None}, default 'all'
+        Whether to coerce non-string elements to strings.
+        - 'all' will convert all non-string values.
+        - 'strict-null' will only convert pd.NA, np.nan, or None to na_value
+          raising when encountering non-strings and other null values.
+        - 'null' will convert nulls to na_value w/out converting other non-strings.
+        - 'non-null' will only convert non-null non-string elements to string.
+        - None will not convert anything.
+        If coerce is not 'all' or 'non-null', a ValueError will be raised for
+        non-string values that are not recognized as null.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -699,10 +721,46 @@ cpdef ndarray[object] ensure_string_array(
     Returns
     -------
     np.ndarray[object]
-        An array with the input array's elements casted to str or nan-like.
+        An array of strings and na_value.
+
+    Raises
+    ------
+    ValueError
+        If an element is encountered that is not a string or valid NA value
+        and element is not coerced.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> ensure_string_array(np.array([1,2,3, np.datetime64("nat")]), coerce="all")
+    array(['1', '2', '3', nan], dtype=object)
+    >>> ensure_string_array(np.array([pd.NA, "a", None]), coerce="strict-null")
+    array([nan, 'a', nan], dtype=object)
+    >>> ensure_string_array(np.array([pd.NaT, "1"]), coerce="null")
+    array([nan, '1'], dtype=object)
+    >>> ensure_string_array(np.array([1,2,3]), coerce="non-null")
+    array(['1', '2', '3'], dtype=object)
+    >>> ensure_string_array(np.array(["1", "2", "3"]), coerce=None)
+    array(['1', '2', '3'], dtype=object)
     """
     cdef:
         Py_ssize_t i = 0, n = len(arr)
+        coerce_options coerce_val
+
+    if coerce == "all":
+        coerce_val = all
+    elif coerce == "strict-null":
+        coerce_val = strict_null
+    elif coerce == "null":
+        coerce_val = null
+    elif coerce == "non-null":
+        coerce_val = non_null
+    elif coerce is None:
+        coerce_val = none
+    else:
+        raise ValueError("coerce argument must be one of "
+                         f"'all'|'strict-null'|'null'|'non-null'|None, not {coerce}")
 
     if hasattr(arr, "to_numpy"):
 
@@ -722,21 +780,34 @@ cpdef ndarray[object] ensure_string_array(
     if copy and result is arr:
         result = result.copy()
 
+    if coerce_val == strict_null:
+        # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
+        # If they are present, they are treated like a regular Python object
+        # and will either cause an exception to be raised or be coerced.
+        check_null = strict_check_null
+    else:
+        check_null = checknull
+
     for i in range(n):
         val = arr[i]
 
         if isinstance(val, str):
             continue
 
-        if not checknull(val):
-            if not isinstance(val, np.floating):
-                # f"{val}" is faster than str(val)
-                result[i] = f"{val}"
+        if not check_null(val):
+            if coerce_val == all or coerce_val == non_null:
+                if not isinstance(val, np.floating):
+                    # f"{val}" is faster than str(val)
+                    result[i] = f"{val}"
+                else:
+                    # f"{val}" is not always equivalent to str(val) for floats
+                    result[i] = str(val)
             else:
-                # f"{val}" is not always equivalent to str(val) for floats
-                result[i] = str(val)
+                raise ValueError(f"Element {val} is not a string or valid null. "
+                                 "If you want it to be coerced to a string, "
+                                 "specify coerce='all'")
         else:
-            if convert_na_value:
+            if coerce_val != non_null and coerce_val != none:
                 val = na_value
             if skipna:
                 result[i] = val
@@ -1881,8 +1952,8 @@ cdef class StringValidator(Validator):
         return issubclass(self.dtype.type, np.str_)
 
     cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
+        # Override to exclude float('Nan') and complex NaN
+        return value is None or value is C_NA or value is np.nan
 
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c6987d9a11e4c..8fe5343e471ae 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -246,11 +246,18 @@ class StringArray(BaseStringArray, PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes (``None``, ``np.nan``, ``NA``).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.4.0
+
+           StringArray now accepts nan-likes (``None``, ``np.nan``) for the
+           ``values`` parameter in its constructor
+           in addition to strings and :attr:`pandas.NA`.
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -310,6 +317,8 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
@@ -318,16 +327,27 @@ def __init__(self, values, copy=False):
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            lib.ensure_string_array(
+                self._ndarray.ravel("K"),
+                na_value=StringDtype.na_value,
+                coerce="strict-null",
+                copy=False,
+            )
+        except ValueError as err:
+            raise ValueError(
+                "StringArray requires a sequence of strings or pandas.NA"
+            ) from err
 
     @classmethod
-    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
+    def _from_sequence(
+        cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
+    ):
         if dtype and not (isinstance(dtype, str) and dtype == "string"):
             dtype = pandas_dtype(dtype)
             assert isinstance(dtype, StringDtype) and dtype.storage == "python"
@@ -336,15 +356,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
 
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             result[na_values] = StringDtype.na_value
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+            if coerce:
+                coerce = "all"
+            else:
+                coerce = "strict-null"
             result = lib.ensure_string_array(
-                scalars, na_value=StringDtype.na_value, copy=copy
+                scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
             )
 
         # Manually creating new array avoids the validation step in the __init__, so is
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 53fc38a973110..7a4ca57bf14ef 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -153,7 +153,9 @@ def __init__(self, values):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
+    def _from_sequence(
+        cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
+    ):
         from pandas.core.arrays.masked import BaseMaskedArray
 
         _chk_pyarrow_available()
@@ -167,11 +169,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             # numerical issues with Float32Dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
         # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=copy)
+        if coerce:
+            coerce = "all"
+        else:
+            coerce = "strict-null"
+        result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))
 
     @classmethod
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index cf8cd070ec562..2643625aa31b4 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -754,7 +754,7 @@ def _try_cast(
 
     elif dtype.kind == "U":
         # TODO: test cases with arr.dtype.kind in ["m", "M"]
-        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)
+        return lib.ensure_string_array(arr, coerce="non-null", copy=copy)
 
     elif dtype.kind in ["m", "M"]:
         return maybe_cast_to_datetime(arr, dtype)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index b70ea9f816aef..386fb4744ceb3 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1122,7 +1122,7 @@ def astype_nansafe(
         return arr.astype(dtype, copy=copy)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
+        return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")
 
     elif is_datetime64_dtype(arr):
         # Non-overlapping equality check (left operand type: "dtype[Any]", right
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 7c3a8c691b786..10ff1c12d6fa8 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -2,15 +2,19 @@
 This module tests the functionality of StringArray and ArrowStringArray.
 Tests for the str accessors are in pandas/tests/strings/test_string_array.py
 """
+from decimal import Decimal
+
 import numpy as np
 import pytest
 
+import pandas._libs.lib as lib
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_dtype_equal
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.core.arrays import BaseMaskedArray
 from pandas.core.arrays.string_arrow import ArrowStringArray
 
 
@@ -267,13 +271,63 @@ def test_constructor_raises(cls):
         cls(np.array([]))
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
+        cls(np.array(["a", pd.NaT], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, np.float64("nan"), float("nan"), None, pd.NA])
+def test_constructor_nan_like(na):
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+    tm.assert_extension_array_equal(
+        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    )
+
+
+def test_invalid_coerce_raises():
+    data = np.array(["a", "b"], dtype=object)
+    with pytest.raises(
+        ValueError,
+        match="coerce argument must be one of "
+        r"'all'\|'strict-null'\|'null'\|'non-null'\|None, not abcd",
+    ):
+        lib.ensure_string_array(data, coerce="abcd")
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        np.array(["foo", "bar", pd.NA], dtype=object),
+        np.array(["foo", "bar", np.nan], dtype=object),
+        np.array(["foo", "bar", None], dtype=object),
+        np.array(["foo", "bar", float("nan")], dtype=object),
+        np.array(["foo", "bar", np.float64("nan")], dtype=object),
+        BaseMaskedArray(
+            np.array(["foo", "bar", "garbage"]), np.array([False, False, True])
+        ),
+    ],
+)
+def test_from_sequence_no_coerce(cls, values):
+    expected = pd.arrays.StringArray(np.array(["foo", "bar", pd.NA], dtype=object))
+    result = cls._from_sequence(values, coerce=False)
+    # Use bare assert since classes are different
+    assert (result == expected).all()
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        np.array(["foo", "bar", pd.NaT], dtype=object),
+        np.array(["foo", "bar", np.datetime64("nat")], dtype=object),
+        np.array(["foo", "bar", Decimal("nan")], dtype=object),
+    ],
+)
+def test_from_sequence_no_coerce_invalid(cls, values):
+    with pytest.raises(
+        ValueError,
+        match="Element .* is not a string or valid null. "
+        "If you want it to be coerced to a string, "
+        "specify coerce='all'",
+    ):
+        cls._from_sequence(values, coerce=False)
 
 
 @pytest.mark.parametrize("copy", [True, False])
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 7953d650636be..200a8c599737c 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1542,11 +1542,18 @@ def test_is_string_array(self):
         assert lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
         )
-        # NaN is not valid for string array, just NA
-        assert not lib.is_string_array(
+        assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
-
+        assert lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=False
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=False
+        )
         assert not lib.is_string_array(np.array([1, 2]))
 
     def test_to_object_array_tuples(self):