diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8cb4b3f24d435..3b363d38e18cb 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -105,6 +105,37 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv Implementation Status `_ documentation. +.. _whatsnew_220.enhancements.to_numpy_ea: + +ExtensionArray.to_numpy converts to suitable NumPy dtype +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`ExtensionArray.to_numpy`` will now convert to a suitable NumPy dtype instead +of ``object`` dtype for nullable extension dtypes. + +*Old behavior:* + +.. code-block:: ipython + + In [1]: ser = pd.Series([1, 2, 3], dtype="Int64") + In [2]: ser.to_numpy() + Out[2]: array([1, 2, 3], dtype=object) + +*New behavior:* + +.. ipython:: python + + ser = pd.Series([1, 2, 3], dtype="Int64") + ser.to_numpy() + +The default NumPy dtype (without any arguments) is determined as follows: + +- float dtypes are cast to NumPy floats +- integer dtypes without missing values are cast to NumPy integer dtypes +- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as missing value indicator +- boolean dtypes without missing values are cast to NumPy bool dtype +- boolean dtypes with missing values keep object dtype + .. _whatsnew_220.enhancements.struct_accessor: Series.struct accessor to with PyArrow structured data diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index ab28b34be58f2..0f78080a3daf4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1477,7 +1477,7 @@ def _maybe_upcast( import pyarrow as pa if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow - arr = arr.to_numpy() + arr = arr.to_numpy(na_value=None) arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) return arr diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 58909643ed46a..201ce44ed0163 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -38,11 +38,15 @@ IS64, is_platform_windows, ) -from pandas.errors import AbstractMethodError +from pandas.errors import ( + AbstractMethodError, + LossySetitemError, +) from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.cast import np_can_hold_element from pandas.core.dtypes.common import ( is_bool, is_integer_dtype, @@ -69,6 +73,7 @@ from pandas.core.algorithms import ( factorize_array, isin, + map_array, mode, take, ) @@ -473,13 +478,35 @@ def to_numpy( >>> a.to_numpy(dtype="bool", na_value=False) array([ True, False, False]) """ - if na_value is lib.no_default: - na_value = libmissing.NA + hasna = self._hasna + if dtype is None: - dtype = object + dtype_given = False + if hasna: + if self.dtype.kind == "b": + dtype = object + else: + if self.dtype.kind in "iu": + dtype = np.dtype(np.float64) + else: + dtype = self.dtype.numpy_dtype + if na_value is lib.no_default: + na_value = np.nan + else: + dtype = self.dtype.numpy_dtype else: dtype = np.dtype(dtype) - if self._hasna: + dtype_given = True + if na_value is lib.no_default: + na_value = libmissing.NA + + if not dtype_given and hasna: + try: + np_can_hold_element(dtype, na_value) # type: ignore[arg-type] + except LossySetitemError: + dtype = object + + if hasna: if ( dtype != object and not is_string_dtype(dtype) @@ -506,7 +533,7 @@ def tolist(self): if self.ndim > 1: 
return [x.tolist() for x in self] dtype = None if self._hasna else self._data.dtype - return self.to_numpy(dtype=dtype).tolist() + return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist() @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: @@ -1300,6 +1327,9 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): ) return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) + def map(self, mapper, na_action=None): + return map_array(self.to_numpy(), mapper, na_action=None) + def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ Return whether any element is truthy. diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 3295c4741c03d..7bd4851425c3b 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -9,10 +9,17 @@ import numpy as np +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import maybe_box_native -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, + ExtensionDtype, +) from pandas.core import common as com @@ -150,6 +157,10 @@ def to_dict( for i, col_dtype in enumerate(df.dtypes.values) if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype) ] + box_na_values = [ + lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA + for i, col_dtype in enumerate(df.dtypes.values) + ] are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) if orient == "dict": @@ -160,7 +171,11 @@ def to_dict( return into_c( ( k, - list(map(maybe_box_native, v.to_numpy().tolist())) + list( + map( + maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist() + ) + ) if i in object_dtype_indices_as_set else v.to_numpy().tolist(), ) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index bb2fd06d98e1d..ecb09aa6cacf1 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -61,6 +61,7 @@ ) from pandas.core.arrays import ( + BaseMaskedArray, Categorical, DatetimeArray, ExtensionArray, @@ -1527,6 +1528,8 @@ def _format_strings(self) -> list[str]: if isinstance(values, Categorical): # Categorical is special for now, so that we can preserve tzinfo array = values._internal_get_values() + elif isinstance(values, BaseMaskedArray): + array = values.to_numpy(dtype=object) else: array = np.asarray(values) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 4fb97779c690e..3ceb798a7f5ca 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -63,6 +63,7 @@ from pandas.core import algorithms from pandas.core.arrays import ( ArrowExtensionArray, + BaseMaskedArray, BooleanArray, Categorical, ExtensionArray, @@ -762,8 +763,15 @@ def _infer_types( pa = import_optional_dependency("pyarrow") if isinstance(result, np.ndarray): result = ArrowExtensionArray(pa.array(result, from_pandas=True)) + elif isinstance(result, BaseMaskedArray): + if result._mask.all(): + # We want an arrow null array here + result = ArrowExtensionArray(pa.array([None] * len(result))) + else: + result = ArrowExtensionArray( + pa.array(result._data, mask=result._mask) + ) else: - # ExtensionArray result = ArrowExtensionArray( pa.array(result.to_numpy(), from_pandas=True) ) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py 
index d26eea19c06e9..6459b315c684d 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -223,7 +223,7 @@ def test_coerce_to_numpy_array(): # also with no missing values -> object dtype arr = pd.array([True, False, True], dtype="boolean") result = np.array(arr) - expected = np.array([True, False, True], dtype="object") + expected = np.array([True, False, True], dtype="bool") tm.assert_numpy_array_equal(result, expected) # force bool dtype @@ -263,7 +263,7 @@ def test_to_numpy(box): # default (with or without missing values) -> object dtype arr = con([True, False, True], dtype="boolean") result = arr.to_numpy() - expected = np.array([True, False, True], dtype="object") + expected = np.array([True, False, True], dtype="bool") tm.assert_numpy_array_equal(result, expected) arr = con([True, False, None], dtype="boolean") diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index 2ed52439adf53..a25ac40cb3e7c 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -13,12 +13,12 @@ def test_to_numpy(box): # default (with or without missing values) -> object dtype arr = con([0.1, 0.2, 0.3], dtype="Float64") result = arr.to_numpy() - expected = np.array([0.1, 0.2, 0.3], dtype="object") + expected = np.array([0.1, 0.2, 0.3], dtype="float64") tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") result = arr.to_numpy() - expected = np.array([0.1, 0.2, pd.NA], dtype="object") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index fd38a8df7fa22..e3848cdfe3aa9 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -141,7 +141,7 @@ def test_astype(all_data): # coerce to object s = pd.Series(mixed) result = s.astype("object") - expected = pd.Series(np.asarray(mixed)) + expected = pd.Series(np.asarray(mixed, dtype=object)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 4f3e4d3365179..3955e0e88e776 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -295,7 +295,7 @@ def test_array_multiindex_raises(): pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), @@ -346,10 +346,6 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): with tm.assert_produces_warning(None): thing = box(arr) - if arr.dtype.name == "int64" and box is pd.array: - mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object") - request.applymarker(mark) - result = thing.to_numpy() tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index e5d0aafb80b35..9a3f83e0293f5 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -133,21 +133,25 @@ def test_series_array_ea_dtypes(using_copy_on_write): assert arr.flags.writeable 
is True arr = np.asarray(ser) - assert not np.shares_memory(arr, get_array(ser)) - assert arr.flags.writeable is True + assert np.shares_memory(arr, get_array(ser)) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True def test_dataframe_array_ea_dtypes(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}, dtype="Int64") arr = np.asarray(df, dtype="int64") - # TODO: This should be able to share memory, but we are roundtripping - # through object - assert not np.shares_memory(arr, get_array(df, "a")) - assert arr.flags.writeable is True + assert np.shares_memory(arr, get_array(df, "a")) + if using_copy_on_write: + assert arr.flags.writeable is False + else: + assert arr.flags.writeable is True arr = np.asarray(df) + assert np.shares_memory(arr, get_array(df, "a")) if using_copy_on_write: - # TODO(CoW): This should be True assert arr.flags.writeable is False else: assert arr.flags.writeable is True diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index be4077d921a9e..3efc561d6a125 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -169,6 +169,16 @@ def data_for_grouping(dtype): class TestMaskedArrays(base.ExtensionTests): + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data_missing, na_action): + result = data_missing.map(lambda x: x, na_action=na_action) + if data_missing.dtype == Float32Dtype(): + # map roundtrips through objects, which converts to float64 + expected = data_missing.to_numpy(dtype="float64", na_value=np.nan) + else: + expected = data_missing.to_numpy() + tm.assert_numpy_array_equal(result, expected) + def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 6b2bf211ab748..103ec67951a01 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -717,15 +717,12 @@ def test_where_ea_other(self): # TODO: ideally we would get Int64 instead of object result = df.where(mask, ser, axis=0) - expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object) + expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]}) tm.assert_frame_equal(result, expected) ser2 = Series(arr[:2], index=["A", "B"]) - expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]}) - expected["B"] = expected["B"].astype(object) - msg = "Downcasting behavior in Series and DataFrame methods 'where'" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = df.where(mask, ser2, axis=1) + expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]}) + result = df.where(mask, ser2, axis=1) tm.assert_frame_equal(result, expected) def test_where_interval_noop(self): diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index eed48b9db116b..6184e791cab5d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -456,6 +456,20 @@ def test_to_records_with_inf_record(self): result = repr(df) assert result == expected + def test_masked_ea_with_formatter(self): + # GH#39336 + df = DataFrame( + { + "a": Series([0.123456789, 1.123456789], dtype="Float64"), + "b": Series([1, 2], dtype="Int64"), + } + ) + result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format]) + expected = """ a b +0 0.12 1.00 +1 1.12 2.00""" + assert result == expected + def test_repr_ea_columns(self, 
any_string_dtype): # GH#54797 pytest.importorskip("pyarrow")
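
The default dtype-selection rules described in the whatsnew entry above can be illustrated with a minimal sketch (not part of the patch; it assumes a pandas build that includes this change, and the commented outputs follow from the rules and tests in this diff):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # Float nullable dtype -> NumPy float64; pd.NA becomes np.nan
    pd.array([0.1, 0.2, None], dtype="Float64").to_numpy()
    # array([0.1, 0.2, nan])

    # Integer nullable dtype without missing values -> NumPy int64
    pd.array([1, 2, 3], dtype="Int64").to_numpy()
    # array([1, 2, 3])

    # Integer nullable dtype with missing values -> NumPy float64, NaN as indicator
    pd.array([1, 2, None], dtype="Int64").to_numpy()
    # array([ 1.,  2., nan])

    # Boolean nullable dtype without missing values -> NumPy bool
    pd.array([True, False, True], dtype="boolean").to_numpy()
    # array([ True, False,  True])

    # Boolean nullable dtype with missing values -> object dtype, pd.NA preserved
    pd.array([True, False, None], dtype="boolean").to_numpy()
    # array([True, False, <NA>], dtype=object)

    # Passing dtype/na_value explicitly still overrides the defaults
    pd.array([1, 2, None], dtype="Int64").to_numpy(dtype="float64", na_value=np.nan)
    # array([ 1.,  2., nan])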