Skip to content

Convert to compatible NumPy dtype for MaskedArray to_numpy #55058

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,37 @@ For a full list of ADBC drivers and their development status, see the `ADBC Driv
Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
documentation.

.. _whatsnew_220.enhancements.to_numpy_ea:

ExtensionArray.to_numpy converts to suitable NumPy dtype
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:meth:`ExtensionArray.to_numpy` will now convert to a suitable NumPy dtype instead
of ``object`` dtype for nullable extension dtypes.

*Old behavior:*

.. code-block:: ipython

In [1]: ser = pd.Series([1, 2, 3], dtype="Int64")
In [2]: ser.to_numpy()
Out[2]: array([1, 2, 3], dtype=object)

*New behavior:*

.. ipython:: python

ser = pd.Series([1, 2, 3], dtype="Int64")
ser.to_numpy()

The default NumPy dtype (without any arguments) is determined as follows:

- float dtypes are cast to NumPy floats
- integer dtypes without missing values are cast to NumPy integer dtypes
- integer dtypes with missing values are cast to NumPy float dtypes and ``NaN`` is used as the missing value indicator
- boolean dtypes without missing values are cast to NumPy bool dtype
- boolean dtypes with missing values keep object dtype

.. _whatsnew_220.enhancements.struct_accessor:

Series.struct accessor for PyArrow structured data
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1477,7 +1477,7 @@ def _maybe_upcast(
import pyarrow as pa
if isinstance(arr, IntegerArray) and arr.isna().all():
# use null instead of int64 in pyarrow
arr = arr.to_numpy()
arr = arr.to_numpy(na_value=None)
arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))

return arr
Expand Down
42 changes: 36 additions & 6 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,15 @@
IS64,
is_platform_windows,
)
from pandas.errors import AbstractMethodError
from pandas.errors import (
AbstractMethodError,
LossySetitemError,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import np_can_hold_element
from pandas.core.dtypes.common import (
is_bool,
is_integer_dtype,
Expand All @@ -69,6 +73,7 @@
from pandas.core.algorithms import (
factorize_array,
isin,
map_array,
mode,
take,
)
Expand Down Expand Up @@ -473,13 +478,35 @@ def to_numpy(
>>> a.to_numpy(dtype="bool", na_value=False)
array([ True, False, False])
"""
if na_value is lib.no_default:
na_value = libmissing.NA
hasna = self._hasna

if dtype is None:
dtype = object
dtype_given = False
if hasna:
if self.dtype.kind == "b":
dtype = object
else:
if self.dtype.kind in "iu":
dtype = np.dtype(np.float64)
else:
dtype = self.dtype.numpy_dtype
if na_value is lib.no_default:
na_value = np.nan
else:
dtype = self.dtype.numpy_dtype
else:
dtype = np.dtype(dtype)
if self._hasna:
dtype_given = True
if na_value is lib.no_default:
na_value = libmissing.NA

if not dtype_given and hasna:
try:
np_can_hold_element(dtype, na_value) # type: ignore[arg-type]
except LossySetitemError:
dtype = object

if hasna:
if (
dtype != object
and not is_string_dtype(dtype)
Expand All @@ -506,7 +533,7 @@ def tolist(self):
if self.ndim > 1:
return [x.tolist() for x in self]
dtype = None if self._hasna else self._data.dtype
return self.to_numpy(dtype=dtype).tolist()
return self.to_numpy(dtype=dtype, na_value=libmissing.NA).tolist()

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
Expand Down Expand Up @@ -1300,6 +1327,9 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
)
return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis)

def map(self, mapper, na_action=None):
return map_array(self.to_numpy(), mapper, na_action=None)

def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
"""
Return whether any element is truthy.
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/methods/to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,17 @@

import numpy as np

from pandas._libs import (
lib,
missing as libmissing,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import maybe_box_native
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
ExtensionDtype,
)

from pandas.core import common as com

Expand Down Expand Up @@ -150,6 +157,10 @@ def to_dict(
for i, col_dtype in enumerate(df.dtypes.values)
if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
]
box_na_values = [
lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
for i, col_dtype in enumerate(df.dtypes.values)
]
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)

if orient == "dict":
Expand All @@ -160,7 +171,11 @@ def to_dict(
return into_c(
(
k,
list(map(maybe_box_native, v.to_numpy().tolist()))
list(
map(
maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist()
)
)
if i in object_dtype_indices_as_set
else v.to_numpy().tolist(),
)
Expand Down
3 changes: 3 additions & 0 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
)

from pandas.core.arrays import (
BaseMaskedArray,
Categorical,
DatetimeArray,
ExtensionArray,
Expand Down Expand Up @@ -1527,6 +1528,8 @@ def _format_strings(self) -> list[str]:
if isinstance(values, Categorical):
# Categorical is special for now, so that we can preserve tzinfo
array = values._internal_get_values()
elif isinstance(values, BaseMaskedArray):
array = values.to_numpy(dtype=object)
else:
array = np.asarray(values)

Expand Down
10 changes: 9 additions & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
from pandas.core import algorithms
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
BooleanArray,
Categorical,
ExtensionArray,
Expand Down Expand Up @@ -762,8 +763,15 @@ def _infer_types(
pa = import_optional_dependency("pyarrow")
if isinstance(result, np.ndarray):
result = ArrowExtensionArray(pa.array(result, from_pandas=True))
elif isinstance(result, BaseMaskedArray):
if result._mask.all():
# We want an arrow null array here
result = ArrowExtensionArray(pa.array([None] * len(result)))
else:
result = ArrowExtensionArray(
pa.array(result._data, mask=result._mask)
)
else:
# ExtensionArray
result = ArrowExtensionArray(
pa.array(result.to_numpy(), from_pandas=True)
)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/boolean/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def test_coerce_to_numpy_array():
# also with no missing values -> bool dtype
arr = pd.array([True, False, True], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, True], dtype="object")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

# force bool dtype
Expand Down Expand Up @@ -263,7 +263,7 @@ def test_to_numpy(box):
# default: bool dtype without missing values, object dtype with missing values
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="object")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

arr = con([True, False, None], dtype="boolean")
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/floating/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ def test_to_numpy(box):
# default (with or without missing values) -> float64 dtype
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="object")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, pd.NA], dtype="object")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def test_astype(all_data):
# coerce to object
s = pd.Series(mixed)
result = s.astype("object")
expected = pd.Series(np.asarray(mixed))
expected = pd.Series(np.asarray(mixed, dtype=object))
tm.assert_series_equal(result, expected)


Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/base/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def test_array_multiindex_raises():
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, pd.NA], dtype=object)),
(pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
(
IntervalArray.from_breaks([0, 1, 2]),
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
Expand Down Expand Up @@ -346,10 +346,6 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request):
with tm.assert_produces_warning(None):
thing = box(arr)

if arr.dtype.name == "int64" and box is pd.array:
mark = pytest.mark.xfail(reason="thing is Int64 and to_numpy() returns object")
request.applymarker(mark)

result = thing.to_numpy()
tm.assert_numpy_array_equal(result, expected)

Expand Down
18 changes: 11 additions & 7 deletions pandas/tests/copy_view/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,21 +133,25 @@ def test_series_array_ea_dtypes(using_copy_on_write):
assert arr.flags.writeable is True

arr = np.asarray(ser)
assert not np.shares_memory(arr, get_array(ser))
assert arr.flags.writeable is True
assert np.shares_memory(arr, get_array(ser))
if using_copy_on_write:
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True


def test_dataframe_array_ea_dtypes(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
arr = np.asarray(df, dtype="int64")
# TODO: This should be able to share memory, but we are roundtripping
# through object
assert not np.shares_memory(arr, get_array(df, "a"))
assert arr.flags.writeable is True
assert np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True

arr = np.asarray(df)
assert np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
# TODO(CoW): This should be True
assert arr.flags.writeable is False
else:
assert arr.flags.writeable is True
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,16 @@ def data_for_grouping(dtype):


class TestMaskedArrays(base.ExtensionTests):
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
result = data_missing.map(lambda x: x, na_action=na_action)
if data_missing.dtype == Float32Dtype():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a solution to avoid this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not with roundtripping through object, maybe your other suggestion above solves this.

No if your suggestion doesn't work

# map roundtrips through objects, which converts to float64
expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
else:
expected = data_missing.to_numpy()
tm.assert_numpy_array_equal(result, expected)

def _get_expected_exception(self, op_name, obj, other):
try:
dtype = tm.get_dtype(obj)
Expand Down
9 changes: 3 additions & 6 deletions pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,15 +717,12 @@ def test_where_ea_other(self):

# TODO: ideally we would get Int64 instead of float64
result = df.where(mask, ser, axis=0)
expected = DataFrame({"A": [1, pd.NA, 3], "B": [4, pd.NA, 6]}).astype(object)
expected = DataFrame({"A": [1, np.nan, 3], "B": [4, np.nan, 6]})
tm.assert_frame_equal(result, expected)

ser2 = Series(arr[:2], index=["A", "B"])
expected = DataFrame({"A": [1, 7, 3], "B": [4, pd.NA, 6]})
expected["B"] = expected["B"].astype(object)
msg = "Downcasting behavior in Series and DataFrame methods 'where'"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.where(mask, ser2, axis=1)
expected = DataFrame({"A": [1, 7, 3], "B": [4, np.nan, 6]})
result = df.where(mask, ser2, axis=1)
tm.assert_frame_equal(result, expected)

def test_where_interval_noop(self):
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/frame/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,20 @@ def test_to_records_with_inf_record(self):
result = repr(df)
assert result == expected

def test_masked_ea_with_formatter(self):
# GH#39336
# Explicit per-column formatters must be applied to masked columns:
# both the Float64 and the Int64 column are expected to render with
# two decimals in the ``to_string`` output.
df = DataFrame(
{
"a": Series([0.123456789, 1.123456789], dtype="Float64"),
"b": Series([1, 2], dtype="Int64"),
}
)
result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format])
expected = """ a b
0 0.12 1.00
1 1.12 2.00"""
assert result == expected

def test_repr_ea_columns(self, any_string_dtype):
# GH#54797
pytest.importorskip("pyarrow")
Expand Down