Skip to content

API: to_numpy() defaults to corresponding type #48937

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
34 changes: 34 additions & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,40 @@ Optional libraries below the lowest tested version may still work, but are not c

See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.

Nullable types get converted to their respective NumPy types in ``to_numpy``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Previously, :meth:`Series.to_numpy` always returned an ``object``-dtype array for nullable data types:

*Old Behavior*

.. code-block:: ipython

    In [1]: pd.Series([1, 2, 3], dtype="Float64").to_numpy()
    Out[1]: array([1.0, 2.0, 3.0], dtype=object)

Now, the above :class:`Series` gets converted to ``float64``:

*New Behavior*

.. ipython:: python

    pd.Series([1, 2, 3], dtype="Float64").to_numpy()

If a :class:`Series` contains missing values (``pd.NA``), they are converted to ``np.nan``
when the result has a floating-point dtype such as ``float64``:

.. ipython:: python

    pd.Series([1, 2, pd.NA], dtype="Float64").to_numpy()

If converting to a dtype that cannot hold missing values (for example an integer or
boolean dtype), then you need to specify an ``na_value`` compatible with that ``dtype``,
for example:

.. ipython:: python

    pd.Series([1, 2, pd.NA], dtype="Float64").to_numpy("int64", na_value=-1)

.. _whatsnew_160.api_breaking.other:

Other API changes
Expand Down
14 changes: 7 additions & 7 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -1051,15 +1051,15 @@ def assert_series_equal(
left_values,
right_values,
check_dtype=check_dtype,
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)
else:
assert_numpy_array_equal(
left_values,
right_values,
check_dtype=check_dtype,
obj=str(obj),
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)
elif check_datetimelike_compat and (
needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
Expand Down Expand Up @@ -1088,7 +1088,7 @@ def assert_series_equal(
atol=atol,
check_dtype=bool(check_dtype),
obj=str(obj),
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)
elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
assert_extension_array_equal(
Expand All @@ -1097,7 +1097,7 @@ def assert_series_equal(
rtol=rtol,
atol=atol,
check_dtype=check_dtype,
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)
elif is_extension_array_dtype_and_needs_i8_conversion(
left.dtype, right.dtype
Expand All @@ -1106,15 +1106,15 @@ def assert_series_equal(
left._values,
right._values,
check_dtype=check_dtype,
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)
elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
# DatetimeArray or TimedeltaArray
assert_extension_array_equal(
left._values,
right._values,
check_dtype=check_dtype,
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)
else:
_testing.assert_almost_equal(
Expand All @@ -1124,7 +1124,7 @@ def assert_series_equal(
atol=atol,
check_dtype=bool(check_dtype),
obj=str(obj),
index_values=np.asarray(left.index),
index_values=np.asarray(left.index, dtype=object),
)

# metadata comparison
Expand Down
18 changes: 13 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,10 +401,14 @@ def to_numpy(
>>> a.to_numpy(dtype="bool", na_value=False)
array([ True, False, False])
"""
if na_value is lib.no_default:
na_value = libmissing.NA
if dtype is None:
dtype = object
dtype = self.dtype.type

if na_value is lib.no_default and is_float_dtype(dtype):
na_value = np.nan
elif na_value is lib.no_default:
na_value = libmissing.NA

if self._hasna:
if (
not is_object_dtype(dtype)
Expand All @@ -413,8 +417,12 @@ def to_numpy(
):
raise ValueError(
f"cannot convert to '{dtype}'-dtype NumPy array "
"with missing values. Specify an appropriate 'na_value' "
"for this dtype."
"with missing values.\n"
"Please either:\n"
"- convert to 'float'\n"
"- convert to 'object'\n"
"- specify an appropriate 'na_value' for this dtype\n"
"for this dtype.\n"
)
# don't pass copy to astype -> always need a copy since we are mutating
data = self._data.astype(dtype)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bo
return False
else:
try:
if np.any(np.asarray(left_value != right_value)):
if np.any(np.asarray(left_value != right_value, dtype=object)):
return False
except TypeError as err:
if "boolean value of NA is ambiguous" in str(err):
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9734,10 +9734,12 @@ def _where(
other = other._values
if axis == 0:
other = np.reshape(other, (-1, 1))
ind = np.arange(other.shape[1]).repeat(self.shape[1])
other = other.take(ind, axis=1)
elif axis == 1:
other = np.reshape(other, (1, -1))

other = np.broadcast_to(other, self.shape)
ind = np.arange(other.shape[0]).repeat(self.shape[0])
other = other.take(ind, axis=0)

# slice me out of the other
else:
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1742,7 +1742,7 @@ def as_array(
# error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
# attribute "to_numpy"
arr = blk.values.to_numpy( # type: ignore[union-attr]
dtype=dtype,
dtype=dtype or 'object',
na_value=na_value,
).reshape(blk.shape)
else:
Expand Down Expand Up @@ -1778,14 +1778,13 @@ def _interleave(
dtype = interleaved_dtype( # type: ignore[assignment]
[blk.dtype for blk in self.blocks]
)

# TODO: https://github.com/pandas-dev/pandas/issues/22791
# Give EAs some input on what happens here. Sparse needs this.
if isinstance(dtype, SparseDtype):
dtype = dtype.subtype
dtype = cast(np.dtype, dtype)
elif isinstance(dtype, ExtensionDtype):
dtype = np.dtype("object")
dtype = dtype.type
elif is_dtype_equal(dtype, str):
dtype = np.dtype("object")

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2043,7 +2043,7 @@ def injection(obj):
# np.array([0, 255, 255], dtype=np.uint8)
# and the non-injectivity should make a difference somehow
# shouldn't it?
return np.asarray(obj)
return np.asarray(obj, dtype=object)

xs = [injection(x) for x in xs]
labels = list(string.ascii_lowercase[: len(xs)])
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1657,7 +1657,7 @@ def _format_strings(self) -> list[str]:
# Categorical is special for now, so that we can preserve tzinfo
array = values._internal_get_values()
else:
array = np.asarray(values)
array = np.asarray(values, dtype=object)

fmt_values = format_array(
array,
Expand Down
40 changes: 24 additions & 16 deletions pandas/tests/arrays/boolean/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,16 +214,17 @@ def test_coerce_to_array_from_boolean_array():


def test_coerce_to_numpy_array():
# with missing values -> object dtype
# with missing values -> tries but fails to convert
arr = pd.array([True, False, None], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
with pytest.raises(
ValueError, match=r"specify an appropriate 'na_value' for this dtype"
):
result = np.array(arr)

# also with no missing values -> object dtype
# also with no missing values -> successfully converts to bool
arr = pd.array([True, False, True], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, True], dtype="object")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

# force bool dtype
Expand All @@ -233,8 +234,12 @@ def test_coerce_to_numpy_array():
# with missing values will raise error
arr = pd.array([True, False, None], dtype="boolean")
msg = (
"cannot convert to 'bool'-dtype NumPy array with missing values. "
"Specify an appropriate 'na_value' for this dtype."
"^cannot convert to 'bool'-dtype NumPy array with missing values.\n"
"Please either:\n"
"- convert to 'float'\n"
"- convert to 'object'\n"
"- specify an appropriate 'na_value' for this dtype\n"
"for this dtype.\n$"
)
with pytest.raises(ValueError, match=msg):
np.array(arr, dtype="bool")
Expand All @@ -260,16 +265,17 @@ def test_to_boolean_array_from_strings_invalid_string():
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
# default without missing values -> bool dtype; with missing values -> raises
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="object")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)

arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
with pytest.raises(
ValueError, match="specify an appropriate 'na_value' for this dtype"
):
arr.to_numpy()

arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype="str")
Expand Down Expand Up @@ -304,11 +310,13 @@ def test_to_numpy(box):
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

# converting to int or float without specifying na_value raises
# converting to int without specifying na_value raises
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
arr.to_numpy(dtype="int64")
with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
arr.to_numpy(dtype="float64")
# converting to float without specifying na_value converts NA to nan
result = arr.to_numpy(dtype="float64")
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_copy():
Expand Down
24 changes: 18 additions & 6 deletions pandas/tests/arrays/floating/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@
def test_to_numpy(box):
con = pd.Series if box else pd.array

# default (with or without missing values) -> object dtype
# default (with or without missing values) -> float64 dtype
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="object")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, pd.NA], dtype="object")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)


Expand All @@ -33,8 +33,9 @@ def test_to_numpy_float(box):
tm.assert_numpy_array_equal(result, expected)

arr = con([0.1, 0.2, None], dtype="Float64")
with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"):
result = arr.to_numpy(dtype="float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)

# need to explicitly specify na_value
result = arr.to_numpy(dtype="float64", na_value=np.nan)
Expand Down Expand Up @@ -100,7 +101,18 @@ def test_to_numpy_dtype(box, dtype):
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
@pytest.mark.parametrize("dtype", ["float64", "float32"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_doesnt_raise(box, dtype):
# https://github.com/pandas-dev/pandas/issues/48891
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype=dtype)
expected = np.array([0.0, 1.0, np.nan], dtype=dtype)
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_raises(box, dtype):
con = pd.Series if box else pd.array
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_from_dtype_from_float(data):

# from int / list
expected = pd.Series(data)
result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
result = pd.Series(np.array(data, dtype=object).tolist(), dtype=str(dtype))
tm.assert_series_equal(result, expected)

# from int / array
Expand Down
15 changes: 12 additions & 3 deletions pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_astype_index(all_data, dropna):
other = all_data

dtype = all_data.dtype
idx = pd.Index._with_infer(np.array(other))
idx = pd.Index._with_infer(np.array(other, dtype=object))
assert isinstance(idx, ABCIndex)

result = idx.astype(dtype)
Expand Down Expand Up @@ -143,7 +143,7 @@ def test_astype(all_data):
# coerce to object
s = pd.Series(mixed)
result = s.astype("object")
expected = pd.Series(np.asarray(mixed))
expected = pd.Series(np.asarray(mixed, dtype=object))
tm.assert_series_equal(result, expected)


Expand Down Expand Up @@ -274,13 +274,22 @@ def test_to_numpy_dtype(dtype, in_series):
tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"])
@pytest.mark.parametrize("dtype", ["int64", "bool"])
def test_to_numpy_na_raises(dtype):
a = pd.array([0, 1, None], dtype="Int64")
with pytest.raises(ValueError, match=dtype):
a.to_numpy(dtype=dtype)


@pytest.mark.parametrize("dtype", ["float64"])
def test_to_numpy_na_doesnt_raise(dtype):
# https://github.com/pandas-dev/pandas/issues/48891
a = pd.array([0, 1, None], dtype="Int64")
result = a.to_numpy(dtype=dtype)
expected = np.array([0.0, 1.0, np.nan])
tm.assert_numpy_array_equal(result, expected)


def test_astype_str():
a = pd.array([1, 2, None], dtype="Int64")
expected = np.array(["1", "2", "<NA>"], dtype=f"{tm.ENDIAN}U21")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/masked_shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_ufunc_with_out(self, dtype):
# result |= mask worked because mask could be cast losslessly to
# boolean ndarray. mask2 can't, so this raises
result = np.zeros(3, dtype=bool)
msg = "Specify an appropriate 'na_value' for this dtype"
msg = "specify an appropriate 'na_value' for this dtype"
with pytest.raises(ValueError, match=msg):
result |= mask2

Expand Down
Loading