-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Implement convert_dtypes #30929
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
04b277b
82e62dc
dc1daa0
a9b477e
e54ad4f
8cc238d
f0ba92b
aebba66
40123c7
78be9b8
7bf4f51
f59a7d4
b85d135
26ffc26
888ac31
a6e10b0
34493a0
f990096
4c272ee
585df23
2efb8ea
8e4dfff
c80ce7d
0a331a4
8a5fcf3
39798fa
1e68d03
fa93a84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
from pandas._libs import lib, tslib, tslibs | ||
from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT | ||
from pandas._libs.tslibs.timezones import tz_compare | ||
from pandas._typing import Dtype | ||
from pandas.util._validators import validate_bool_kwarg | ||
|
||
from pandas.core.dtypes.common import ( | ||
|
@@ -34,6 +35,7 @@ | |
is_float_dtype, | ||
is_integer, | ||
is_integer_dtype, | ||
is_numeric_dtype, | ||
is_object_dtype, | ||
is_scalar, | ||
is_string_dtype, | ||
|
@@ -1018,6 +1020,80 @@ def soft_convert_objects( | |
return values | ||
|
||
|
||
def convert_dtypes(
    input_array,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> Dtype:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or PandasArray
        Array whose target dtype should be determined.
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtype()``.

    Returns
    -------
    dtype
        new dtype

    Notes
    -----
    The result is not always a dtype object.  It can be the string ``"Int64"``
    or ``"boolean"``, or the label produced by ``lib.infer_dtype`` when no
    branch below overrides it.
    """
    # NOTE(review): ``input_array`` is left unannotated; the review thread
    # attributes this to import ordering -- confirm before adding a type.
    # NOTE(review): reviewers suggested a ``DtypeObject`` alias in
    # ``pandas._typing`` that excludes strings; this function does return
    # strings, so ``Dtype`` is kept here.
    if not (convert_string or convert_integer or convert_boolean):
        # Nothing requested: report the dtype the array already has.
        return input_array.dtype

    try:
        inferred_dtype = lib.infer_dtype(input_array)
    except ValueError:
        # Required to catch due to Period.  Can remove once GH 23553 is fixed
        inferred_dtype = input_array.dtype

    if not convert_string and is_string_dtype(inferred_dtype):
        # String conversion was not requested, so fall back to the current dtype.
        inferred_dtype = input_array.dtype

    if convert_integer:
        target_int_dtype = "Int64"

        # The isinstance guard matters: after the fallbacks above,
        # ``inferred_dtype`` may be a dtype object rather than a label.
        if isinstance(inferred_dtype, str) and inferred_dtype in (
            "mixed-integer",
            "mixed-integer-float",
        ):
            inferred_dtype = target_int_dtype
        if is_integer_dtype(input_array.dtype) and not is_extension_array_dtype(
            input_array.dtype
        ):
            # Imported locally, as in the original block.
            from pandas.core.arrays.integer import _dtypes

            # Look up the extension dtype by the current dtype's name,
            # defaulting to Int64 when the name is not in the mapping.
            inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype)
        if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
            input_array.dtype
        ):
            inferred_dtype = target_int_dtype
    else:
        if is_integer_dtype(inferred_dtype):
            inferred_dtype = input_array.dtype

    if convert_boolean:
        if is_bool_dtype(input_array.dtype) and not is_extension_array_dtype(
            input_array.dtype
        ):
            inferred_dtype = "boolean"
    else:
        if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
            inferred_dtype = input_array.dtype

    return inferred_dtype
|
||
|
||
def maybe_castable(arr) -> bool: | ||
# return False to force a non-fastpath | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
from pandas.util._decorators import Appender, Substitution | ||
from pandas.util._validators import validate_bool_kwarg, validate_percentile | ||
|
||
from pandas.core.dtypes.cast import convert_dtypes | ||
from pandas.core.dtypes.common import ( | ||
_is_unorderable_exception, | ||
ensure_platform_int, | ||
|
@@ -4372,6 +4373,34 @@ def between(self, left, right, inclusive=True) -> "Series": | |
|
||
return lmask & rmask | ||
|
||
# ---------------------------------------------------------------------- | ||
# Convert to types that support pd.NA | ||
|
||
def _convert_dtypes(
    self,
    infer_objects: bool = True,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> "Series":
    """
    Cast this Series to the dtype chosen by ``convert_dtypes``.

    Parameters
    ----------
    infer_objects : bool, default True
        Whether to call ``infer_objects`` on the Series first.
    convert_string : bool, default True
        Forwarded to ``convert_dtypes``.
    convert_integer : bool, default True
        Forwarded to ``convert_dtypes``.
    convert_boolean : bool, default True
        Forwarded to ``convert_dtypes``.

    Returns
    -------
    Series
        The result of ``astype`` with the inferred dtype, or a copy of the
        (possibly object-inferred) Series when no conversion is requested
        or ``astype`` raises ``TypeError``.
    """
    # ``self`` is deliberately unannotated: reviewers flagged ``ABCSeries`` as
    # inconsistent with the "Series" return annotation.
    input_series = self
    if infer_objects:
        input_series = input_series.infer_objects()
        if is_object_dtype(input_series):
            # NOTE(review): presumably guards against infer_objects handing
            # back the same object -- confirm before removing this copy.
            input_series = input_series.copy()

    if not (convert_string or convert_integer or convert_boolean):
        return input_series.copy()

    inferred_dtype = convert_dtypes(
        input_series._values,
        convert_string=convert_string,
        convert_integer=convert_integer,
        convert_boolean=convert_boolean,
    )
    try:
        return input_series.astype(inferred_dtype)
    except TypeError:
        # The inferred dtype could not be applied; return an unconverted copy.
        return input_series.copy()
|
||
@Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
def isna(self) -> "Series":
    # No docstring here: the Appender decorator is given the shared "isna"
    # template, formatted with _shared_doc_kwargs.
    return super().isna()
|
Uh oh!
There was an error while loading. Please reload this page.