Skip to content

Commit a08b0ef

Browse files
author
MarcoGorelli
committed
wip
1 parent 0189674 commit a08b0ef

File tree

5 files changed

+70
-41
lines changed

5 files changed

+70
-41
lines changed

doc/source/whatsnew/v1.5.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Fixed regressions
1919
- Enforced reversion of ``color`` as an alias for ``c`` and ``size`` as an alias for ``s`` in function :meth:`DataFrame.plot.scatter` (:issue:`49732`)
2020
- Fixed regression in :meth:`SeriesGroupBy.apply` setting a ``name`` attribute on the result if the result was a :class:`DataFrame` (:issue:`49907`)
2121
- Fixed performance regression in setting with the :meth:`~DataFrame.at` indexer (:issue:`49771`)
22+
- Fixed regression in :func:`to_datetime` raising ``ValueError`` when parsing an array of ``float`` containing ``np.nan`` (:issue:`50237`)
2223
-
2324

2425
.. ---------------------------------------------------------------------------

pandas/_libs/tslibs/parsing.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@ def format_is_iso(f: str) -> bint:
870870
but must be consistent. Leading 0s in dates and times are optional.
871871
"""
872872
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
873-
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
873+
excluded_formats = ["%Y%m", "%Y"]
874874

875875
for date_sep in [" ", "/", "\\", "-", ".", ""]:
876876
for time_sep in [" ", "T"]:

pandas/_libs/tslibs/strptime.pyx

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,21 @@ from pandas._libs.tslibs.nattype cimport (
3434
c_nat_strings as nat_strings,
3535
)
3636
from pandas._libs.tslibs.np_datetime cimport (
37+
NPY_DATETIMEUNIT,
3738
NPY_FR_ns,
3839
check_dts_bounds,
3940
npy_datetimestruct,
4041
npy_datetimestruct_to_datetime,
4142
pydate_to_dt64,
4243
pydatetime_to_dt64,
44+
string_to_dts,
4345
)
4446
from pandas._libs.tslibs.timestamps cimport _Timestamp
45-
from pandas._libs.util cimport is_datetime64_object
47+
from pandas._libs.util cimport (
48+
is_datetime64_object,
49+
is_float_object,
50+
is_integer_object,
51+
)
4652

4753
cnp.import_array()
4854

@@ -89,6 +95,7 @@ def array_strptime(
8995
exact : matches must be exact if True, search if False
9096
errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
9197
"""
98+
from pandas._libs.tslibs.parsing import format_is_iso
9299

93100
cdef:
94101
Py_ssize_t i, n = len(values)
@@ -106,6 +113,9 @@ def array_strptime(
106113
bint found_naive = False
107114
bint found_tz = False
108115
tzinfo tz_out = None
116+
bint iso_format = fmt is not None and format_is_iso(fmt)
117+
NPY_DATETIMEUNIT out_bestunit
118+
int out_local = 0, out_tzoffset = 0
109119

110120
assert is_raise or is_ignore or is_coerce
111121

@@ -185,9 +195,50 @@ def array_strptime(
185195
elif is_datetime64_object(val):
186196
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
187197
continue
198+
elif (
199+
(is_integer_object(val) or is_float_object(val))
200+
and (val != val or val == NPY_NAT)
201+
):
202+
iresult[i] = NPY_NAT
203+
continue
188204
else:
189205
val = str(val)
190206

207+
if (iso_format and not (fmt == "%Y%m%d" and len(val) != 8)):
208+
# There is a fast-path for ISO8601-formatted strings.
209+
# BUT for %Y%m%d, it only works if the string is 8 digits long.
210+
string_to_dts_failed = string_to_dts(
211+
val, &dts, &out_bestunit, &out_local,
212+
&out_tzoffset, False, fmt, exact
213+
)
214+
if string_to_dts_failed:
215+
# An error at this point is a _parsing_ error
216+
# specifically _not_ OutOfBoundsDatetime
217+
if is_coerce:
218+
iresult[i] = NPY_NAT
219+
continue
220+
raise ValueError(
221+
f"time data \"{val}\" at position {i} doesn't "
222+
f"match format \"{fmt}\""
223+
)
224+
# No error reported by string_to_dts, pick back up
225+
# where we left off
226+
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
227+
if out_local == 1:
228+
# Store the out_tzoffset in seconds
229+
# since we store the total_seconds of
230+
# dateutil.tz.tzoffset objects
231+
# out_tzoffset_vals.add(out_tzoffset * 60.)
232+
tz = timezone(timedelta(minutes=out_tzoffset))
233+
result_timezone[i] = tz
234+
# value = tz_localize_to_utc_single(value, tz)
235+
out_local = 0
236+
out_tzoffset = 0
237+
iresult[i] = value
238+
check_dts_bounds(&dts)
239+
240+
continue
241+
191242
# exact matching
192243
if exact:
193244
found = format_regex.match(val)

pandas/core/tools/datetimes.py

Lines changed: 2 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
)
3535
from pandas._libs.tslibs.parsing import (
3636
DateParseError,
37-
format_is_iso,
3837
guess_datetime_format,
3938
)
4039
from pandas._libs.tslibs.strptime import array_strptime
@@ -417,7 +416,6 @@ def _convert_listlike_datetimes(
417416

418417
# warn if passing timedelta64, raise for PeriodDtype
419418
# NB: this must come after unit transformation
420-
orig_arg = arg
421419
try:
422420
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
423421
except TypeError:
@@ -430,32 +428,19 @@ def _convert_listlike_datetimes(
430428
raise
431429

432430
arg = ensure_object(arg)
433-
require_iso8601 = False
434431

435432
if format is None:
436433
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
437434

438-
# There is a special fast-path for iso8601 formatted datetime strings
439-
require_iso8601 = format is not None and format_is_iso(format)
440-
441-
if format is not None and not require_iso8601:
442-
return _to_datetime_with_format(
443-
arg,
444-
orig_arg,
445-
name,
446-
utc,
447-
format,
448-
exact,
449-
errors,
450-
)
435+
if format is not None:
436+
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
451437

452438
result, tz_parsed = objects_to_datetime64ns(
453439
arg,
454440
dayfirst=dayfirst,
455441
yearfirst=yearfirst,
456442
utc=utc,
457443
errors=errors,
458-
require_iso8601=require_iso8601,
459444
allow_object=True,
460445
format=format,
461446
exact=exact,
@@ -522,22 +507,6 @@ def _to_datetime_with_format(
522507
"""
523508
Try parsing with the given format.
524509
"""
525-
result = None
526-
527-
# shortcut formatting here
528-
if fmt == "%Y%m%d":
529-
# pass orig_arg as float-dtype may have been converted to
530-
# datetime64[ns]
531-
orig_arg = ensure_object(orig_arg)
532-
try:
533-
# may return None without raising
534-
result = _attempt_YYYYMMDD(orig_arg, errors=errors)
535-
except (ValueError, TypeError, OutOfBoundsDatetime) as err:
536-
raise ValueError(
537-
"cannot convert the input to '%Y%m%d' date format"
538-
) from err
539-
if result is not None:
540-
return _box_as_indexlike(result, utc=utc, name=name)
541510

542511
# fallback
543512
res = _array_strptime_with_fallback(arg, name, utc, fmt, exact, errors)

pandas/tests/tools/test_to_datetime.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,18 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
132132
# string with NaT
133133
ser2 = ser.apply(str)
134134
ser2[2] = "nat"
135-
result = to_datetime(ser2, format="%Y%m%d", cache=cache)
135+
with pytest.raises(ValueError, match="unconverted data remains: .0"):
136+
to_datetime(ser2, format="%Y%m%d", cache=cache)
137+
138+
def test_to_datetime_format_YYYYMM_with_nat(self, cache):
139+
# https://github.com/pandas-dev/pandas/issues/50237
140+
ser = Series([198012, 198012] + [198101] * 5)
141+
expected = Series(
142+
[Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5
143+
)
144+
expected[2] = np.nan
145+
ser[2] = np.nan
146+
result = to_datetime(ser, format="%Y%m", cache=cache)
136147
tm.assert_series_equal(result, expected)
137148

138149
def test_to_datetime_format_YYYYMMDD_ignore(self, cache):
@@ -141,7 +152,7 @@ def test_to_datetime_format_YYYYMMDD_ignore(self, cache):
141152
ser = Series([20121231, 20141231, 99991231])
142153
result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache)
143154
expected = Series(
144-
[datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)],
155+
[20121231, 20141231, 99991231],
145156
dtype=object,
146157
)
147158
tm.assert_series_equal(result, expected)
@@ -2852,10 +2863,7 @@ def test_incorrect_value_exception(self):
28522863
)
28532864
def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning):
28542865
# see gh-23830
2855-
msg = (
2856-
"Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 "
2857-
"present at position 0"
2858-
)
2866+
msg = "Out of bounds nanosecond timestamp: 2417-10-27 00:00:00"
28592867
with pytest.raises(OutOfBoundsDatetime, match=msg):
28602868
with tm.assert_produces_warning(warning, match="Could not infer format"):
28612869
to_datetime("2417-10-27 00:00:00", format=format)

0 commit comments

Comments
 (0)