Skip to content

PERF: Improve perf of to_datetime with ISO format #10615

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 20, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ Performance Improvements
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
- Improved performance of ``to_datetime`` when the specified format string is ISO8601 (:issue:`10178`)


.. _whatsnew_0170.bug_fixes:
Expand Down
24 changes: 19 additions & 5 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,8 +919,8 @@ def test_to_datetime_with_apply(self):
assert_series_equal(result, expected)

td = pd.Series(['May 04', 'Jun 02', ''], index=[1,2,3])
self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y'))
self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y'))
self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y', errors='raise'))
self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y', errors='raise'))
expected = pd.to_datetime(td, format='%b %y', coerce=True)

result = td.apply(lambda x: pd.to_datetime(x, format='%b %y', coerce=True))
Expand Down Expand Up @@ -4197,6 +4197,20 @@ def test_to_datetime_format_YYYYMMDD(self):
expected = Series(['20121231','20141231','NaT'],dtype='M8[ns]')
assert_series_equal(result, expected)

# GH 10178
def test_to_datetime_format_integer(self):
s = Series([2000, 2001, 2002])
expected = Series([ Timestamp(x) for x in s.apply(str) ])

result = to_datetime(s,format='%Y')
assert_series_equal(result, expected)

s = Series([200001, 200105, 200206])
expected = Series([ Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) ])

result = to_datetime(s,format='%Y%m')
assert_series_equal(result, expected)

def test_to_datetime_format_microsecond(self):
val = '01-Apr-2011 00:00:01.978'
format = '%d-%b-%Y %H:%M:%S.%f'
Expand Down Expand Up @@ -4524,9 +4538,9 @@ def test_day_not_in_month_coerce_false_raise(self):

def test_day_not_in_month_coerce_false_ignore(self):
# With errors='ignore' and coerce=False, an impossible calendar date given
# without a format is expected to be handed back unchanged.
self.assertEqual(to_datetime('2015-02-29', errors='ignore', coerce=False), '2015-02-29')
# NOTE(review): the next three assertions expect ValueError for exactly the
# same calls that the final three assertions expect to return their input.
# Both sets cannot hold at once; presumably this view shows the replaced
# and the replacing lines together -- confirm which set is current.
self.assertRaises(ValueError, to_datetime, '2015-02-29', errors='ignore', format="%Y-%m-%d", coerce=False)
self.assertRaises(ValueError, to_datetime, '2015-02-32', errors='ignore', format="%Y-%m-%d", coerce=False)
self.assertRaises(ValueError, to_datetime, '2015-04-31', errors='ignore', format="%Y-%m-%d", coerce=False)
# Same inputs with an explicit format: the original string is expected
# back unchanged rather than an exception.
self.assertEqual(to_datetime('2015-02-29', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-02-29')
self.assertEqual(to_datetime('2015-02-32', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-02-32')
self.assertEqual(to_datetime('2015-04-31', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-04-31')

if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
39 changes: 23 additions & 16 deletions pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,21 +296,24 @@ def _convert_listlike(arg, box, format):
return result

arg = com._ensure_object(arg)
require_iso8601 = False

if infer_datetime_format and format is None:
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe this should be:

if format is None:
    if infer_datetime_format:
        .......
    else:
        # check for the iso format

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the ISO format check needs to be outside the `else`: it should apply whether the format was inferred or explicitly specified.


if format is not None:
# There is a special fast-path for iso8601 formatted
# datetime strings, so in those cases don't use the inferred
# format because this path makes process slower in this
# special case
format_is_iso8601 = (
'%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
'%Y-%m-%d %H:%M:%S.%f'.startswith(format)
)
if format_is_iso8601:
format = None
if format is not None:
# There is a special fast-path for iso8601 formatted
# datetime strings, so in those cases don't use the inferred
# format, because parsing with an explicit format is slower
# than the fast-path in this special case
format_is_iso8601 = (
('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
'%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and
format != '%Y'
)
if format_is_iso8601:
require_iso8601 = not infer_datetime_format
format = None

try:
result = None
Expand All @@ -334,16 +337,20 @@ def _convert_listlike(arg, box, format):
raise
result = arg
except ValueError:
# Only raise this error if the user provided the
# datetime format, and not when it was inferred
# if format was inferred, try falling back
# to array_to_datetime - terminate here
# for specified formats
if not infer_datetime_format:
raise
if errors == 'raise':
raise
result = arg

if result is None and (format is None or infer_datetime_format):
result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
result = tslib.array_to_datetime(arg, raise_=errors=='raise',
utc=utc, dayfirst=dayfirst,
yearfirst=yearfirst, freq=freq,
coerce=coerce, unit=unit)
coerce=coerce, unit=unit,
require_iso8601=require_iso8601)

if com.is_datetime64_dtype(result) and box:
result = DatetimeIndex(result, tz='utc' if utc else None)
Expand Down
16 changes: 14 additions & 2 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1808,7 +1808,8 @@ cpdef object _get_rule_month(object source, object default='DEC'):

cpdef array_to_datetime(ndarray[object] values, raise_=False,
dayfirst=False, yearfirst=False, freq=None,
format=None, utc=None, coerce=False, unit=None):
format=None, utc=None, coerce=False, unit=None,
require_iso8601=False):
cdef:
Py_ssize_t i, n = len(values)
object val, py_dt
Expand Down Expand Up @@ -1908,6 +1909,17 @@ cpdef array_to_datetime(ndarray[object] values, raise_=False,
iresult[i] = value
_check_dts_bounds(&dts)
except ValueError:
# if requiring iso8601 strings, skip trying other formats
if require_iso8601:
if coerce:
iresult[i] = iNaT
continue
elif raise_:
raise ValueError("time data %r does match format specified" %
(val,))
else:
return values

try:
py_dt = parse_datetime_string(val, dayfirst=dayfirst,
yearfirst=yearfirst, freq=freq)
Expand Down Expand Up @@ -1971,7 +1983,7 @@ cpdef array_to_datetime(ndarray[object] values, raise_=False,
continue
try:
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
yearfirst=yearfirst, freq=freq)
yearfirst=yearfirst, freq=freq)
_pydatetime_to_dts(oresult[i], &dts)
_check_dts_bounds(&dts)
except Exception:
Expand Down
4 changes: 4 additions & 0 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ def date_range(start=None, end=None, periods=None, freq=None):
Benchmark('to_datetime(strings)', setup,
start_date=datetime(2012, 7, 11))

# Benchmark: to_datetime on strings with an explicitly specified ISO8601
# format string.
timeseries_to_datetime_iso8601_format = Benchmark(
    "to_datetime(strings, format='%Y-%m-%d %H:%M:%S')",
    setup,
    start_date=datetime(2012, 7, 11))

setup = common_setup + """
rng = date_range('1/1/2000', periods=10000, freq='D')
strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
Expand Down