From 1f0199033e2e72e51fc412373fe48c8d6be38797 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 18 Jul 2015 12:19:10 -0500 Subject: [PATCH] PERF: Improve perf of to_datetime with ISO format --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/tseries/tests/test_timeseries.py | 24 +++++++++++---- pandas/tseries/tools.py | 39 +++++++++++++++---------- pandas/tslib.pyx | 16 ++++++++-- vb_suite/timeseries.py | 4 +++ 5 files changed, 61 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 6ab299eb70eb5..206c5e2e22711 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -328,6 +328,7 @@ Performance Improvements - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`) - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`) - 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`) +- Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`) .. 
_whatsnew_0170.bug_fixes: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 166760678f3ab..9703accc42695 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -919,8 +919,8 @@ def test_to_datetime_with_apply(self): assert_series_equal(result, expected) td = pd.Series(['May 04', 'Jun 02', ''], index=[1,2,3]) - self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y')) - self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y')) + self.assertRaises(ValueError, lambda : pd.to_datetime(td,format='%b %y', errors='raise')) + self.assertRaises(ValueError, lambda : td.apply(pd.to_datetime, format='%b %y', errors='raise')) expected = pd.to_datetime(td, format='%b %y', coerce=True) result = td.apply(lambda x: pd.to_datetime(x, format='%b %y', coerce=True)) @@ -4197,6 +4197,20 @@ def test_to_datetime_format_YYYYMMDD(self): expected = Series(['20121231','20141231','NaT'],dtype='M8[ns]') assert_series_equal(result, expected) + # GH 10178 + def test_to_datetime_format_integer(self): + s = Series([2000, 2001, 2002]) + expected = Series([ Timestamp(x) for x in s.apply(str) ]) + + result = to_datetime(s,format='%Y') + assert_series_equal(result, expected) + + s = Series([200001, 200105, 200206]) + expected = Series([ Timestamp(x[:4] + '-' + x[4:]) for x in s.apply(str) ]) + + result = to_datetime(s,format='%Y%m') + assert_series_equal(result, expected) + def test_to_datetime_format_microsecond(self): val = '01-Apr-2011 00:00:01.978' format = '%d-%b-%Y %H:%M:%S.%f' @@ -4524,9 +4538,9 @@ def test_day_not_in_month_coerce_false_raise(self): def test_day_not_in_month_coerce_false_ignore(self): self.assertEqual(to_datetime('2015-02-29', errors='ignore', coerce=False), '2015-02-29') - self.assertRaises(ValueError, to_datetime, '2015-02-29', errors='ignore', format="%Y-%m-%d", coerce=False) - self.assertRaises(ValueError, to_datetime, '2015-02-32', 
errors='ignore', format="%Y-%m-%d", coerce=False) - self.assertRaises(ValueError, to_datetime, '2015-04-31', errors='ignore', format="%Y-%m-%d", coerce=False) + self.assertEqual(to_datetime('2015-02-29', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-02-29') + self.assertEqual(to_datetime('2015-02-32', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-02-32') + self.assertEqual(to_datetime('2015-04-31', errors='ignore', format="%Y-%m-%d", coerce=False), '2015-04-31') if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 5ff6a48981ceb..6a1dd934d6bce 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -296,21 +296,24 @@ def _convert_listlike(arg, box, format): return result arg = com._ensure_object(arg) + require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - if format is not None: - # There is a special fast-path for iso8601 formatted - # datetime strings, so in those cases don't use the inferred - # format because this path makes process slower in this - # special case - format_is_iso8601 = ( - '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or - '%Y-%m-%d %H:%M:%S.%f'.startswith(format) - ) - if format_is_iso8601: - format = None + if format is not None: + # There is a special fast-path for iso8601 formatted + # datetime strings, so in those cases don't use the inferred + # format because this path makes process slower in this + # special case + format_is_iso8601 = ( + ('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or + '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and + format != '%Y' + ) + if format_is_iso8601: + require_iso8601 = not infer_datetime_format + format = None try: result = None @@ -334,16 +337,20 @@ def _convert_listlike(arg, box, format): raise result = arg except ValueError: - # Only raise this error if the user provided the - # 
datetime format, and not when it was inferred + # if format was inferred, try falling back + # to array_to_datetime - terminate here + # for specified formats if not infer_datetime_format: - raise + if errors == 'raise': + raise + result = arg if result is None and (format is None or infer_datetime_format): - result = tslib.array_to_datetime(arg, raise_=errors == 'raise', + result = tslib.array_to_datetime(arg, raise_=errors=='raise', utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, - coerce=coerce, unit=unit) + coerce=coerce, unit=unit, + require_iso8601=require_iso8601) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index a2fc9b07b16a1..da7cc05621775 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1808,7 +1808,8 @@ cpdef object _get_rule_month(object source, object default='DEC'): cpdef array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False, yearfirst=False, freq=None, - format=None, utc=None, coerce=False, unit=None): + format=None, utc=None, coerce=False, unit=None, + require_iso8601=False): cdef: Py_ssize_t i, n = len(values) object val, py_dt @@ -1908,6 +1909,17 @@ cpdef array_to_datetime(ndarray[object] values, raise_=False, iresult[i] = value _check_dts_bounds(&dts) except ValueError: + # if requiring iso8601 strings, skip trying other formats + if require_iso8601: + if coerce: + iresult[i] = iNaT + continue + elif raise_: + raise ValueError("time data %r does not match format specified" % + (val,)) + else: + return values + try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq) @@ -1971,7 +1983,7 @@ cpdef array_to_datetime(ndarray[object] values, raise_=False, continue try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst, freq=freq) _pydatetime_to_dts(oresult[i], &dts) _check_dts_bounds(&dts) except Exception: diff --git 
a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 57fb1ada78691..75147e079bb65 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -157,6 +157,10 @@ def date_range(start=None, end=None, periods=None, freq=None): Benchmark('to_datetime(strings)', setup, start_date=datetime(2012, 7, 11)) +timeseries_to_datetime_iso8601_format = \ + Benchmark("to_datetime(strings, format='%Y-%m-%d %H:%M:%S')", setup, + start_date=datetime(2012, 7, 11)) + setup = common_setup + """ rng = date_range('1/1/2000', periods=10000, freq='D') strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)