From bfba8dbd1c350c3eb8de34d6efcc43ca51dd303a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 11 Jan 2018 11:32:49 -0800 Subject: [PATCH] Hand written ISO parser for Timedelta construction --- pandas/_libs/tslibs/timedeltas.pyx | 123 +++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8dba8c15f0b81..b2c9c464c7cbf 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- # cython: profile=False import collections -import re import sys cdef bint PY3 = (sys.version_info[0] >= 3) @@ -236,6 +235,14 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: return (base *m) + (frac *m) +cdef inline _decode_if_necessary(object ts): + # decode ts if necessary + if not PyUnicode_Check(ts) and not PY3: + ts = str(ts).decode('utf-8') + + return ts + + cdef inline parse_timedelta_string(object ts): """ Parse a regular format timedelta string. Return an int64_t (in ns) @@ -258,9 +265,7 @@ cdef inline parse_timedelta_string(object ts): if len(ts) == 0 or ts in nat_strings: return NPY_NAT - # decode ts if necessary - if not PyUnicode_Check(ts) and not PY3: - ts = str(ts).decode('utf-8') + ts = _decode_if_necessary(ts) for c in ts: @@ -507,26 +512,14 @@ def _binary_op_method_timedeltalike(op, name): # ---------------------------------------------------------------------- # Timedelta Construction -iso_pater = re.compile(r"""P - (?P-?[0-9]*)DT - (?P[0-9]{1,2})H - (?P[0-9]{1,2})M - (?P[0-9]{0,2}) - (\. - (?P[0-9]{1,3}) - (?P[0-9]{0,3}) - (?P[0-9]{0,3}) - )?S""", re.VERBOSE) - - -cdef int64_t parse_iso_format_string(object iso_fmt) except? -1: +cdef inline int64_t parse_iso_format_string(object ts) except? -1: """ Extracts and cleanses the appropriate values from a match object with groups for each component of an ISO 8601 duration Parameters ---------- - iso_fmt: + ts: ISO 8601 Duration formatted string Returns @@ -537,25 +530,93 @@ cdef int64_t parse_iso_format_string(object iso_fmt) except? -1: Raises ------ ValueError - If ``iso_fmt`` cannot be parsed + If ``ts`` cannot be parsed """ - cdef int64_t ns = 0 + cdef: + unicode c + int64_t result = 0, r + int p=0 + object dec_unit = 'ms', err_msg + bint have_dot=0, have_value=0, neg=0 + list number=[], unit=[] - match = re.match(iso_pater, iso_fmt) - if match: - match_dict = match.groupdict(default='0') - for comp in ['milliseconds', 'microseconds', 'nanoseconds']: - match_dict[comp] = '{:0<3}'.format(match_dict[comp]) + ts = _decode_if_necessary(ts) - for k, v in match_dict.items(): - ns += timedelta_from_spec(v, '0', k) + err_msg = "Invalid ISO 8601 Duration format - {}".format(ts) - else: - raise ValueError("Invalid ISO 8601 Duration format - " - "{}".format(iso_fmt)) + for c in ts: + # number (ascii codes) + if ord(c) >= 48 and ord(c) <= 57: + + have_value = 1 + if have_dot: + if p == 3 and dec_unit != 'ns': + unit.append(dec_unit) + if dec_unit == 'ms': + dec_unit = 'us' + elif dec_unit == 'us': + dec_unit = 'ns' + p = 0 + p += 1 + + if not len(unit): + number.append(c) + else: + # if in days, pop trailing T + if unit[-1] == 'T': + unit.pop() + elif 'H' in unit or 'M' in unit: + if len(number) > 2: + raise ValueError(err_msg) + r = timedelta_from_spec(number, '0', unit) + result += timedelta_as_neg(r, neg) - return ns + neg = 0 + unit, number = [], [c] + else: + if c == 'P': + pass # ignore leading character + elif c == '-': + if neg or have_value: + raise ValueError(err_msg) + else: + neg = 1 + elif c in ['D', 'T', 'H', 'M']: + unit.append(c) + elif c == '.': + # append any seconds + if len(number): + r = timedelta_from_spec(number, '0', 'S') + result += timedelta_as_neg(r, neg) + unit, number = [], [] + have_dot = 1 + elif c == 'S': + if have_dot: # ms, us, or ns + if not len(number) or p > 3: + raise ValueError(err_msg) + # pad to 3 digits as required + pad = 3 - p + while pad > 0: + number.append('0') + pad -= 1 + + r = timedelta_from_spec(number, '0', dec_unit) + result += timedelta_as_neg(r, neg) + else: # seconds + if len(number) <= 2: + r = timedelta_from_spec(number, '0', 'S') + result += timedelta_as_neg(r, neg) + else: + raise ValueError(err_msg) + else: + raise ValueError(err_msg) + + if not have_value: + # Received string only - never parsed any values + raise ValueError(err_msg) + + return result cdef _to_py_int_float(v):