# Reviewer preamble. Everything above the first "diff --git" header is
# commentary and is not part of any hunk.
#
# NOTE(review): in this copy of the patch the line breaks and indentation
# have been collapsed, so each hunk body runs together on a few long lines.
# Presumably the original layout must be restored before the patch can be
# applied -- TODO confirm against the upstream commit.
#
# What the patch changes, file by file (as visible in the hunks below):
#   * doc/source/v0.15.0.txt: adds a "Performance" entry about faster
#     ``DatetimeIndex.__iter__`` (issue 7683).
#   * pandas/core/base.py: adds an ``__iter__`` that returns a generator of
#     ``self._box_func(v)`` for every ``v`` in ``self.asi8``.
#   * pandas/tseries/index.py: adds a generator ``__iter__`` that walks
#     ``self.asi8`` in slices of 10000 values, converts each slice with
#     ``tslib.ints_to_pydatetime(..., tz=self.tz, offset=self.offset,
#     box=True)`` and yields the converted values one by one; removes the old
#     ``__iter__`` that returned ``iter(self.asobject)``.
#   * pandas/tseries/period.py: removes the ``__iter__`` that yielded
#     ``Period(ordinal=val, freq=self.freq)`` per value (presumably the
#     base.py ``__iter__`` is meant to take over -- verify the class
#     hierarchy).
#   * pandas/tseries/tests/test_timezones.py: drops one line inside
#     ``test_timestamp_equality_different_timezones``; it appears to be a
#     blank line -- TODO confirm.
#   * pandas/tslib.pyx: adds the ``cdef inline`` helpers
#     ``create_timestamp_from_ts`` and ``create_datetime_from_ts``;
#     ``ints_to_pydatetime`` gains ``offset=None`` and ``box=False``, selects
#     one of the helpers through the ``func_create`` function pointer,
#     converts a string ``offset`` with ``to_offset`` when ``box`` is true,
#     and stores ``NaT`` instead of ``np.nan`` for ``iNaT`` entries.
#   * vb_suite/timeseries.py: adds four benchmarks that iterate a
#     ``date_range`` / ``period_range`` of N = 1000000 periods, two of them
#     with an early exit driven by M = 10000.
#
# Review notes on the added code:
#   * The new ``DatetimeIndex.__iter__`` docstring documents the return value
#     as "Timestamps : ndarray", but the body uses ``yield``, so callers get
#     a generator.
#   * ``chunks = int(l / chunksize) + 1`` schedules one extra, empty slice
#     whenever ``len(self)`` is an exact multiple of 10000 (including 0);
#     that last pass calls ``ints_to_pydatetime`` on an empty slice.
#   * In the benchmark helper ``iter_n`` the ``i > n`` test runs after the
#     increment, so the loop consumes n + 1 items before breaking.
#   * NOTE(review): with ``box=True`` the tzlocal / fixed-offset branch
#     computes ``dt + tz.utcoffset(dt)`` on the boxed object returned by
#     ``func_create``; whether the manually assigned ``value`` / ``offset``
#     survive that addition cannot be told from this patch -- TODO confirm.
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index f305d088e996f..5a348025d0185 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -150,6 +150,7 @@ Enhancements Performance ~~~~~~~~~~~ +- Performance improvements in ``DatetimeIndex.__iter__`` to allow faster iteration (:issue:`7683`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 81e13687441de..72fcfbff677ab 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -390,6 +390,9 @@ def _ops_compat(self, name, op_accessor): is_year_start = _field_accessor('is_year_start', "Logical indicating if first day of year (defined by frequency)") is_year_end = _field_accessor('is_year_end', "Logical indicating if last day of year (defined by frequency)") + def __iter__(self): + return (self._box_func(v) for v in self.asi8) + @property def _box_func(self): """ diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 70cd95341611f..dca2947f6a7a6 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1093,6 +1093,27 @@ def __array_finalize__(self, obj): self.name = getattr(obj, 'name', None) self._reset_identity() + def __iter__(self): + """ + Return an iterator over the boxed values + + Returns + ------- + Timestamps : ndarray + """ + + # convert in chunks of 10k for efficiency + data = self.asi8 + l = len(self) + chunksize = 10000 + chunks = int(l / chunksize) + 1 + for i in range(chunks): + start_i = i*chunksize + end_i = min((i+1)*chunksize,l) + converted = tslib.ints_to_pydatetime(data[start_i:end_i], tz=self.tz, offset=self.offset, box=True) + for v in converted: + yield v + def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None if self.tz != other.tz: @@ -1476,9 +1497,6 @@ def normalize(self): return DatetimeIndex(new_values, freq='infer', name=self.name, tz=self.tz) - def __iter__(self): - return iter(self.asobject) - def searchsorted(self, key, side='left'): if isinstance(key, np.ndarray): key = 
np.array(key, dtype=_NS_DTYPE, copy=False) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 5948fbf8e5fa7..8c4bb2f5adc5e 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -738,10 +738,6 @@ def astype(self, dtype): return Index(self.values, dtype) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) - def __iter__(self): - for val in self.values: - yield Period(ordinal=val, freq=self.freq) - def searchsorted(self, key, side='left'): if isinstance(key, compat.string_types): key = Period(key, freq=self.freq).ordinal diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 9c374716a84ee..531724cdb6837 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -1027,7 +1027,6 @@ def test_intersection(self): def test_timestamp_equality_different_timezones(self): utc_range = date_range('1/1/2000', periods=20, tz='UTC') - eastern_range = utc_range.tz_convert('US/Eastern') berlin_range = utc_range.tz_convert('Europe/Berlin') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 2fd71521b24d5..c06d8a3ba9a05 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -74,41 +74,72 @@ try: except NameError: # py3 basestring = str -def ints_to_pydatetime(ndarray[int64_t] arr, tz=None): +cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset): + cdef _Timestamp ts_base + ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, + dts.day, dts.hour, dts.min, + dts.sec, dts.us, tz) + + ts_base.value = value + ts_base.offset = offset + ts_base.nanosecond = dts.ps / 1000 + + return ts_base + +cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset): + return datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + +def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): + # convert an i8 
repr to an ndarray of datetimes or Timestamp (if box == True) + cdef: Py_ssize_t i, n = len(arr) pandas_datetimestruct dts + object dt + int64_t value ndarray[object] result = np.empty(n, dtype=object) + object (*func_create)(int64_t, pandas_datetimestruct, object, object) + + if box and util.is_string_object(offset): + from pandas.tseries.frequencies import to_offset + offset = to_offset(offset) + + if box: + func_create = create_timestamp_from_ts + else: + func_create = create_datetime_from_ts if tz is not None: if _is_utc(tz): for i in range(n): - if arr[i] == iNaT: - result[i] = np.nan + value = arr[i] + if value == iNaT: + result[i] = NaT else: - pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) - result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + result[i] = func_create(value, dts, tz, offset) elif _is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): - if arr[i] == iNaT: - result[i] = np.nan + value = arr[i] + if value == iNaT: + result[i] = NaT else: - pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + dt = func_create(value, dts, tz, offset) result[i] = dt + tz.utcoffset(dt) else: trans = _get_transitions(tz) deltas = _get_deltas(tz) for i in range(n): - if arr[i] == iNaT: - result[i] = np.nan + value = arr[i] + if value == iNaT: + result[i] = NaT else: # Adjust datetime64 timestamp, recompute datetimestruct - pos = trans.searchsorted(arr[i], side='right') - 1 + pos = trans.searchsorted(value, side='right') - 1 if _treat_tz_as_pytz(tz): # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos]] @@ -116,19 +147,17 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None): # no zone-name change for dateutil tzs - dst etc represented 
in single object. new_tz = tz - pandas_datetime_to_datetimestruct(arr[i] + deltas[pos], - PANDAS_FR_ns, &dts) - result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, - new_tz) + pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts) + result[i] = func_create(value, dts, new_tz, offset) else: for i in range(n): - if arr[i] == iNaT: - result[i] = np.nan + + value = arr[i] + if value == iNaT: + result[i] = NaT else: - pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) - result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us) + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) + result[i] = func_create(value, dts, None, offset) return result @@ -183,6 +212,7 @@ class Timestamp(_Timestamp): def utcnow(cls): return cls.now('UTC') + def __new__(cls, object ts_input, object offset=None, tz=None, unit=None): cdef _TSObject ts cdef _Timestamp ts_base diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index 2b63eeaf99550..bb55b88cf1f34 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -333,3 +333,28 @@ def date_range(start=None, end=None, periods=None, freq=None): timeseries_is_month_start = Benchmark('rng.is_month_start', setup, start_date=datetime(2014, 4, 1)) + +#---------------------------------------------------------------------- +# iterate over DatetimeIndex/PeriodIndex +setup = common_setup + """ +N = 1000000 +M = 10000 +idx1 = date_range(start='20140101', freq='T', periods=N) +idx2 = period_range(start='20140101', freq='T', periods=N) + +def iter_n(iterable, n=None): + i = 0 + for _ in iterable: + i += 1 + if n is not None and i > n: + break +""" + +timeseries_iter_datetimeindex = Benchmark('iter_n(idx1)', setup) + +timeseries_iter_periodindex = Benchmark('iter_n(idx2)', setup) + +timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup) + +timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup) +