Skip to content

Commit 3b39fc0

Browse files
committed
Merge remote-tracking branch 'upstream/master' into quantile_regression
2 parents 9279716 + 19e8fcc commit 3b39fc0

File tree

160 files changed

+3490
-2318
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

160 files changed

+3490
-2318
lines changed

asv_bench/benchmarks/categoricals.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def setup(self):
3434
self.values_all_int8 = np.ones(N, "int8")
3535
self.categorical = pd.Categorical(self.values, self.categories)
3636
self.series = pd.Series(self.categorical)
37+
self.intervals = pd.interval_range(0, 1, periods=N // 10)
3738

3839
def time_regular(self):
3940
pd.Categorical(self.values, self.categories)
@@ -44,6 +45,9 @@ def time_fastpath(self):
4445
def time_datetimes(self):
4546
pd.Categorical(self.datetimes)
4647

48+
def time_interval(self):
    """Benchmark building a ``Categorical`` from interval data.

    ``self.intervals`` (the ``pd.interval_range`` fixture assigned in
    ``setup``) is used as both the values and the categories. The method
    previously passed ``self.datetimes``, which left the interval fixture
    unused and meant no interval data was ever timed.
    """
    pd.Categorical(self.intervals, categories=self.intervals)
50+
4751
def time_datetimes_with_nat(self):
4852
pd.Categorical(self.datetimes_with_nat)
4953

asv_bench/benchmarks/io/json.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
13
import numpy as np
24

35
from pandas import DataFrame, concat, date_range, read_json, timedelta_range
@@ -82,6 +84,7 @@ def setup(self, orient, frame):
8284
timedeltas = timedelta_range(start=1, periods=N, freq="s")
8385
datetimes = date_range(start=1, periods=N, freq="s")
8486
ints = np.random.randint(100000000, size=N)
87+
longints = sys.maxsize * np.random.randint(100000000, size=N)
8588
floats = np.random.randn(N)
8689
strings = tm.makeStringIndex(N)
8790
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -120,6 +123,18 @@ def setup(self, orient, frame):
120123
index=index,
121124
)
122125

126+
self.df_longint_float_str = DataFrame(
127+
{
128+
"longint_1": longints,
129+
"longint_2": longints,
130+
"float_1": floats,
131+
"float_2": floats,
132+
"str_1": strings,
133+
"str_2": strings,
134+
},
135+
index=index,
136+
)
137+
123138
def time_to_json(self, orient, frame):
124139
getattr(self, frame).to_json(self.fname, orient=orient)
125140

@@ -172,6 +187,7 @@ def setup(self):
172187
timedeltas = timedelta_range(start=1, periods=N, freq="s")
173188
datetimes = date_range(start=1, periods=N, freq="s")
174189
ints = np.random.randint(100000000, size=N)
190+
longints = sys.maxsize * np.random.randint(100000000, size=N)
175191
floats = np.random.randn(N)
176192
strings = tm.makeStringIndex(N)
177193
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -209,6 +225,17 @@ def setup(self):
209225
},
210226
index=index,
211227
)
228+
self.df_longint_float_str = DataFrame(
229+
{
230+
"longint_1": longints,
231+
"longint_2": longints,
232+
"float_1": floats,
233+
"float_2": floats,
234+
"str_1": strings,
235+
"str_2": strings,
236+
},
237+
index=index,
238+
)
212239

213240
def time_floats_with_int_idex_lines(self):
214241
self.df.to_json(self.fname, orient="records", lines=True)
@@ -225,6 +252,9 @@ def time_float_int_lines(self):
225252
def time_float_int_str_lines(self):
226253
self.df_int_float_str.to_json(self.fname, orient="records", lines=True)
227254

255+
def time_float_longint_str_lines(self):
    """Time line-delimited JSON output of the longint/float/str frame."""
    frame = self.df_longint_float_str
    frame.to_json(self.fname, orient="records", lines=True)
257+
228258

229259
class ToJSONMem:
230260
def setup_cache(self):
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
ipython analogue:
3+
4+
tr = TimeResolution()
5+
mi = pd.MultiIndex.from_product(tr.params[:-1] + ([str(x) for x in tr.params[-1]],))
6+
df = pd.DataFrame(np.nan, index=mi, columns=["mean", "stdev"])
7+
8+
for unit in tr.params[0]:
9+
for size in tr.params[1]:
10+
for tz in tr.params[2]:
11+
tr.setup(unit, size, tz)
12+
key = (unit, size, str(tz))
13+
print(key)
14+
15+
val = %timeit -o tr.time_get_resolution(unit, size, tz)
16+
17+
df.loc[key] = (val.average, val.stdev)
18+
19+
"""
20+
from datetime import timedelta, timezone
21+
22+
from dateutil.tz import gettz, tzlocal
23+
import numpy as np
24+
import pytz
25+
26+
from pandas._libs.tslibs.resolution import get_resolution
27+
28+
29+
class TimeResolution:
    """Benchmark ``get_resolution`` across units, array sizes and timezones."""

    # Parameter grid, listed in the same order as ``param_names``.
    params = (
        ["D", "h", "m", "s", "us", "ns"],
        [1, 100, 10 ** 4, 10 ** 6],
        [
            None,
            timezone.utc,
            timezone(timedelta(minutes=60)),
            pytz.timezone("US/Pacific"),
            gettz("Asia/Tokyo"),
            tzlocal(),
        ],
    )
    param_names = ["unit", "size", "tz"]

    def setup(self, unit, size, tz):
        """Build ``size`` random int64 values for ``time_get_resolution``.

        The random integers are reinterpreted as ``datetime64[unit]``,
        converted to nanosecond precision, and then viewed as int64 again.
        """
        raw = np.random.randint(0, 10, size=size, dtype="i8")
        in_unit = raw.view(f"M8[{unit}]")
        in_nanos = in_unit.astype("M8[ns]")
        self.i8data = in_nanos.view("i8")

    def time_get_resolution(self, unit, size, tz):
        """Time ``get_resolution`` on the prepared int64 data with ``tz``."""
        get_resolution(self.i8data, tz)

asv_bench/benchmarks/tslibs/timestamp.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,29 @@
1-
import datetime
1+
from datetime import datetime, timedelta, timezone
22

3-
import dateutil
3+
from dateutil.tz import gettz, tzlocal, tzutc
44
import numpy as np
55
import pytz
66

77
from pandas import Timestamp
88

9+
# One case for each type of tzinfo object that has its own code path
10+
# in tzconversion code.
11+
_tzs = [
12+
None,
13+
pytz.timezone("Europe/Amsterdam"),
14+
gettz("US/Central"),
15+
pytz.UTC,
16+
tzutc(),
17+
timezone(timedelta(minutes=60)),
18+
tzlocal(),
19+
]
20+
921

1022
class TimestampConstruction:
1123
def setup(self):
1224
self.npdatetime64 = np.datetime64("2020-01-01 00:00:00")
13-
self.dttime_unaware = datetime.datetime(2020, 1, 1, 0, 0, 0)
14-
self.dttime_aware = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC)
25+
self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0)
26+
self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC)
1527
self.ts = Timestamp("2020-01-01 00:00:00")
1628

1729
def time_parse_iso8601_no_tz(self):
@@ -49,7 +61,6 @@ def time_from_pd_timestamp(self):
4961

5062

5163
class TimestampProperties:
52-
_tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()]
5364
_freqs = [None, "B"]
5465
params = [_tzs, _freqs]
5566
param_names = ["tz", "freq"]
@@ -63,9 +74,6 @@ def time_tz(self, tz, freq):
6374
def time_dayofweek(self, tz, freq):
6475
self.ts.dayofweek
6576

66-
def time_weekday_name(self, tz, freq):
67-
self.ts.day_name
68-
6977
def time_dayofyear(self, tz, freq):
7078
self.ts.dayofyear
7179

@@ -108,9 +116,12 @@ def time_microsecond(self, tz, freq):
108116
def time_month_name(self, tz, freq):
109117
self.ts.month_name()
110118

119+
def time_weekday_name(self, tz, freq):
    """Time ``day_name()`` on ``self.ts``; ``tz`` and ``freq`` are unused here."""
    stamp = self.ts
    stamp.day_name()
121+
111122

112123
class TimestampOps:
113-
params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()]
124+
params = _tzs
114125
param_names = ["tz"]
115126

116127
def setup(self, tz):
@@ -148,7 +159,7 @@ def time_ceil(self, tz):
148159

149160
class TimestampAcrossDst:
150161
def setup(self):
151-
dt = datetime.datetime(2016, 3, 27, 1)
162+
dt = datetime(2016, 3, 27, 1)
152163
self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo
153164
self.ts2 = Timestamp(dt)
154165

asv_bench/benchmarks/tslibs/tslib.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""
2+
ipython analogue:
3+
4+
tr = TimeIntsToPydatetime()
5+
mi = pd.MultiIndex.from_product(
6+
tr.params[:-1] + ([str(x) for x in tr.params[-1]],)
7+
)
8+
df = pd.DataFrame(np.nan, index=mi, columns=["mean", "stdev"])
9+
for box in tr.params[0]:
10+
for size in tr.params[1]:
11+
for tz in tr.params[2]:
12+
tr.setup(box, size, tz)
13+
key = (box, size, str(tz))
14+
print(key)
15+
val = %timeit -o tr.time_ints_to_pydatetime(box, size, tz)
16+
df.loc[key] = (val.average, val.stdev)
17+
"""
18+
from datetime import timedelta, timezone
19+
20+
from dateutil.tz import gettz, tzlocal
21+
import numpy as np
22+
import pytz
23+
24+
from pandas._libs.tslib import ints_to_pydatetime
25+
26+
_tzs = [
27+
None,
28+
timezone.utc,
29+
timezone(timedelta(minutes=60)),
30+
pytz.timezone("US/Pacific"),
31+
gettz("Asia/Tokyo"),
32+
tzlocal(),
33+
]
34+
_sizes = [0, 1, 100, 10 ** 4, 10 ** 6]
35+
36+
37+
class TimeIntsToPydatetime:
    """Benchmark ``ints_to_pydatetime`` across box types, sizes and timezones."""

    params = (
        ["time", "date", "datetime", "timestamp"],
        _sizes,
        _tzs,
    )
    param_names = ["box", "size", "tz"]
    # TODO: fold? freq?

    def setup(self, box, size, tz):
        """Store ``size`` random int64 values as the benchmark input."""
        self.i8data = np.random.randint(0, 10, size=size, dtype="i8")

    def time_ints_to_pydatetime(self, box, size, tz):
        """Time ``ints_to_pydatetime`` for the given ``box`` and ``tz``."""
        # ints_to_pydatetime does not allow non-None tz with date;
        # this will mean doing some duplicate benchmarks
        effective_tz = None if box == "date" else tz
        ints_to_pydatetime(self.i8data, effective_tz, box=box)

doc/source/user_guide/computation.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ For example, if we have the following ``DataFrame``:
561561
df
562562
563563
and we want to use an expanding window where ``use_expanding`` is ``True`` and otherwise a window of size
564-
1, we can create the following ``BaseIndexer``:
564+
1, we can create the following ``BaseIndexer`` subclass:
565565

566566
.. code-block:: ipython
567567
@@ -593,8 +593,22 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other
593593
3 3.0
594594
4 10.0
595595
596+
You can view other examples of ``BaseIndexer`` subclasses `here <https://github.com/pandas-dev/pandas/blob/master/pandas/core/window/indexers.py>`__.
597+
596598
.. versionadded:: 1.1
597599

600+
One subclass of note within those examples is ``VariableOffsetWindowIndexer``, which allows
601+
rolling operations over a non-fixed offset like a ``BusinessDay``.
602+
603+
.. ipython:: python
604+
605+
from pandas.api.indexers import VariableOffsetWindowIndexer
606+
df = pd.DataFrame(range(10), index=pd.date_range('2020', periods=10))
607+
offset = pd.offsets.BDay(1)
608+
indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset)
609+
df
610+
df.rolling(indexer).sum()
611+
598612
For some problems knowledge of the future is available for analysis. For example, this occurs when
599613
each data point is a full time series read from an experiment, and the task is to extract underlying
600614
conditions. In these cases it can be useful to perform forward-looking rolling window computations.

doc/source/user_guide/visualization.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,34 @@ shown by default.
11081108
11091109
plt.close('all')
11101110
1111+
1112+
Controlling the labels
1113+
~~~~~~~~~~~~~~~~~~~~~~
1114+
1115+
.. versionadded:: 1.1.0
1116+
1117+
You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labels
1118+
for the x and y axes. By default, pandas will pick up the index name as the xlabel, while leaving
1119+
it empty for ylabel.
1120+
1121+
.. ipython:: python
1122+
:suppress:
1123+
1124+
plt.figure()
1125+
1126+
.. ipython:: python
1127+
1128+
df.plot()
1129+
1130+
@savefig plot_xlabel_ylabel.png
1131+
df.plot(xlabel="new x", ylabel="new y")
1132+
1133+
.. ipython:: python
1134+
:suppress:
1135+
1136+
plt.close('all')
1137+
1138+
11111139
Scales
11121140
~~~~~~
11131141

0 commit comments

Comments
 (0)