Skip to content

NDFrame.filter -> NDFrame.select #26866

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 73 additions & 5 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4483,8 +4483,10 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,

def filter(self, items=None, like=None, regex=None, axis=None):
"""
Subset rows or columns of dataframe according to labels in
the specified index.
Filter columns or rows according to labels in the specified index.

.. deprecated:: 0.25.0
Use .select instead.

Note that this routine does not filter a dataframe on its
contents. The filter is applied to the labels of the index.
Expand Down Expand Up @@ -4540,10 +4542,76 @@ def filter(self, items=None, like=None, regex=None, axis=None):
one two three
rabbit 4 5 6
"""
msg = (".filter is deprecated and will be removed in the"
" future. Use .select instead.")
warnings.warn(msg, FutureWarning, stacklevel=2)
return self.select(items=items, like=like, regex=regex, axis=axis)

def select(self, items=None, like=None, regex=None, flags=0, axis=None):
"""
Select columns or rows according to labels in the specified index.

Note that this routine does not select on the contents of a dataframe.
The selection is applied to the labels of the requested axis.

Parameters
----------
items : list-like
Keep labels from axis which are in items.
like : str
Keep labels from axis for which ``like in label`` is True.
regex : str (regular expression)
Keep labels from axis for which ``re.search(regex, label)`` finds a match.
flags : int, default 0
Flags from the ``re`` module, e.g. ``re.IGNORECASE``. Only has an effect
when ``regex`` is given.
axis : int or string axis name
The axis to filter on. By default this is the info axis,
'index' for Series, 'columns' for DataFrame.

Returns
-------
same type as input object

See Also
--------
DataFrame.loc

Notes
-----
The ``items``, ``like``, and ``regex`` parameters are
enforced to be mutually exclusive.

``axis`` defaults to the info axis that is used when indexing
with ``[]``.

Examples
--------
>>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
... index=['mouse', 'rabbit'],
... columns=['one', 'two', 'three'])

>>> # select columns by name
>>> df.select(items=['one', 'three'])
one three
mouse 1 3
rabbit 4 6

>>> # select columns by regular expression
>>> df.select(regex='e$', axis=1)
one three
mouse 1 3
rabbit 4 6

>>> # select rows containing 'bbi'
>>> df.select(like='bbi', axis=0)
one two three
rabbit 4 5 6
"""
import re

nkw = com.count_not_none(items, like, regex)
if nkw > 1:
num_not_none = com.count_not_none(items, like, regex)
if num_not_none > 1:
raise TypeError('Keyword arguments `items`, `like`, or `regex` '
'are mutually exclusive')

Expand All @@ -4563,7 +4631,7 @@ def f(x):
elif regex:
def f(x):
return matcher.search(ensure_str(x)) is not None
matcher = re.compile(regex)
matcher = re.compile(regex, flags=flags)
values = labels.map(f)
return self.loc(axis=axis)[values]
else:
Expand Down
105 changes: 61 additions & 44 deletions pandas/tests/frame/test_axis_select_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
from pandas.util.testing import assert_frame_equal


def test_filter_deprecated(int_frame):
with tm.assert_produces_warning(FutureWarning):
int_frame.filter(like='A')


class TestDataFrameSelectReindex(TestData):
# These are specific reindex-based tests; other indexing tests should go in
# test_indexing
Expand Down Expand Up @@ -775,93 +780,105 @@ def test_align_series_combinations(self):
tm.assert_series_equal(res1, exp2)
tm.assert_frame_equal(res2, exp1)

def test_filter(self):
def test_select(self):
# Items
filtered = self.frame.filter(['A', 'B', 'E'])
assert len(filtered.columns) == 2
assert 'E' not in filtered
selected = self.frame.select(['A', 'B', 'E'])
assert len(selected.columns) == 2
assert 'E' not in selected

filtered = self.frame.filter(['A', 'B', 'E'], axis='columns')
assert len(filtered.columns) == 2
assert 'E' not in filtered
selected = self.frame.select(['A', 'B', 'E'], axis='columns')
assert len(selected.columns) == 2
assert 'E' not in selected

# Other axis
idx = self.frame.index[0:4]
filtered = self.frame.filter(idx, axis='index')
selected = self.frame.select(idx, axis='index')
expected = self.frame.reindex(index=idx)
tm.assert_frame_equal(filtered, expected)
tm.assert_frame_equal(selected, expected)

# like
fcopy = self.frame.copy()
fcopy['AA'] = 1

filtered = fcopy.filter(like='A')
assert len(filtered.columns) == 2
assert 'AA' in filtered
selected = fcopy.select(like='A')
assert len(selected.columns) == 2
assert 'AA' in selected

# like with ints in column names
df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B'])
filtered = df.filter(like='_')
assert len(filtered.columns) == 2
selected = df.select(like='_')
assert len(selected.columns) == 2

# regex with ints in column names
# from PR #10384
df = DataFrame(0., index=[0, 1, 2], columns=['A1', 1, 'B', 2, 'C'])
expected = DataFrame(
0., index=[0, 1, 2], columns=pd.Index([1, 2], dtype=object))
filtered = df.filter(regex='^[0-9]+$')
tm.assert_frame_equal(filtered, expected)
selected = df.select(regex='^[0-9]+$')
tm.assert_frame_equal(selected, expected)

expected = DataFrame(0., index=[0, 1, 2], columns=[0, '0', 1, '1'])
# shouldn't remove anything
filtered = expected.filter(regex='^[0-9]+$')
tm.assert_frame_equal(filtered, expected)
selected = expected.select(regex='^[0-9]+$')
tm.assert_frame_equal(selected, expected)

# pass in None
with pytest.raises(TypeError, match='Must pass'):
self.frame.filter()
self.frame.select()
with pytest.raises(TypeError, match='Must pass'):
self.frame.filter(items=None)
self.frame.select(items=None)
with pytest.raises(TypeError, match='Must pass'):
self.frame.filter(axis=1)
self.frame.select(axis=1)

# test mutually exclusive arguments
with pytest.raises(TypeError, match='mutually exclusive'):
self.frame.filter(items=['one', 'three'], regex='e$', like='bbi')
self.frame.select(items=['one', 'three'], regex='e$', like='bbi')
with pytest.raises(TypeError, match='mutually exclusive'):
self.frame.filter(items=['one', 'three'], regex='e$', axis=1)
self.frame.select(items=['one', 'three'], regex='e$', axis=1)
with pytest.raises(TypeError, match='mutually exclusive'):
self.frame.filter(items=['one', 'three'], regex='e$')
self.frame.select(items=['one', 'three'], regex='e$')
with pytest.raises(TypeError, match='mutually exclusive'):
self.frame.filter(items=['one', 'three'], like='bbi', axis=0)
self.frame.select(items=['one', 'three'], like='bbi', axis=0)
with pytest.raises(TypeError, match='mutually exclusive'):
self.frame.filter(items=['one', 'three'], like='bbi')
self.frame.select(items=['one', 'three'], like='bbi')

# objects
filtered = self.mixed_frame.filter(like='foo')
assert 'foo' in filtered
selected = self.mixed_frame.select(like='foo')
assert 'foo' in selected

# unicode columns, won't ascii-encode
df = self.frame.rename(columns={'B': '\u2202'})
filtered = df.filter(like='C')
assert 'C' in filtered
selected = df.select(like='C')
assert 'C' in selected

def test_select_regex_search(self):
import re

def test_filter_regex_search(self):
fcopy = self.frame.copy()
fcopy['AA'] = 1

# regex
filtered = fcopy.filter(regex='[A]+')
assert len(filtered.columns) == 2
assert 'AA' in filtered
selected = fcopy.select(regex='[A]+')
assert len(selected.columns) == 2
assert 'AA' in selected

# regex, ignore case
selected = fcopy.select(regex='[a]+', flags=re.IGNORECASE)
assert len(selected.columns) == 2
assert 'AA' in selected

# doesn't have to be at beginning
df = DataFrame({'aBBa': [1, 2],
'BBaBB': [1, 2],
'aCCa': [1, 2],
'aCCaBB': [1, 2]})

result = df.filter(regex='BB')
result = df.select(regex='BB')
exp = df[[x for x in df.columns if 'BB' in x]]
assert_frame_equal(result, exp)

# ignore case
result = df.select(regex='bb', flags=re.IGNORECASE)
exp = df[[x for x in df.columns if 'BB' in x]]
assert_frame_equal(result, exp)

Expand All @@ -870,29 +887,29 @@ def test_filter_regex_search(self):
('a', DataFrame({'a': [1, 2]})),
('あ', DataFrame({'あ': [3, 4]}))
])
def test_filter_unicode(self, name, expected):
def test_select_unicode(self, name, expected):
# GH13101
df = DataFrame({'a': [1, 2], 'あ': [3, 4]})

assert_frame_equal(df.filter(like=name), expected)
assert_frame_equal(df.filter(regex=name), expected)
assert_frame_equal(df.select(like=name), expected)
assert_frame_equal(df.select(regex=name), expected)

@pytest.mark.parametrize('name', ['a', 'a'])
def test_filter_bytestring(self, name):
def test_select_bytestring(self, name):
# GH13101
df = DataFrame({b'a': [1, 2], b'b': [3, 4]})
expected = DataFrame({b'a': [1, 2]})

assert_frame_equal(df.filter(like=name), expected)
assert_frame_equal(df.filter(regex=name), expected)
assert_frame_equal(df.select(like=name), expected)
assert_frame_equal(df.select(regex=name), expected)

def test_filter_corner(self):
def test_select_corner(self):
empty = DataFrame()

result = empty.filter([])
result = empty.select([])
assert_frame_equal(result, empty)

result = empty.filter(like='foo')
result = empty.select(like='foo')
assert_frame_equal(result, empty)

def test_take(self):
Expand Down