Skip to content

Commit b95c905

Browse files
committed
ENH: DataFrame.drop_duplicates and DataFrame.duplicated to remove duplicate rows, GH #319
1 parent 9100b1d commit b95c905

File tree

3 files changed

+113
-1
lines changed

3 files changed

+113
-1
lines changed

pandas/core/frame.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,54 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None):
13101310
new_labels = labels[mask]
13111311
return self.reindex(**{axis_name : new_labels})
13121312

1313+
def drop_duplicates(self, col_or_columns=None, take_last=False):
    """
    Return DataFrame with duplicate rows removed, optionally only
    considering certain columns

    Parameters
    ----------
    col_or_columns : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns
    take_last : boolean, default False
        Keep the last observed row of each group of duplicate rows.
        By default the first observed row is kept

    Returns
    -------
    deduplicated : DataFrame
    """
    duplicated = self.duplicated(col_or_columns, take_last=take_last)
    # Invert the boolean mask with ``~``: unary minus is not a supported
    # operation on boolean data in NumPy, logical inversion is.
    return self[~duplicated]
1332+
1333+
def duplicated(self, col_or_columns=None, take_last=False):
    """
    Return boolean Series denoting duplicate rows, optionally only
    considering certain columns

    Parameters
    ----------
    col_or_columns : column label or sequence of labels, optional
        Only consider certain columns for identifying duplicates, by
        default use all of the columns. Only a ``list`` is treated as
        several labels; any other value is used as a single column label
    take_last : boolean, default False
        Leave the last observed row of each group of duplicate rows
        unmarked. By default the first observed row is the one left
        unmarked

    Returns
    -------
    duplicated : Series
        Boolean values aligned with this frame's index
    """
    # Build one hashable key per row. The keys are materialised with
    # list() because the lookup routine is called with a concrete list
    # (NOTE(review): lib.duplicated is presumably the Cython routine
    # declared with a ``list`` argument -- a bare zip iterator would be
    # rejected there).
    if col_or_columns is None:
        # every column participates: transpose so zip yields row tuples
        keys = list(zip(*self.values.T))
    elif isinstance(col_or_columns, list):
        keys = list(zip(*[self[x] for x in col_or_columns]))
    else:
        keys = list(self[col_or_columns])

    duplicated = lib.duplicated(keys, take_last=take_last)
    return Series(duplicated, index=self.index)
1360+
13131361
#----------------------------------------------------------------------
13141362
# Sorting
13151363

pandas/src/groupby.pyx

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,33 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
557557

558558
return counts
559559

560+
def duplicated(list values, take_last=False):
    """
    Flag the elements of `values` that repeat an element met earlier in
    the scan.

    Parameters
    ----------
    values : list
        Elements to compare; each one is used for set membership, so it
        must be hashable
    take_last : boolean, default False
        If True the list is scanned from the end, so the last occurrence
        of each value is the one left unflagged. Otherwise the scan runs
        from the start and the first occurrence is left unflagged

    Returns
    -------
    result : ndarray of bool, same length as `values`
        True where the element is a repeat
    """
    cdef:
        Py_ssize_t i, n
        set seen = set()
        object row

    n = len(values)
    # Zero-initialised, so only the repeats have to be written below
    cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)

    if take_last:
        for i from n > i >= 0:
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen.add(row)
    else:
        for i from 0 <= i < n:
            row = values[i]
            if row in seen:
                result[i] = 1
            else:
                seen.add(row)

    # Reinterpret the uint8 flags as booleans without copying
    return result.view(np.bool_)
560587

561588
'''
562589

pandas/tests/test_frame.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2001,6 +2001,44 @@ def test_dropna_corner(self):
20012001
self.assertRaises(ValueError, self.frame.dropna, how='foo')
20022002
self.assertRaises(ValueError, self.frame.dropna, how=None)
20032003

2004+
def test_drop_duplicates(self):
    """
    Exercise drop_duplicates keyed on a single column, on several
    columns and on the whole frame, keeping either the first or the
    last row of each group of duplicates.
    """
    frame = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'bar', 'foo'],
                       'B' : ['one', 'one', 'two', 'two',
                              'two', 'two', 'one', 'two'],
                       'C' : [1, 1, 2, 2, 2, 2, 1, 2],
                       'D' : range(8)})

    # keyed on one column: first occurrences are rows 0 and 1
    deduped = frame.drop_duplicates('A')
    assert_frame_equal(deduped, frame[:2])

    # ... and the last occurrences are rows 6 and 7
    deduped = frame.drop_duplicates('A', take_last=True)
    assert_frame_equal(deduped, frame.ix[[6, 7]])

    # keyed on a list of columns
    deduped = frame.drop_duplicates(['A', 'B'])
    assert_frame_equal(deduped, frame.ix[[0, 1, 2, 3]])

    deduped = frame.drop_duplicates(['A', 'B'], take_last=True)
    assert_frame_equal(deduped, frame.ix[[0, 5, 6, 7]])

    # no key given: all columns are compared. In this fixture C is fully
    # determined by (A, B), so keying on ['A', 'B'] must give the same
    # answer as comparing every column of the A/B/C sub-frame.
    abc = frame.ix[:, ['A', 'B', 'C']]

    assert_frame_equal(abc.drop_duplicates(),
                       abc.drop_duplicates(['A', 'B']))

    assert_frame_equal(abc.drop_duplicates(take_last=True),
                       abc.drop_duplicates(['A', 'B'], take_last=True))
2041+
20042042
def test_fillna(self):
20052043
self.tsframe['A'][:5] = nan
20062044
self.tsframe['A'][-5:] = nan
@@ -3258,7 +3296,6 @@ def test_series_put_names(self):
32583296
for k, v in series.iteritems():
32593297
self.assertEqual(v.name, k)
32603298

3261-
32623299
class TestDataFrameJoin(unittest.TestCase):
32633300

32643301
def setUp(self):

0 commit comments

Comments
 (0)