From b2e87ea10cb44a7d298bd73fe11e7ad43ea11463 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Sat, 6 May 2017 17:29:48 -0700 Subject: [PATCH 1/2] add validate argument to merge --- doc/source/merging.rst | 54 ++++++++++++- doc/source/whatsnew/v0.21.0.txt | 7 +- pandas/core/frame.py | 18 ++++- pandas/core/reshape/merge.py | 59 +++++++++++++- pandas/tests/reshape/test_merge.py | 124 +++++++++++++++++++++++++++++ 5 files changed, 250 insertions(+), 12 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 170dde87c8363..c814f72cd41ad 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -513,7 +513,8 @@ standard database join operations between DataFrame objects: pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False) + suffixes=('_x', '_y'), copy=True, indicator=False, + validate=None) - ``left``: A DataFrame object - ``right``: Another DataFrame object @@ -551,6 +552,21 @@ standard database join operations between DataFrame objects: .. versionadded:: 0.17.0 +- ``validate`` : string, default None + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": checks if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": checks if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": checks if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + + .. versionadded:: 0.21.0 + + The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be ``DataFrame``. @@ -711,10 +727,42 @@ Here is another example with duplicate join keys in DataFrames: labels=['left', 'right'], vertical=False); plt.close('all'); + .. 
warning:: - Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, - may result in memory overflow. It is the user' s responsibility to manage duplicate values in keys before joining large DataFrames. + Joining / merging on duplicate keys can cause a returned frame that is the multiplication of the row dimensions, which may result in memory overflow. It is the user's responsibility to manage duplicate values in keys before joining large DataFrames. + +.. _merging.validation: + +Checking for duplicate keys +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.21.0 + +Users can use the ``validate`` argument to automatically check whether there are unexpected duplicates in their merge keys. Key uniqueness is checked before merge operations and so should protect against memory overflows. Checking key uniqueness is also a good way to ensure user data structures are as expected. + +In the following example, there are duplicate values of ``B`` in the right DataFrame. As this is not a one-to-one merge -- as specified in the ``validate`` argument -- an exception will be raised. + + +.. ipython:: python + + left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]}) + right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]}) + +.. code-block:: python + + In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one") + Out [53]: + --------------------------------------------------------------------------- + + MergeError: Merge keys are not unique in right dataset; not a one-to-one merge + +If the user is aware of the duplicates in the right `DataFrame` but wants to ensure there are no duplicates in the left DataFrame, one can use the `validate='one_to_many'` argument instead, which will not raise an exception. + +.. ipython:: python + + pd.merge(left, right, on='B', how='outer', validate="one_to_many") + .. 
_merging.indicator: diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 0a3a440ced54f..3734dc15be2e9 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,16 +25,15 @@ New features - Added `__fspath__` method to :class`:pandas.HDFStore`, :class:`pandas.ExcelFile`, and :class:`pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) - .. _whatsnew_0210.enhancements.other: Other Enhancements ^^^^^^^^^^^^^^^^^^ + +- The ``validate`` argument for :func:`merge` function now checks whether a merge is one-to-one, one-to-many, many-to-one, or many-to-many. If a merge is found to not be an example of specified merge type, an exception will be raised. For more, see :ref:`here <merging.validation>` (:issue:`16270`) - ``Series.to_dict()`` and ``DataFrame.to_dict()`` now support an ``into`` keyword which allows you to specify the ``collections.Mapping`` subclass that you would like returned. The default is ``dict``, which is backwards compatible. (:issue:`16122`) - ``RangeIndex.append`` now returns a ``RangeIndex`` object when possible (:issue:`16212`) - -- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, -this parameter is set to `HIGHEST_PROTOCOL `__ +- :func:`to_pickle` has gained a protocol parameter (:issue:`16252`). By default, this parameter is set to `HIGHEST_PROTOCOL `__ .. _whatsnew_0210.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d3d56c1e0331..78a369761afc1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -175,6 +175,19 @@ .. versionadded:: 0.17.0 +validate : string, default None + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. 
+ * "many_to_many" or "m:m": allowed, but does not result in checks. + + .. versionadded:: 0.21.0 + Examples -------- @@ -4868,12 +4881,13 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', @Appender(_merge_doc, indents=2) def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False): + suffixes=('_x', '_y'), copy=True, indicator=False, + validate=None): from pandas.core.reshape.merge import merge return merge(self, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator) + copy=copy, indicator=indicator, validate=validate) def round(self, decimals=0, *args, **kwargs): """ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7bf25e37340c4..b5c483a52f14f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -46,11 +46,13 @@ @Appender(_merge_doc, indents=0) def merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False): + suffixes=('_x', '_y'), copy=True, indicator=False, + validate=None): op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator) + copy=copy, indicator=indicator, + validate=validate) return op.get_result() @@ -341,6 +343,7 @@ def merge_asof(left, right, on=None, .. 
versionadded:: 0.20.0 + Returns ------- merged : DataFrame @@ -504,7 +507,8 @@ class _MergeOperation(object): def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False): + suffixes=('_x', '_y'), copy=True, indicator=False, + validate=None): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how @@ -567,6 +571,12 @@ def __init__(self, left, right, how='inner', on=None, # to avoid incompat dtypes self._maybe_coerce_merge_keys() + # If argument passed to validate, + # check if columns specified as unique + # are in fact unique. + if validate is not None: + self._validate(validate) + def get_result(self): if self.indicator: self.left, self.right = self._indicator_pre_merge( @@ -958,6 +968,49 @@ def _validate_specification(self): if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") + def _validate(self, validate): + + # Check uniqueness of each + if self.left_index: + left_unique = self.orig_left.index.is_unique + else: + left_unique = MultiIndex.from_arrays(self.left_join_keys + ).is_unique + + if self.right_index: + right_unique = self.orig_right.index.is_unique + else: + right_unique = MultiIndex.from_arrays(self.right_join_keys + ).is_unique + + # Check data integrity + if validate in ["one_to_one", "1:1"]: + if not left_unique and not right_unique: + raise ValueError("Merge keys are not unique in either left" + " or right dataset; not a one-to-one merge") + elif not left_unique: + raise ValueError("Merge keys are not unique in left dataset;" + " not a one-to-one merge") + elif not right_unique: + raise ValueError("Merge keys are not unique in right dataset;" + " not a one-to-one merge") + + elif validate in ["one_to_many", "1:m"]: + if not left_unique: + raise ValueError("Merge keys are not unique in left dataset;" + " not a one-to-many merge") + + elif 
validate in ["many_to_one", "m:1"]: + if not right_unique: + raise ValueError("Merge keys are not unique in right dataset;" + " not a many-to-one merge") + + elif validate in ['many_to_many', 'm:m']: + pass + + else: + raise ValueError("Not a valid argument for validate") + def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', **kwargs): diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index d3257243d7a2c..16c58354ad5c9 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -724,6 +724,130 @@ def test_indicator(self): how='outer', indicator=True) assert_frame_equal(test5, hand_coded_result) + def test_validation(self): + left = DataFrame({'a': ['a', 'b', 'c', 'd'], + 'b': ['cat', 'dog', 'weasel', 'horse']}, + index=range(4)) + + right = DataFrame({'a': ['a', 'b', 'c', 'd', 'e'], + 'c': ['meow', 'bark', 'um... weasel noise?', + 'nay', 'chirp']}, + index=range(5)) + + # Make sure no side effects. + left_copy = left.copy() + right_copy = right.copy() + + result = merge(left, right, left_index=True, right_index=True, + validate='1:1') + assert_frame_equal(left, left_copy) + assert_frame_equal(right, right_copy) + + # make sure merge still correct + expected = DataFrame({'a_x': ['a', 'b', 'c', 'd'], + 'b': ['cat', 'dog', 'weasel', 'horse'], + 'a_y': ['a', 'b', 'c', 'd'], + 'c': ['meow', 'bark', 'um... weasel noise?', + 'nay']}, + index=range(4), + columns=['a_x', 'b', 'a_y', 'c']) + + result = merge(left, right, left_index=True, right_index=True, + validate='one_to_one') + assert_frame_equal(result, expected) + + expected_2 = DataFrame({'a': ['a', 'b', 'c', 'd'], + 'b': ['cat', 'dog', 'weasel', 'horse'], + 'c': ['meow', 'bark', 'um... 
weasel noise?', + 'nay']}, + index=range(4)) + + result = merge(left, right, on='a', validate='1:1') + assert_frame_equal(left, left_copy) + assert_frame_equal(right, right_copy) + assert_frame_equal(result, expected_2) + + result = merge(left, right, on='a', validate='one_to_one') + assert_frame_equal(result, expected_2) + + # One index, one column + expected_3 = DataFrame({'b': ['cat', 'dog', 'weasel', 'horse'], + 'a': ['a', 'b', 'c', 'd'], + 'c': ['meow', 'bark', 'um... weasel noise?', + 'nay']}, + columns=['b', 'a', 'c'], + index=range(4)) + + left_index_reset = left.set_index('a') + result = merge(left_index_reset, right, left_index=True, + right_on='a', validate='one_to_one') + assert_frame_equal(result, expected_3) + + # Dups on right + right_w_dups = right.append(pd.DataFrame({'a': ['e'], 'c': ['moo']}, + index=[4])) + merge(left, right_w_dups, left_index=True, right_index=True, + validate='one_to_many') + + with pytest.raises(ValueError): + merge(left, right_w_dups, left_index=True, right_index=True, + validate='one_to_one') + + with pytest.raises(ValueError): + merge(left, right_w_dups, on='a', validate='one_to_one') + + # Dups on left + left_w_dups = left.append(pd.DataFrame({'a': ['a'], 'c': ['cow']}, + index=[3])) + merge(left_w_dups, right, left_index=True, right_index=True, + validate='many_to_one') + + with pytest.raises(ValueError): + merge(left_w_dups, right, left_index=True, right_index=True, + validate='one_to_one') + + with pytest.raises(ValueError): + merge(left_w_dups, right, on='a', validate='one_to_one') + + # Dups on both + merge(left_w_dups, right_w_dups, on='a', validate='many_to_many') + + with pytest.raises(ValueError): + merge(left_w_dups, right_w_dups, left_index=True, + right_index=True, validate='many_to_one') + + with pytest.raises(ValueError): + merge(left_w_dups, right_w_dups, on='a', + validate='one_to_many') + + # Check invalid arguments + with pytest.raises(ValueError): + merge(left, right, on='a', validate='jibberish') + + # 
Two column merge, dups in both, but jointly no dups. + left = DataFrame({'a': ['a', 'a', 'b', 'b'], + 'b': [0, 1, 0, 1], + 'c': ['cat', 'dog', 'weasel', 'horse']}, + index=range(4)) + + right = DataFrame({'a': ['a', 'a', 'b'], + 'b': [0, 1, 0], + 'd': ['meow', 'bark', 'um... weasel noise?']}, + index=range(3)) + + expected_multi = DataFrame({'a': ['a', 'a', 'b'], + 'b': [0, 1, 0], + 'c': ['cat', 'dog', 'weasel'], + 'd': ['meow', 'bark', + 'um... weasel noise?']}, + index=range(3)) + + with pytest.raises(ValueError): + merge(left, right, on='a', validate='1:1') + + result = merge(left, right, on=['a', 'b'], validate='1:1') + assert_frame_equal(result, expected_multi) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: From 3488f00bbd69ee528e52a9416645050534cf3cd9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 May 2017 10:37:46 +0200 Subject: [PATCH 2/2] small fixup --- doc/source/merging.rst | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index c814f72cd41ad..d956f1ca54e6b 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -552,7 +552,7 @@ standard database join operations between DataFrame objects: .. versionadded:: 0.17.0 -- ``validate`` : string, default None +- ``validate`` : string, default None. If specified, checks if merge is of specified type. * "one_to_one" or "1:1": checks if merge keys are unique in both @@ -561,8 +561,7 @@ standard database join operations between DataFrame objects: dataset. * "many_to_one" or "m:1": checks if merge keys are unique in right dataset. - * "many_to_many" or "m:m": allowed, but does not result in checks. - + * "many_to_many" or "m:m": allowed, but does not result in checks. .. 
versionadded:: 0.21.0 @@ -749,12 +748,10 @@ In the following example, there are duplicate values of ``B`` in the right DataF left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]}) right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]}) -.. code-block:: python +.. code-block:: ipython - In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one") - Out [53]: - --------------------------------------------------------------------------- - + In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one") + ... MergeError: Merge keys are not unique in right dataset; not a one-to-one merge If the user is aware of the duplicates in the right `DataFrame` but wants to ensure there are no duplicates in the left DataFrame, one can use the `validate='one_to_many'` argument instead, which will not raise an exception.