From 41aa759965e8593e2286eaf92f498a0e949e05c3 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 7 Dec 2016 22:15:05 -0600 Subject: [PATCH 1/8] add tests for variable length sequences. --- pandas/tests/indexes/test_multi.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 365236f72e80e..5bb7f7a52913c 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1640,6 +1640,19 @@ def test_from_tuples(self): idx = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) self.assertEqual(len(idx), 2) + def test_from_tuples_variable_length(self): + # check that len(MultiIndex) == max(len(iterables)) + T = ((1,), (2, 3), (4, 5, 6)) + + idx = MultiIndex.from_tuples(T) + self.assertEqual(len(idx), 3) + + idx = MultiIndex.from_tuples(set(T)) + self.assertEqual(len(idx), 3) + + idx = MultiIndex.from_tuples(list(T)) + self.assertEqual(len(idx), 3) + def test_argsort(self): result = self.index.argsort() expected = self.index._tuple_index.argsort() From 14da53cc19fbf251f79c19d129eefdeb2c7e697c Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 7 Dec 2016 22:16:32 -0600 Subject: [PATCH 2/8] use zip_longest to avoid truncating tuples. 
--- pandas/indexes/multi.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 9ab07d87fd13b..95606c5307e1f 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -5,6 +5,11 @@ from functools import partial from sys import getsizeof +try: + from itertools import zip_longest +except ImportError: + from itertools import izip_longest as zip_longest + import numpy as np import pandas.lib as lib import pandas.index as _index @@ -1015,7 +1020,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) else: - arrays = lzip(*tuples) + arrays = list(zip_longest(*tuples)) return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) From 0c118bce72b06d3b3ce7eaee5d4ae6150a1d36e8 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 1 Feb 2017 21:07:05 -0600 Subject: [PATCH 3/8] Check that tuples are all same length. Factor out equal length check into separate method. 
--- pandas/indexes/multi.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 95606c5307e1f..97599e0fce359 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -964,11 +964,10 @@ def from_arrays(cls, arrays, sortorder=None, names=None): name = None if names is None else names[0] return Index(arrays[0], name=name) - # Check if lengths of all arrays are equal or not, + # Check whether all arrays are the same length, # raise ValueError, if not - for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') + if not _check_equal_length(arrays): + raise ValueError('all arrays must be same length') from pandas.core.categorical import _factorize_from_iterables @@ -988,6 +987,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): ---------- tuples : list / sequence of tuple-likes Each tuple is the index of one row/column. + A ValueError will be raised if the tuples are not all the same length. sortorder : int or None Level of sortedness (must be lexicographically sorted by that level) @@ -1012,6 +1012,9 @@ def from_tuples(cls, tuples, sortorder=None, names=None): # I think this is right? Not quite sure... 
raise TypeError('Cannot infer number of levels from empty list') + if not _check_equal_length(tuples): + raise ValueError('all tuples must be the same length')) + if isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): tuples = tuples._values @@ -1020,7 +1023,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) else: - arrays = list(zip_longest(*tuples)) + arrays = lzip(*tuples)) return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) @@ -2372,5 +2375,25 @@ def _sparsify(label_list, start=0, sentinel=''): return lzip(*result) +def _check_equal_length(seq_of_seqs): + """ + Ensure that all sequences in seq_of_seqs are the same length. + + Since this function is time critical, it does zero error checking. + Two exceptions can result from calling this function. + 1. IndexError: seq_of_seqs is not an indexed sequence. + 2. TypeError: An inner sequence does not support len(). + + This check is up to O(n) and can be expensive, so use only when necessary. 
+ + Return True if all sequences are the same length, otherwise False + """ + L0 = len(seq_of_seqs[0]) + for seq in seq_of_seqs: + if len(seq) != L0: + return False + return True + + def _get_na_rep(dtype): return {np.datetime64: 'NaT', np.timedelta64: 'NaT'}.get(dtype, 'NaN') From 0c2ca79b2e2ae79e4c970c2850d70986d0064c83 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Wed, 1 Feb 2017 21:23:17 -0600 Subject: [PATCH 4/8] remove extra ) No need to import zip_longest anymore --- pandas/indexes/multi.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 97599e0fce359..a5011fa0a97e3 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -5,10 +5,6 @@ from functools import partial from sys import getsizeof -try: - from itertools import zip_longest -except ImportError: - from itertools import izip_longest as zip_longest import numpy as np import pandas.lib as lib @@ -1013,7 +1009,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): raise TypeError('Cannot infer number of levels from empty list') if not _check_equal_length(tuples): - raise ValueError('all tuples must be the same length')) + raise ValueError('all tuples must be the same length') if isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): @@ -1023,7 +1019,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): elif isinstance(tuples, list): arrays = list(lib.to_object_array_tuples(tuples).T) else: - arrays = lzip(*tuples)) + arrays = lzip(*tuples) return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) From 6593c261a171d4484c0c6c71631d9677f8498978 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Thu, 2 Feb 2017 01:27:52 -0600 Subject: [PATCH 5/8] use iterators rather than indexing --- pandas/indexes/multi.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index a5011fa0a97e3..bda2f22f890d2 
100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -2376,16 +2376,13 @@ def _check_equal_length(seq_of_seqs): Ensure that all sequences in seq_of_seqs are the same length. Since this function is time critical, it does zero error checking. - Two exceptions can result from calling this function. - 1. IndexError: seq_of_seqs is not an indexed sequence. - 2. TypeError: An inner sequence does not support len(). - - This check is up to O(n) and can be expensive, so use only when necessary. + A TypeError will be raised if inner sequence does not support len(). Return True if all sequences are the same length, otherwise False """ - L0 = len(seq_of_seqs[0]) - for seq in seq_of_seqs: + seq_it = iter(seq_of_seqs) + L0 = len(next(seq_it)) + for seq in seq_it: if len(seq) != L0: return False return True From d1d26ff037d42ae4591ded07542cf583ff9b1b07 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Thu, 9 Feb 2017 22:33:13 -0600 Subject: [PATCH 6/8] catch ValueErrors in now invalid test cases. 
--- pandas/tests/indexes/test_base.py | 8 +++++--- pandas/tests/test_strings.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2f5b98d145e57..20ef9e897f87f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1443,9 +1443,11 @@ def test_str_attribute(self): tm.assert_index_equal(idx.str.split(), expected) tm.assert_index_equal(idx.str.split(expand=False), expected) - expected = MultiIndex.from_tuples([('a', 'b', 'c'), ('d', 'e', np.nan), - ('f', np.nan, np.nan)]) - tm.assert_index_equal(idx.str.split(expand=True), expected) + # This is invalid behavior + with self.assertRaises(ValueError): + expected = MultiIndex.from_tuples([('a', 'b', 'c'), ('d', 'e', np.nan), + ('f', np.nan, np.nan)]) + tm.assert_index_equal(idx.str.split(expand=True), expected) # test boolean case, should return np.array instead of boolean Index idx = Index(['a1', 'a2', 'b1', 'b2']) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ce97b09b7e3ca..86573d3272c65 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1984,13 +1984,14 @@ def test_split_to_multiindex_expand(self): tm.assert_index_equal(result, exp) self.assertEqual(result.nlevels, 3) - idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) - result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA - ), ('one', 'of', 'these', 'things', - 'is', 'not')]) - tm.assert_index_equal(result, exp) - self.assertEqual(result.nlevels, 6) + with self.assertRaises(ValueError): + idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) + result = idx.str.split('_', expand=True) + exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA + ), ('one', 'of', 'these', 'things', + 'is', 'not')]) + tm.assert_index_equal(result, exp) + 
self.assertEqual(result.nlevels, 6) with tm.assertRaisesRegexp(ValueError, "expand must be"): idx.str.split('_', expand="not_a_boolean") From 62cbc274940ed0e33719373bc1bbf4452f8d5dcf Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Thu, 9 Feb 2017 22:33:27 -0600 Subject: [PATCH 7/8] test _check_equal_length --- pandas/tests/indexes/test_multi.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5bb7f7a52913c..de03ebbb616d6 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1640,18 +1640,20 @@ def test_from_tuples(self): idx = MultiIndex.from_tuples(((1, 2), (3, 4)), names=['a', 'b']) self.assertEqual(len(idx), 2) - def test_from_tuples_variable_length(self): - # check that len(MultiIndex) == max(len(iterables)) - T = ((1,), (2, 3), (4, 5, 6)) + def test_equal_length(self): + # Test _check_equal_length + from pandas.indexes.multi import _check_equal_length - idx = MultiIndex.from_tuples(T) - self.assertEqual(len(idx), 3) + seqs = [[1, 2, 3], [2, 3, 4], [0, 1, 0]] + self.assertTrue(_check_equal_length(seqs)) - idx = MultiIndex.from_tuples(set(T)) - self.assertEqual(len(idx), 3) + seqs[-1].append(1) + self.assertFalse(_check_equal_length(seqs)) - idx = MultiIndex.from_tuples(list(T)) - self.assertEqual(len(idx), 3) + # Test TypeError + seqs = [None] + with self.assertRaises(TypeError): + _check_equal_length(seqs) def test_argsort(self): result = self.index.argsort() From 927d439458b563f7059b898120d66f3985baa4f7 Mon Sep 17 00:00:00 2001 From: Ryan Grout Date: Thu, 9 Feb 2017 22:34:31 -0600 Subject: [PATCH 8/8] If we have an empty seq, then we should return True. 
--- pandas/indexes/multi.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index bda2f22f890d2..3b990729dcd92 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -2379,13 +2379,18 @@ def _check_equal_length(seq_of_seqs): A TypeError will be raised if inner sequence does not support len(). Return True if all sequences are the same length, otherwise False + If seq_of_seqs is empty return True as well. """ seq_it = iter(seq_of_seqs) - L0 = len(next(seq_it)) - for seq in seq_it: - if len(seq) != L0: - return False - return True + try: + L0 = len(next(seq_it)) + except StopIteration: + return True + else: + for seq in seq_it: + if len(seq) != L0: + return False + return True def _get_na_rep(dtype):