diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 3e11552be3612..0e449cb35eaaa 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -761,6 +761,7 @@ This is equivalent to the following .. _basics.reindexing: + Reindexing and altering labels ------------------------------ diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index c1b8044ea305b..aeb6ba7eaca25 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1593,7 +1593,10 @@ can think of ``MultiIndex`` an array of tuples where each tuple is unique. A ``MultiIndex`` can be created from a list of arrays (using ``MultiIndex.from_arrays``), an array of tuples (using ``MultiIndex.from_tuples``), or a crossed set of iterables (using -``MultiIndex.from_product``). +``MultiIndex.from_product``). The ``Index`` constructor will attempt to return +a ``MultiIndex`` when it is passed a list of tuples. The following examples +demonstrate different ways to initialize MultiIndexes. + .. ipython:: python @@ -1601,7 +1604,10 @@ can think of ``MultiIndex`` an array of tuples where each tuple is unique. 
A ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = list(zip(*arrays)) tuples - index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + index + s = Series(randn(8), index=index) s diff --git a/doc/source/release.rst b/doc/source/release.rst index c0d4c0c73296f..b05d6fef69f7a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -53,6 +53,9 @@ pandas 0.14.0 New features ~~~~~~~~~~~~ +- ``Index`` returns a ``MultiIndex`` if passed a list of tuples; + ``DataFrame(dict)`` and ``Series(dict)`` create ``MultiIndex`` + columns and index where applicable (:issue:`4187`) - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) - Added the ``sym_diff`` method to ``Index`` (:issue:`5543`) - Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``. The Julian @@ -214,6 +217,8 @@ Bug Fixes ~~~~~~~~~ - Bug in Series ValueError when index doesn't match data (:issue:`6532`) +- Prevent segfault due to MultiIndex not being supported in HDFStore table + format (:issue:`1848`) - Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`) - Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) - Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 057f83bff44f2..748d5e74c8166 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -346,6 +346,18 @@ Deprecations Enhancements ~~~~~~~~~~~~ +- ``DataFrame`` and ``Series`` will create a ``MultiIndex`` if passed a dict with tuple keys + + .. 
ipython:: python + + Series({('a', 'b'): 1, ('a', 'a'): 0, + ('a', 'c'): 2, ('b', 'a'): 3, ('b', 'b'): 4}) + pandas.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, + ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, + ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, + ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, + ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + - ``DataFrame.to_latex`` now takes a longtable keyword, which if True will return a table in a longtable environment. (:issue:`6617`) - ``pd.read_clipboard`` will, if 'sep' is unspecified, try to detect data copied from a spreadsheet and parse accordingly. (:issue:`6223`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5ecdd4d8b351d..5d3a92227977b 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -316,9 +316,9 @@ def _init_dict(self, data, index, columns, dtype=None): else: keys = list(data.keys()) if not isinstance(data, OrderedDict): - keys = _try_sort(list(data.keys())) + keys = _try_sort(keys) columns = data_names = Index(keys) - arrays = [data[k] for k in columns] + arrays = [data[k] for k in keys] return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) @@ -4512,7 +4512,7 @@ def extract_index(data): index = None if len(data) == 0: index = Index([]) - elif len(data) > 0 and index is None: + elif len(data) > 0: raw_lengths = [] indexes = [] diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 996a691eca082..4f5bf3ff5b6cd 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1157,8 +1157,7 @@ def groups(self): else: to_groupby = lzip(*(ping.grouper for ping in self.groupings)) to_groupby = Index(to_groupby) - - return self.axis.groupby(to_groupby) + return self.axis.groupby(to_groupby.values) @cache_readonly def group_info(self): diff --git a/pandas/core/index.py b/pandas/core/index.py index 3213f288be4b3..e15ca83256269 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -71,6 +71,8 @@ class Index(IndexOpsMixin, FrozenNDArray): 
Make a copy of input ndarray name : object Name to be stored in the index + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible Notes ----- @@ -99,7 +101,7 @@ class Index(IndexOpsMixin, FrozenNDArray): _engine_type = _index.ObjectEngine def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, - **kwargs): + tupleize_cols=True, **kwargs): # no class inference! if fastpath: @@ -139,8 +141,19 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, elif np.isscalar(data): cls._scalar_data_error(data) - else: + if tupleize_cols and isinstance(data, list) and data: + try: + sorted(data) + has_mixed_types = False + except (TypeError, UnicodeDecodeError): + has_mixed_types = True # python3 only + if isinstance(data[0], tuple) and not has_mixed_types: + try: + return MultiIndex.from_tuples( + data, names=name or kwargs.get('names')) + except (TypeError, KeyError): + pass # python2 - MultiIndex fails on mixed types # other iterable of some kind subarr = com._asarray_tuplesafe(data, dtype=object) @@ -808,7 +821,8 @@ def identical(self, other): """ return (self.equals(other) and all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables))) + for c in self._comparables)) and + type(self) == type(other)) def asof(self, label): """ @@ -1755,11 +1769,11 @@ def insert(self, loc, item): ------- new_index : Index """ - index = np.asarray(self) - # because numpy is fussy with tuples - item_idx = Index([item], dtype=index.dtype) - new_index = np.concatenate((index[:loc], item_idx, index[loc:])) - return Index(new_index, name=self.name) + _self = np.asarray(self) + item_idx = Index([item], dtype=self.dtype).values + idx = np.concatenate( + (_self[:loc], item_idx, _self[loc:])) + return Index(idx, name=self.name) def drop(self, labels): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 47721ab371c3b..5fd42fb38622a 100644 --- a/pandas/core/series.py +++ 
b/pandas/core/series.py @@ -22,9 +22,9 @@ _values_from_object, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, + _try_sort, ABCSparseArray, _maybe_match_name, _ensure_object, SettingWithCopyError) - from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import ( @@ -180,7 +180,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if isinstance(data, OrderedDict): index = Index(data) else: - index = Index(sorted(data)) + index = Index(_try_sort(data)) try: if isinstance(index, DatetimeIndex): # coerce back to datetime objects for lookup diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 046fa3887a32d..eac1e0373f24d 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -58,6 +58,8 @@ def infer_dtype(object _values): _values = list(_values) values = list_to_object_array(_values) + values = getattr(values, 'values', values) + val_kind = values.dtype.type if val_kind in _TYPE_MAP: return _TYPE_MAP[val_kind] @@ -1029,6 +1031,8 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): # kludge, for Series return np.empty(0, dtype='f8') + keys = getattr(keys, 'values', keys) + for i in range(n): val = util.get_value_1d(keys, i) if val in mapping: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1bbcba0e4caad..86ebad3bc216e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -181,12 +181,12 @@ def test_getitem_list(self): # tuples df = DataFrame(randn(8, 3), columns=Index([('foo', 'bar'), ('baz', 'qux'), - ('peek', 'aboo')], name='sth')) + ('peek', 'aboo')], name=['sth', 'sth2'])) result = df[[('foo', 'bar'), ('baz', 'qux')]] expected = df.ix[:, :2] assert_frame_equal(result, expected) - self.assertEqual(result.columns.name, 'sth') + self.assertEqual(result.columns.names, ['sth', 'sth2']) def test_setitem_list(self): @@ -2490,6 +2490,31 @@ def 
test_constructor_dict_of_tuples(self): expected = DataFrame(dict((k, list(v)) for k, v in compat.iteritems(data))) assert_frame_equal(result, expected, check_dtype=False) + def test_constructor_dict_multiindex(self): + check = lambda result, expected: tm.assert_frame_equal( + result, expected, check_dtype=True, check_index_type=True, + check_column_type=True, check_names=True) + d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, + ('b', 'a'): {('i', 'i'): 6, ('i', 'j'): 5, ('j', 'i'): 4}, + ('b', 'c'): {('i', 'i'): 7, ('i', 'j'): 8, ('j', 'i'): 9}} + _d = sorted(d.items()) + df = DataFrame(d) + expected = DataFrame( + [x[1] for x in _d], + index=MultiIndex.from_tuples([x[0] for x in _d])).T + expected.index = MultiIndex.from_tuples(expected.index) + check(df, expected) + + d['z'] = {'y': 123., ('i', 'i'): 111, ('i', 'j'): 111, ('j', 'i'): 111} + _d.insert(0, ('z', d['z'])) + expected = DataFrame( + [x[1] for x in _d], + index=Index([x[0] for x in _d], tupleize_cols=False)).T + expected.index = Index(expected.index, tupleize_cols=False) + df = DataFrame(d) + df = df.reindex(columns=expected.columns, index=expected.index) + check(df, expected) + def _check_basic_constructor(self, empty): "mat: 2d matrix with shpae (3, 2) to input. 
empty - makes sized objects" mat = empty((2, 3), dtype=float) @@ -2913,8 +2938,8 @@ class CustomDict(dict): def test_constructor_ragged(self): data = {'A': randn(10), 'B': randn(8)} - assertRaisesRegexp(ValueError, 'arrays must all be same length', - DataFrame, data) + with assertRaisesRegexp(ValueError, 'arrays must all be same length'): + DataFrame(data) def test_constructor_scalar(self): idx = Index(lrange(3)) @@ -12042,7 +12067,8 @@ def test_index_namedtuple(self): IndexType = namedtuple("IndexType", ["a", "b"]) idx1 = IndexType("foo", "bar") idx2 = IndexType("baz", "bof") - index = Index([idx1, idx2], name="composite_index") + index = Index([idx1, idx2], + name="composite_index", tupleize_cols=False) df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index c6c405306afb8..2280ee4e22c57 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -48,7 +48,8 @@ def setUp(self): intIndex = tm.makeIntIndex(100), floatIndex = tm.makeFloatIndex(100), empty = Index([]), - tuples = Index(lzip(['foo', 'bar', 'baz'], [1, 2, 3])), + tuples = MultiIndex.from_tuples(lzip(['foo', 'bar', 'baz'], + [1, 2, 3])) ) for name, ind in self.indices.items(): setattr(self, name, ind) @@ -230,6 +231,10 @@ def test_identical(self): i2 = i2.rename('foo') self.assert_(i1.identical(i2)) + i3 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')]) + i4 = Index([('a', 'a'), ('a', 'b'), ('b', 'a')], tupleize_cols=False) + self.assertFalse(i3.identical(i4)) + def test_is_(self): ind = Index(range(10)) self.assertTrue(ind.is_(ind)) @@ -987,18 +992,24 @@ def test_equals(self): self.assert_(same_values.equals(self.index)) def test_identical(self): + i = Index(self.index.copy()) + self.assertTrue(i.identical(self.index)) - i = self.index.copy() - same_values = Index(i, dtype=object) - self.assert_(i.identical(same_values)) + same_values_different_type = 
Index(i, dtype=object) + self.assertFalse(i.identical(same_values_different_type)) - i = self.index.copy() + i = self.index.copy(dtype=object) i = i.rename('foo') same_values = Index(i, dtype=object) - self.assert_(same_values.identical(self.index)) + self.assertTrue(same_values.identical(self.index.copy(dtype=object))) self.assertFalse(i.identical(self.index)) - self.assert_(Index(same_values, name='foo').identical(i)) + self.assertTrue(Index(same_values, name='foo', dtype=object + ).identical(i)) + + self.assertFalse( + self.index.copy(dtype=object) + .identical(self.index.copy(dtype='int64'))) def test_get_indexer(self): target = Int64Index(np.arange(10)) @@ -2210,6 +2221,12 @@ def test_identical(self): mi2 = mi2.set_names(['new1', 'new2']) self.assert_(mi.identical(mi2)) + mi3 = Index(mi.tolist(), names=mi.names) + mi4 = Index(mi.tolist(), names=mi.names, tupleize_cols=False) + self.assert_(mi.identical(mi3)) + self.assert_(not mi.identical(mi4)) + self.assert_(mi.equals(mi4)) + def test_is_(self): mi = MultiIndex.from_tuples(lzip(range(10), range(10))) self.assertTrue(mi.is_(mi)) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 95b7b6ace4e2d..377df1e4417f5 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -633,6 +633,26 @@ def test_constructor_dict(self): expected.ix[1] = 1 assert_series_equal(result, expected) + def test_constructor_dict_multiindex(self): + check = lambda result, expected: tm.assert_series_equal( + result, expected, check_dtype=True, check_index_type=True, + check_series_type=True) + d = {('a', 'a'): 0., ('b', 'a'): 1., ('b', 'c'): 2.} + _d = sorted(d.items()) + ser = Series(d) + expected = Series([x[1] for x in _d], + index=MultiIndex.from_tuples([x[0] for x in _d])) + check(ser, expected) + + d['z'] = 111. 
+ _d.insert(0, ('z', d['z'])) + ser = Series(d) + expected = Series( + [x[1] for x in _d], + index=Index([x[0] for x in _d], tupleize_cols=False)) + ser = ser.reindex(index=expected.index) + check(ser, expected) + def test_constructor_subclass_dict(self): data = tm.TestSubDict((x, 10.0 * x) for x in range(10)) series = Series(data)