BUG: make sure that the multi-index is lex-sorted before passing to _lexsort_indexer (GH8017)

jreback · jreback · commit 763bb0183cec · 2014-09-16T17:32:02.000-04:00
BUG: sparse repr of multi-index frame with a Float64Index as a level was incorrect
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -667,7 +667,6 @@ Enhancements
 
 
 
-- Bug in ``get`` where an ``IndexError`` would not cause the default value to be returned (:issue:`7725`)
 
 
 
@@ -745,10 +744,10 @@ Bug Fixes
 - Bug in DataFrameGroupby.transform when transforming with a passed non-sorted key (:issue:`8046`)
 - Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
 - Bug in inference in a MultiIndex with ``datetime.date`` inputs (:issue:`7888`)
-
+- Bug in ``get`` where an ``IndexError`` would not cause the default value to be returned (:issue:`7725`)
 - Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may reset nanosecond (:issue:`7697`)
 - Bug in ``offsets.apply``, ``rollforward`` and ``rollback`` may raise ``AttributeError`` if ``Timestamp`` has ``dateutil`` tzinfo (:issue:`7697`)
-
+- Bug in sorting a multi-index frame with a ``Float64Index`` as a level (:issue:`8017`)
 
 - Bug in ``is_superperiod`` and ``is_subperiod`` cannot handle higher frequencies than ``S`` (:issue:`7760`, :issue:`7772`, :issue:`7803`)
 
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -625,10 +625,17 @@ def is_numeric_dtype(dtype):
             fmt_columns = columns.format(sparsify=False, adjoin=False)
             fmt_columns = lzip(*fmt_columns)
             dtypes = self.frame.dtypes.values
+
+            # if any level is a Float index, add no leading space (NOTE(review): is_floating looks uncalled below -- confirm)
+            restrict_formatting = any([ l.is_floating for l in columns.levels ])
             need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
-            str_columns = list(zip(*[
-                [' ' + y if y not in self.formatters and need_leadsp[x]
-                 else y for y in x] for x in fmt_columns]))
+
+            def space_format(x,y):
+                if y not in self.formatters and need_leadsp[x] and not restrict_formatting:
+                    return ' ' + y
+                return y
+
+            str_columns = list(zip(*[ [ space_format(x,y) for y in x ] for x in fmt_columns ]))
             if self.sparsify:
                 str_columns = _sparsify(str_columns)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2770,6 +2770,12 @@ def trans(v):
                                     na_position=na_position)
 
         elif isinstance(labels, MultiIndex):
+
+            # make sure that the axis is lexsorted to start with;
+            # if not, rebuild the MultiIndex so its labels give the correct indexer
+            if not labels.is_lexsorted():
+                labels = MultiIndex.from_tuples(labels.values)
+
             indexer = _lexsort_indexer(labels.labels, orders=ascending,
                                        na_position=na_position)
             indexer = com._ensure_platform_int(indexer)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1628,6 +1628,7 @@ def sort_index(self, axis=0, ascending=True):
 
         new_axis = labels.take(sort_index)
         return self.reindex(**{axis_name: new_axis})
+
     _shared_docs['reindex'] = """
         Conform %(klass)s to new index with optional filling logic, placing
         NA/NaN in locations having no value in the previous index. A new object
@@ -3558,10 +3559,10 @@ def _tz_convert(ax, tz):
         result = self._constructor(self._data, copy=copy)
         result.set_axis(axis,ax)
         return result.__finalize__(self)
-    
+
     @deprecate_kwarg(old_arg_name='infer_dst', new_arg_name='ambiguous',
                      mapping={True: 'infer', False: 'raise'})
-    def tz_localize(self, tz, axis=0, level=None, copy=True, 
+    def tz_localize(self, tz, axis=0, level=None, copy=True,
                     ambiguous='raise'):
         """
         Localize tz-naive TimeSeries to target time zone
@@ -3583,7 +3584,7 @@ def tz_localize(self, tz, axis=0, level=None, copy=True,
             - 'raise' will raise an AmbiguousTimeError if there are ambiguous times
         infer_dst : boolean, default False (DEPRECATED)
             Attempt to infer fall dst-transition hours based on order
-            
+
         Returns
         -------
         """
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -214,6 +214,44 @@ def test_sort_index_preserve_levels(self):
         result = self.frame.sort_index()
         self.assertEqual(result.index.names, self.frame.index.names)
 
+    def test_sorting_repr_8017(self):
+
+        np.random.seed(0)
+        data = np.random.randn(3,4)
+
+        for gen, extra in [([1.,3.,2.,5.],4.),
+                           ([1,3,2,5],4),
+                           ([Timestamp('20130101'),Timestamp('20130103'),Timestamp('20130102'),Timestamp('20130105')],Timestamp('20130104')),
+                           (['1one','3one','2one','5one'],'4one')]:
+            columns = MultiIndex.from_tuples([('red', i) for i in gen])
+            df = DataFrame(data, index=list('def'), columns=columns)
+            df2 = pd.concat([df,DataFrame('world',
+                                          index=list('def'),
+                                          columns=MultiIndex.from_tuples([('red', extra)]))],axis=1)
+
+            # check that the repr is good
+            # make sure that we have a correct sparsified repr
+            # e.g. only 1 header of red
+            self.assertEqual(str(df2).splitlines()[0].split(),['red'])
+
+            # GH 8017
+            # sorting fails after columns added
+
+            # construct single-dtype then sort
+            result = df.copy().sort_index(axis=1)
+            expected = df.iloc[:,[0,2,1,3]]
+            assert_frame_equal(result, expected)
+
+            result = df2.sort_index(axis=1)
+            expected = df2.iloc[:,[0,2,1,4,3]]
+            assert_frame_equal(result, expected)
+
+            # setitem then sort
+            result = df.copy()
+            result[('red',extra)] = 'world'
+            result = result.sort_index(axis=1)
+            assert_frame_equal(result, expected)
+
     def test_repr_to_string(self):
         repr(self.frame)
         repr(self.ymd)