BUG: Formatting of an index that has nan was inconsistent or wron…

…g (would fill from other values) (GH2850_). BUG: issue in test_index.py/test_format: 1) printing of 'nan' rather than the na_rep (NaN) is inconsistent with everywhere else; 2) a 'None' in the index is de facto treated as NaN, is this wrong? CLN: consistency among index for NaN/NaT values
pandas-dev · Mar 13, 2013 · aae6213 · aae6213
1 parent a79f08c
commit aae6213
Show file tree

Hide file tree

Showing 4 changed files with 68 additions and 11 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -145,6 +145,8 @@ pandas 0.11.0
   - Bug in DataFrame column insertion when the column creation fails, existing frame is left in
     an irrecoverable state (GH3010_)
   - Bug in DataFrame update where non-specified values could cause dtype changes (GH3016_)
+  - Formatting of an index that has ``nan`` was inconsistent or wrong (would fill from 
+    other values), (GH2850_)
 
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
@@ -161,6 +163,7 @@ pandas 0.11.0
 .. _GH2867: https://github.com/pydata/pandas/issues/2867
 .. _GH2807: https://github.com/pydata/pandas/issues/2807
 .. _GH2849: https://github.com/pydata/pandas/issues/2849
+.. _GH2850: https://github.com/pydata/pandas/issues/2850
 .. _GH2898: https://github.com/pydata/pandas/issues/2898
 .. _GH2892: https://github.com/pydata/pandas/issues/2892
 .. _GH2909: https://github.com/pydata/pandas/issues/2909

diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -173,9 +173,9 @@ def __unicode__(self):
         Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3.
         """
         if len(self) > 6 and len(self) > np.get_printoptions()['threshold']:
-            data = self[:3].tolist() + ["..."] + self[-3:].tolist()
+            data = self[:3].format() + ["..."] + self[-3:].format()
         else:
-            data = self
+            data = self.format()
 
         prepr = com.pprint_thing(data, escape_chars=('\t', '\r', '\n'))
         return '%s(%s, dtype=%s)' % (type(self).__name__, prepr, self.dtype)
@@ -247,8 +247,14 @@ def _has_complex_internals(self):
 
     def summary(self, name=None):
         if len(self) > 0:
-            index_summary = ', %s to %s' % (com.pprint_thing(self[0]),
-                                            com.pprint_thing(self[-1]))
+            head = self[0]
+            if hasattr(head,'format'):
+                head = head.format()
+            tail = self[-1]
+            if hasattr(tail,'format'):
+                tail = tail.format()
+            index_summary = ', %s to %s' % (com.pprint_thing(head),
+                                            com.pprint_thing(tail))
         else:
             index_summary = ''
 
@@ -419,7 +425,7 @@ def take(self, indexer, axis=0):
         taken = self.view(np.ndarray).take(indexer)
         return self._constructor(taken, name=self.name)
 
-    def format(self, name=False, formatter=None):
+    def format(self, name=False, formatter=None, na_rep='NaN'):
         """
         Render a string representation of the Index
         """
@@ -454,6 +460,14 @@ def format(self, name=False, formatter=None):
         if values.dtype == np.object_:
             result = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))
                       for x in values]
+
+            # could have nans
+            mask = isnull(values)
+            if mask.any():
+                result = np.array(result)
+                result[mask] = na_rep
+                result = result.tolist()
+
         else:
             result = _trim_front(format_array(values, None, justify='left'))
         return header + result
@@ -1446,10 +1460,9 @@ def __unicode__(self):
         np.set_printoptions(threshold=50)
 
         if len(self) > 100:
-            values = np.concatenate([self[:50].values,
-                                     self[-50:].values])
+            values = self[:50].format() + self[-50:].format()
         else:
-            values = self.values
+            values = self.format()
 
         summary = com.pprint_thing(values, escape_chars=('\t', '\r', '\n'))
 
@@ -1618,7 +1631,16 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
         stringified_levels = []
         for lev, lab in zip(self.levels, self.labels):
             if len(lev) > 0:
+
                 formatted = lev.take(lab).format(formatter=formatter)
+
+                # we have some NA
+                mask = lab==-1
+                if mask.any():
+                    formatted = np.array(formatted)
+                    formatted[mask] = na_rep
+                    formatted = formatted.tolist()
+
             else:
                 # weird all NA case
                 formatted = [com.pprint_thing(x, escape_chars=('\t', '\r', '\n'))

diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
@@ -603,6 +603,31 @@ def test_long_series(self):
         nmatches = len(re.findall('dtype',str_rep))
         self.assert_(nmatches == 1)
 
+    def test_index_with_nan(self):
+        #  GH 2850
+        df = DataFrame({'id1': {0: '1a3', 1: '9h4'}, 'id2': {0: np.nan, 1: 'd67'},
+                        'id3': {0: '78d', 1: '79d'}, 'value': {0: 123, 1: 64}})
+
+        # multi-index
+        y = df.set_index(['id1', 'id2', 'id3'])
+        result = y.to_string()
+        expected = u'             value\nid1 id2 id3       \n1a3 NaN 78d    123\n9h4 d67 79d     64'
+        self.assert_(result == expected)
+
+        # index
+        y = df.set_index('id2')
+        result = y.to_string()
+        expected = u'     id1  id3  value\nid2                 \nNaN  1a3  78d    123\nd67  9h4  79d     64'
+        self.assert_(result == expected)
+
+        # all-nan in mi
+        df2 = df.copy()
+        df2.ix[:,'id2'] = np.nan
+        y = df2.set_index('id2')
+        result = y.to_string()
+        expected = u'     id1  id3  value\nid2                 \nNaN  1a3  78d    123\nNaN  9h4  79d     64'
+        self.assert_(result == expected)
+
     def test_to_string(self):
         from pandas import read_table
         import re
@@ -1234,10 +1259,16 @@ def test_datetimeindex(self):
         result = s.to_string()
         self.assertTrue('2013-01-02' in result)
 
-        s = Series(2, index=[ Timestamp('20130111'), NaT ]).append(s)
+        # nat in index
+        s2 = Series(2, index=[ Timestamp('20130111'), NaT ])
+        s = s2.append(s)
         result = s.to_string()
         self.assertTrue('NaT' in result)
 
+        # nat in summary
+        result = str(s2.index)
+        self.assertTrue('NaT' in result)
+
     def test_timedelta64(self):
 
         from pandas import date_range

diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -351,12 +351,13 @@ def test_format(self):
         # 2845
         index = Index([1, 2.0+3.0j, np.nan])
         formatted = index.format()
-        expected = [str(index[0]), str(index[1]), str(index[2])]
+        expected = [str(index[0]), str(index[1]), u'NaN']
         self.assertEquals(formatted, expected)
 
+        # is this really allowed?
         index = Index([1, 2.0+3.0j, None])
         formatted = index.format()
-        expected = [str(index[0]), str(index[1]), '']
+        expected = [str(index[0]), str(index[1]), u'NaN']
         self.assertEquals(formatted, expected)
 
         self.strIndex[:0].format()