ENH: Add built-in function for Styler to format the text displayed fo…

…r missing values (pandas-dev#29118) * Add built-in funcion for Styler to format the text displayed for missing values As described in GH pandas-dev#28358, user who wants to control how NA values are printed while applying styles to the output will have to implement their own formatter. (so that the underlying data will not change and can be used for styling)
proost · Dec 19, 2019 · 2c7057e · 2c7057e
1 parent e293770
commit 2c7057e
Show file tree

Hide file tree

Showing 5 changed files with 191 additions and 13 deletions.
diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst
@@ -41,6 +41,7 @@ Style application
    Styler.set_caption
    Styler.set_properties
    Styler.set_uuid
+   Styler.set_na_rep
    Styler.clear
    Styler.pipe
 

diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
@@ -67,6 +67,7 @@
     "df = pd.DataFrame({'A': np.linspace(1, 10, 10)})\n",
     "df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],\n",
     "               axis=1)\n",
+    "df.iloc[3, 3] = np.nan\n",
     "df.iloc[0, 2] = np.nan"
    ]
   },
@@ -402,6 +403,38 @@
     "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can format the text displayed for missing values by `na_rep`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.style.format(\"{:.2%}\", na_rep=\"-\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "These formatting techniques can be used in combination with styling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.style.highlight_max().format(None, na_rep=\"-\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -659,6 +692,7 @@
     "- precision\n",
     "- captions\n",
     "- table-wide styles\n",
+    "- missing values representation\n",
     "- hiding the index or columns\n",
     "\n",
     "Each of these can be specified in two ways:\n",
@@ -800,6 +834,32 @@
     "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Missing values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can control the default missing values representation for the entire table through `set_na_rep` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(df.style\n",
+    "   .set_na_rep(\"FAIL\")\n",
+    "   .format(None, na_rep=\"PASS\", subset=[\"D\"])\n",
+    "   .highlight_null(\"yellow\"))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -122,6 +122,7 @@ Other enhancements
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
 - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
 - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
+- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
 - Roundtripping DataFrames with nullable integer or string data types to parquet
   (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
   now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).

diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
@@ -8,7 +8,7 @@
 import copy
 from functools import partial
 from itertools import product
-from typing import Optional
+from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple
 from uuid import uuid1
 
 import numpy as np
@@ -71,6 +71,11 @@ class Styler:
         The ``id`` takes the form ``T_<uuid>_row<num_row>_col<num_col>``
         where ``<uuid>`` is the unique identifier, ``<num_row>`` is the row
         number and ``<num_col>`` is the column number.
+    na_rep : str, optional
+        Representation for missing values.
+        If ``na_rep`` is None, no special formatting is applied
+
+        .. versionadded:: 1.0.0
 
     Attributes
     ----------
@@ -126,9 +131,10 @@ def __init__(
         caption=None,
         table_attributes=None,
         cell_ids=True,
+        na_rep: Optional[str] = None,
     ):
-        self.ctx = defaultdict(list)
-        self._todo = []
+        self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list)
+        self._todo: List[Tuple[Callable, Tuple, Dict]] = []
 
         if not isinstance(data, (pd.Series, pd.DataFrame)):
             raise TypeError("``data`` must be a Series or DataFrame")
@@ -149,19 +155,24 @@ def __init__(
         self.precision = precision
         self.table_attributes = table_attributes
         self.hidden_index = False
-        self.hidden_columns = []
+        self.hidden_columns: Sequence[int] = []
         self.cell_ids = cell_ids
+        self.na_rep = na_rep
 
         # display_funcs maps (row, col) -> formatting function
 
         def default_display_func(x):
-            if is_float(x):
+            if self.na_rep is not None and pd.isna(x):
+                return self.na_rep
+            elif is_float(x):
                 display_format = "{0:.{precision}f}".format(x, precision=self.precision)
                 return display_format
             else:
                 return x
 
-        self._display_funcs = defaultdict(lambda: default_display_func)
+        self._display_funcs: DefaultDict[
+            Tuple[int, int], Callable[[Any], str]
+        ] = defaultdict(lambda: default_display_func)
 
     def _repr_html_(self):
         """
@@ -416,16 +427,22 @@ def format_attr(pair):
             table_attributes=table_attr,
         )
 
-    def format(self, formatter, subset=None):
+    def format(self, formatter, subset=None, na_rep: Optional[str] = None):
         """
         Format the text display value of cells.
 
         Parameters
         ----------
-        formatter : str, callable, or dict
+        formatter : str, callable, dict or None
+            If ``formatter`` is None, the default formatter is used
         subset : IndexSlice
             An argument to ``DataFrame.loc`` that restricts which elements
             ``formatter`` is applied to.
+        na_rep : str, optional
+            Representation for missing values.
+            If ``na_rep`` is None, no special formatting is applied
+
+            .. versionadded:: 1.0.0
 
         Returns
         -------
@@ -451,6 +468,10 @@ def format(self, formatter, subset=None):
         >>> df['c'] = ['a', 'b', 'c', 'd']
         >>> df.style.format({'c': str.upper})
         """
+        if formatter is None:
+            assert self._display_funcs.default_factory is not None
+            formatter = self._display_funcs.default_factory()
+
         if subset is None:
             row_locs = range(len(self.data))
             col_locs = range(len(self.data.columns))
@@ -466,16 +487,16 @@ def format(self, formatter, subset=None):
         if is_dict_like(formatter):
             for col, col_formatter in formatter.items():
                 # formatter must be callable, so '{}' are converted to lambdas
-                col_formatter = _maybe_wrap_formatter(col_formatter)
+                col_formatter = _maybe_wrap_formatter(col_formatter, na_rep)
                 col_num = self.data.columns.get_indexer_for([col])[0]
 
                 for row_num in row_locs:
                     self._display_funcs[(row_num, col_num)] = col_formatter
         else:
             # single scalar to format all cells with
+            formatter = _maybe_wrap_formatter(formatter, na_rep)
             locs = product(*(row_locs, col_locs))
             for i, j in locs:
-                formatter = _maybe_wrap_formatter(formatter)
                 self._display_funcs[(i, j)] = formatter
         return self
 
@@ -553,6 +574,7 @@ def _copy(self, deepcopy=False):
             caption=self.caption,
             uuid=self.uuid,
             table_styles=self.table_styles,
+            na_rep=self.na_rep,
         )
         if deepcopy:
             styler.ctx = copy.deepcopy(self.ctx)
@@ -896,6 +918,23 @@ def set_table_styles(self, table_styles):
         self.table_styles = table_styles
         return self
 
+    def set_na_rep(self, na_rep: str) -> "Styler":
+        """
+        Set the missing data representation on a Styler.
+
+        .. versionadded:: 1.0.0
+
+        Parameters
+        ----------
+        na_rep : str
+
+        Returns
+        -------
+        self : Styler
+        """
+        self.na_rep = na_rep
+        return self
+
     def hide_index(self):
         """
         Hide any indices from rendering.
@@ -1487,14 +1526,22 @@ def _get_level_lengths(index, hidden_elements=None):
     return non_zero_lengths
 
 
-def _maybe_wrap_formatter(formatter):
+def _maybe_wrap_formatter(formatter, na_rep: Optional[str]):
     if isinstance(formatter, str):
-        return lambda x: formatter.format(x)
+        formatter_func = lambda x: formatter.format(x)
     elif callable(formatter):
-        return formatter
+        formatter_func = formatter
     else:
         msg = (
             "Expected a template string or callable, got {formatter} "
             "instead".format(formatter=formatter)
         )
         raise TypeError(msg)
+
+    if na_rep is None:
+        return formatter_func
+    elif isinstance(na_rep, str):
+        return lambda x: na_rep if pd.isna(x) else formatter_func(x)
+    else:
+        msg = "Expected a string, got {na_rep} instead".format(na_rep=na_rep)
+        raise TypeError(msg)
diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py
@@ -1009,6 +1009,75 @@ def test_bar_bad_align_raises(self):
         with pytest.raises(ValueError):
             df.style.bar(align="poorly", color=["#d65f5f", "#5fba7d"])
 
+    def test_format_with_na_rep(self):
+        # GH 21527 28358
+        df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
+
+        ctx = df.style.format(None, na_rep="-")._translate()
+        assert ctx["body"][0][1]["display_value"] == "-"
+        assert ctx["body"][0][2]["display_value"] == "-"
+
+        ctx = df.style.format("{:.2%}", na_rep="-")._translate()
+        assert ctx["body"][0][1]["display_value"] == "-"
+        assert ctx["body"][0][2]["display_value"] == "-"
+        assert ctx["body"][1][1]["display_value"] == "110.00%"
+        assert ctx["body"][1][2]["display_value"] == "120.00%"
+
+        ctx = df.style.format("{:.2%}", na_rep="-", subset=["B"])._translate()
+        assert ctx["body"][0][2]["display_value"] == "-"
+        assert ctx["body"][1][2]["display_value"] == "120.00%"
+
+    def test_init_with_na_rep(self):
+        # GH 21527 28358
+        df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
+
+        ctx = Styler(df, na_rep="NA")._translate()
+        assert ctx["body"][0][1]["display_value"] == "NA"
+        assert ctx["body"][0][2]["display_value"] == "NA"
+
+    def test_set_na_rep(self):
+        # GH 21527 28358
+        df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
+
+        ctx = df.style.set_na_rep("NA")._translate()
+        assert ctx["body"][0][1]["display_value"] == "NA"
+        assert ctx["body"][0][2]["display_value"] == "NA"
+
+        ctx = (
+            df.style.set_na_rep("NA")
+            .format(None, na_rep="-", subset=["B"])
+            ._translate()
+        )
+        assert ctx["body"][0][1]["display_value"] == "NA"
+        assert ctx["body"][0][2]["display_value"] == "-"
+
+    def test_format_non_numeric_na(self):
+        # GH 21527 28358
+        df = pd.DataFrame(
+            {
+                "object": [None, np.nan, "foo"],
+                "datetime": [None, pd.NaT, pd.Timestamp("20120101")],
+            }
+        )
+
+        ctx = df.style.set_na_rep("NA")._translate()
+        assert ctx["body"][0][1]["display_value"] == "NA"
+        assert ctx["body"][0][2]["display_value"] == "NA"
+        assert ctx["body"][1][1]["display_value"] == "NA"
+        assert ctx["body"][1][2]["display_value"] == "NA"
+
+        ctx = df.style.format(None, na_rep="-")._translate()
+        assert ctx["body"][0][1]["display_value"] == "-"
+        assert ctx["body"][0][2]["display_value"] == "-"
+        assert ctx["body"][1][1]["display_value"] == "-"
+        assert ctx["body"][1][2]["display_value"] == "-"
+
+    def test_format_with_bad_na_rep(self):
+        # GH 21527 28358
+        df = pd.DataFrame([[None, None], [1.1, 1.2]], columns=["A", "B"])
+        with pytest.raises(TypeError):
+            df.style.format(None, na_rep=-1)
+
     def test_highlight_null(self, null_color="red"):
         df = pd.DataFrame({"A": [0, np.nan]})
         result = df.style.highlight_null()._compute().ctx