pyldavis replaced with tmplot, docs updated, type hints improved

maximtrp · Jul 1, 2021 · a04de59 · a04de59
1 parent ea4748e
commit a04de59
Show file tree

Hide file tree

Showing 10 changed files with 72 additions and 98 deletions.
diff --git a/README.md b/README.md
@@ -15,12 +15,12 @@ Refer to [documentation](https://bitermplus.readthedocs.io) to stay up to date.
 
 ## Requirements
 
-* Cython
-* NumPy
-* Pandas
-* SciPy
-* Scikit-learn
-* pyLDAvis (optional)
+* cython
+* numpy
+* pandas
+* scipy
+* scikit-learn
+* tmplot
 
 ## Setup
 
@@ -55,7 +55,6 @@ pip3 install bitermplus
 import bitermplus as btm
 import numpy as np
 import pandas as pd
-import pyLDAvis as plv
 
 # IMPORTING DATA
 df = pd.read_csv(
@@ -86,20 +85,11 @@ perplexity = model.perplexity_
 coherence = model.coherence_
 
 # RESULTS VISUALIZATION
-# Turning on displaying in Jupyter notebook
-plv.enable_notebook()
-# Preparing our results for visualization
-vis = btm.vis_prepare_model(
-    model.matrix_topics_words_,
-    p_zd,
-    docs_lens,
-    model.vocabulary_,
-    tf
-)
-# Displaying the results
-plv.display(vis)
+btm.plot_model(model=model, docs=texts)
 ```
 
+![Report interface](images/topics_terms_plots.png)
+
 ## Tutorial
 
 There is a [tutorial](https://bitermplus.readthedocs.io/en/latest/tutorial.html)

diff --git a/docs/source/bitermplus.plot.rst b/docs/source/bitermplus.plot.rst
@@ -3,4 +3,4 @@ Plotting functions
 
 .. currentmodule:: bitermplus
 
-.. autofunction:: vis_prepare_model
+.. autofunction:: plot_model
diff --git a/docs/source/install.rst b/docs/source/install.rst
@@ -21,7 +21,7 @@ Mac OS
 ~~~~~~
 
 First, you need to install the Xcode Command Line Tools and `Homebrew <https://brew.sh>`_.
-Then, install `libomp` using `brew`:
+Then, install ``libomp`` using ``brew``:
 
 .. code-block:: bash
 
@@ -32,9 +32,9 @@ Then, install `libomp` using `brew`:
 Requirements
 ~~~~~~~~~~~~
 
-* Cython
-* NumPy
-* Pandas
-* SciPy
-* Scikit-learn
-* pyLDAvis (optional)
+* cython
+* numpy
+* pandas
+* scipy
+* scikit-learn
+* tmplot
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
@@ -66,31 +66,13 @@ To calculate perplexity, we must provide documents vs topics probability matrix
 Visualizing results
 -------------------
 
-For results visualization, we will use `pyLDAvis
-<https://pypi.org/project/pyLDAvis/>`_ package.
+For results visualization, we will use the `tmplot
+<https://pypi.org/project/tmplot/>`_ package.
 
 .. code-block:: python
 
-    # Calculate terms frequency
-    term_freq = np.array(X.sum(axis=0)).ravel()
-
-    # Calculate vectorized documents lengths
-    docs_lens = list(map(len, docs_vec))
-
-    # Prepare results for visualization
-    vis = btm.vis_prepare_model(
-        model_ref.matrix_topics_words_,
-        dtd,
-        docs_lens,
-        model_ref.vocabulary_,
-        term_freq
-    )
-    # Enable Jupyter notebook support
-    plv.enable_notebook()
-
-    # Finally, display the results
-    plv.display(vis)
-
+    # Run the interactive report interface
+    btm.plot_model(model=model, docs=texts)
 
 Filtering stable topics
 -----------------------

diff --git a/images/topics_terms_plots.png b/images/topics_terms_plots.png
diff --git a/requirements.txt b/requirements.txt
@@ -3,5 +3,5 @@ cython
 scipy
 pandas
 scikit-learn
-pyLDAvis
+tmplot
 tqdm
diff --git a/setup.cfg b/setup.cfg
@@ -27,7 +27,7 @@ install_requires =
     pandas
     scipy
     scikit-learn
-    pyLDAvis
+    tmplot
     tqdm
 
 [options.packages.find]

diff --git a/src/bitermplus/__init__.py b/src/bitermplus/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.6.6'
+__version__ = '0.6.7'
 
 from bitermplus._btm import BTM
 from bitermplus._util import *

diff --git a/src/bitermplus/_plot.py b/src/bitermplus/_plot.py
@@ -1,40 +1,38 @@
-__all__ = ['vis_prepare_model']
-from pyLDAvis import prepare as plv_prepare
-import numpy as np
+__all__ = ['plot_model']
+from tmplot import report as plot_model
 
+# def vis_prepare_model(
+#         ttd: np.ndarray,
+#         dtd: np.ndarray,
+#         docs_len: np.ndarray,
+#         vocab: np.ndarray,
+#         term_freq: np.ndarray,
+#         **kwargs: dict):
+#     """Simple wrapper around :meth:`pyLDAvis.prepare` method.
 
-def vis_prepare_model(
-        ttd: np.ndarray,
-        dtd: np.ndarray,
-        docs_len: np.ndarray,
-        vocab: np.ndarray,
-        term_freq: np.ndarray,
-        **kwargs: dict):
-    """Simple wrapper around :meth:`pyLDAvis.prepare` method.
+#     Parameters
+#     ----------
+#     ttd : np.ndarray
+#         Topics vs words probabilities matrix (T x W).
+#     dtd : np.ndarray
+#         Document vs topics probabilities (D x T).
+#     docs_len : np.ndarray
+#         The length of each document, i.e. the number of words in each document.
+#         The order of the numbers should be consistent with the ordering of the
+#         docs in `dtd` (D x 1).
+#     vocab : np.ndarray
+#         List of all the words in the corpus used to train the model (W x 1).
+#     term_freq : np.ndarray
+#         The count of each particular term over the entire corpus (W x 1).
+#     **kwargs : dict
+#         Keyword arguments passed to :meth:`pyLDAvis.prepare` method.
 
-    Parameters
-    ----------
-    ttd : np.ndarray
-        Topics vs words probabilities matrix (T x W).
-    dtd : np.ndarray
-        Document vs topics probabilities (D x T).
-    docs_len : np.ndarray
-        The length of each document, i.e. the number of words in each document.
-        The order of the numbers should be consistent with the ordering of the
-        docs in `dtd` (D x 1).
-    vocab : np.ndarray
-        List of all the words in the corpus used to train the model (W x 1).
-    term_freq : np.ndarray
-        The count of each particular term over the entire corpus (W x 1).
-    **kwargs : dict
-        Keyword arguments passed to :meth:`pyLDAvis.prepare` method.
+#     Returns
+#     -------
+#     data : PreparedData
+#         Output of :meth:`pyLDAvis.prepare` method.
+#     """
 
-    Returns
-    -------
-    data : PreparedData
-        Output of :meth:`pyLDAvis.prepare` method.
-    """
-
-    vis_data = plv_prepare(
-        ttd, dtd, docs_len, vocab, term_freq, **kwargs)
-    return vis_data
+#     vis_data = plv_prepare(
+#         ttd, dtd, docs_len, vocab, term_freq, **kwargs)
+#     return vis_data
diff --git a/src/bitermplus/_util.py b/src/bitermplus/_util.py
@@ -4,7 +4,7 @@
     'get_closest_topics', 'get_top_topic_words',
     'get_top_topic_docs']
 
-from typing import List, Union, Tuple, Dict
+from typing import List, Union, Tuple, Dict, Sequence, Any
 from scipy.sparse import csr
 from pandas import DataFrame, Series, concat
 from sklearn.feature_extraction.text import CountVectorizer
@@ -219,17 +219,21 @@ def enum_func(x):
 
                 if method == "klb":
                     val_raw = ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])
-                    all_vs_all_dists[t_ref, t] = val_raw[np.isfinite(val_raw)].sum()
+                    all_vs_all_dists[t_ref, t] = val_raw[
+                        np.isfinite(val_raw)].sum()
 
                 elif method == "sklb":
                     val_raw = ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])\
                         + ssp.kl_div(matrix[t, :], matrix_ref[t_ref, :])
-                    all_vs_all_dists[t_ref, t] = val_raw[np.isfinite(val_raw)].sum()
+                    all_vs_all_dists[t_ref, t] = val_raw[
+                        np.isfinite(val_raw)].sum()
 
                 elif method == "jsd":
-                    val_raw = 0.5 * ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])\
+                    val_raw = 0.5 * ssp.kl_div(
+                        matrix_ref[t_ref, :], matrix[t, :])\
                         + 0.5 * ssp.kl_div(matrix[t, :], matrix_ref[t_ref, :])
-                    all_vs_all_dists[t_ref, t] = val_raw[np.isfinite(val_raw)].sum()
+                    all_vs_all_dists[t_ref, t] = val_raw[
+                        np.isfinite(val_raw)].sum()
 
                 elif method == "jef":
                     p = matrix_ref[t_ref, :]
@@ -350,7 +354,7 @@ def get_stable_topics(
 def get_top_topic_words(
         model: BTM,
         words_num: int = 20,
-        topics_idx: Union[List[int], np.ndarray] = None) -> DataFrame:
+        topics_idx: Sequence[Any] = None) -> DataFrame:
     """Select top topic words from a fitted model.
 
     Parameters
@@ -390,21 +394,21 @@ def _select_words(model, topic_id: int):
 
 
 def get_top_topic_docs(
-        docs: Union[List[str], np.ndarray],
+        docs: Sequence[Any],
         p_zd: np.ndarray,
         docs_num: int = 20,
-        topics_idx: Union[List[int], np.ndarray] = None) -> DataFrame:
+        topics_idx: Sequence[Any] = None) -> DataFrame:
     """Select top topic docs from a fitted model.
 
     Parameters
     ----------
-    docs : Union[List[str], np.ndarray]
+    docs : Sequence[Any]
         Sequence of documents (e.g. list of strings).
     p_zd : np.ndarray
         Documents vs topics probabilities matrix.
     docs_num : int = 20
         The number of documents to select.
-    topics_idx : Union[List, numpy.ndarray] = None
+    topics_idx : Sequence[Any] = None
         Topics indices. Meant to be used to select only stable
         topics.