Skip to content

Commit

Permalink
pyldavis replaced with tmplot, docs updated, type hints improved
Browse files Browse the repository at this point in the history
  • Loading branch information
maximtrp committed Jul 1, 2021
1 parent ea4748e commit a04de59
Show file tree
Hide file tree
Showing 10 changed files with 72 additions and 98 deletions.
28 changes: 9 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ Refer to [documentation](https://bitermplus.readthedocs.io) to stay up to date.

## Requirements

* Cython
* NumPy
* Pandas
* SciPy
* Scikit-learn
* pyLDAvis (optional)
* cython
* numpy
* pandas
* scipy
* scikit-learn
* tmplot

## Setup

Expand Down Expand Up @@ -55,7 +55,6 @@ pip3 install bitermplus
import bitermplus as btm
import numpy as np
import pandas as pd
import pyLDAvis as plv

# IMPORTING DATA
df = pd.read_csv(
Expand Down Expand Up @@ -86,20 +85,11 @@ perplexity = model.perplexity_
coherence = model.coherence_

# RESULTS VISUALIZATION
# Turning on displaying in Jupyter notebook
plv.enable_notebook()
# Preparing our results for visualization
vis = btm.vis_prepare_model(
model.matrix_topics_words_,
p_zd,
docs_lens,
model.vocabulary_,
tf
)
# Displaying the results
plv.display(vis)
btm.plot_model(model=model, docs=texts)
```

![Report interface](images/topics_terms_plots.png)

## Tutorial

There is a [tutorial](https://bitermplus.readthedocs.io/en/latest/tutorial.html)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/bitermplus.plot.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ Plotting functions

.. currentmodule:: bitermplus

.. autofunction:: vis_prepare_model
.. autofunction:: plot_model
14 changes: 7 additions & 7 deletions docs/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Mac OS
~~~~~~

First, you need to install the Xcode Command Line Tools and `Homebrew <https://brew.sh>`_.
Then, install `libomp` using `brew`:
Then, install ``libomp`` using ``brew``:

.. code-block:: bash
Expand All @@ -32,9 +32,9 @@ Then, install `libomp` using `brew`:
Requirements
~~~~~~~~~~~~

* Cython
* NumPy
* Pandas
* SciPy
* Scikit-learn
* pyLDAvis (optional)
* cython
* numpy
* pandas
* scipy
* scikit-learn
* tmplot
26 changes: 4 additions & 22 deletions docs/source/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,31 +66,13 @@ To calculate perplexity, we must provide documents vs topics probability matrix
Visualizing results
-------------------

For results visualization, we will use `pyLDAvis
<https://pypi.org/project/pyLDAvis/>`_ package.
To visualize the results, we will use the `tmplot
<https://pypi.org/project/tmplot/>`_ package.

.. code-block:: python
# Calculate terms frequency
term_freq = np.array(X.sum(axis=0)).ravel()
# Calculate vectorized documents lengths
docs_lens = list(map(len, docs_vec))
# Prepare results for visualization
vis = btm.vis_prepare_model(
model_ref.matrix_topics_words_,
dtd,
docs_lens,
model_ref.vocabulary_,
term_freq
)
# Enable Jupyter notebook support
plv.enable_notebook()
# Finally, display the results
plv.display(vis)
# Run the interactive report interface
btm.plot_model(model=model, docs=texts)
Filtering stable topics
-----------------------
Expand Down
Binary file added images/topics_terms_plots.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ cython
scipy
pandas
scikit-learn
pyLDAvis
tmplot
tqdm
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ install_requires =
pandas
scipy
scikit-learn
pyLDAvis
tmplot
tqdm

[options.packages.find]
Expand Down
2 changes: 1 addition & 1 deletion src/bitermplus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.6.6'
__version__ = '0.6.7'

from bitermplus._btm import BTM
from bitermplus._util import *
Expand Down
70 changes: 34 additions & 36 deletions src/bitermplus/_plot.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,38 @@
__all__ = ['vis_prepare_model']
from pyLDAvis import prepare as plv_prepare
import numpy as np
__all__ = ['plot_model']
from tmplot import report as plot_model

# def vis_prepare_model(
# ttd: np.ndarray,
# dtd: np.ndarray,
# docs_len: np.ndarray,
# vocab: np.ndarray,
# term_freq: np.ndarray,
# **kwargs: dict):
# """Simple wrapper around :meth:`pyLDAvis.prepare` method.

def vis_prepare_model(
ttd: np.ndarray,
dtd: np.ndarray,
docs_len: np.ndarray,
vocab: np.ndarray,
term_freq: np.ndarray,
**kwargs: dict):
"""Simple wrapper around :meth:`pyLDAvis.prepare` method.
# Parameters
# ----------
# ttd : np.ndarray
# Topics vs words probabilities matrix (T x W).
# dtd : np.ndarray
# Document vs topics probabilities (D x T).
# docs_len : np.ndarray
# The length of each document, i.e. the number of words in each document.
# The order of the numbers should be consistent with the ordering of the
# docs in `dtd` (D x 1).
# vocab : np.ndarray
# List of all the words in the corpus used to train the model (W x 1).
# term_freq : np.ndarray
# The count of each particular term over the entire corpus (W x 1).
# **kwargs : dict
# Keyword arguments passed to :meth:`pyLDAvis.prepare` method.

Parameters
----------
ttd : np.ndarray
Topics vs words probabilities matrix (T x W).
dtd : np.ndarray
Document vs topics probabilities (D x T).
docs_len : np.ndarray
The length of each document, i.e. the number of words in each document.
The order of the numbers should be consistent with the ordering of the
docs in `dtd` (D x 1).
vocab : np.ndarray
List of all the words in the corpus used to train the model (W x 1).
term_freq : np.ndarray
The count of each particular term over the entire corpus (W x 1).
**kwargs : dict
Keyword arguments passed to :meth:`pyLDAvis.prepare` method.
# Returns
# -------
# data : PreparedData
# Output of :meth:`pyLDAvis.prepare` method.
# """

Returns
-------
data : PreparedData
Output of :meth:`pyLDAvis.prepare` method.
"""

vis_data = plv_prepare(
ttd, dtd, docs_len, vocab, term_freq, **kwargs)
return vis_data
# vis_data = plv_prepare(
# ttd, dtd, docs_len, vocab, term_freq, **kwargs)
# return vis_data
24 changes: 14 additions & 10 deletions src/bitermplus/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
'get_closest_topics', 'get_top_topic_words',
'get_top_topic_docs']

from typing import List, Union, Tuple, Dict
from typing import List, Union, Tuple, Dict, Sequence, Any
from scipy.sparse import csr
from pandas import DataFrame, Series, concat
from sklearn.feature_extraction.text import CountVectorizer
Expand Down Expand Up @@ -219,17 +219,21 @@ def enum_func(x):

if method == "klb":
val_raw = ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])
all_vs_all_dists[t_ref, t] = val_raw[np.isfinite(val_raw)].sum()
all_vs_all_dists[t_ref, t] = val_raw[
np.isfinite(val_raw)].sum()

elif method == "sklb":
val_raw = ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])\
+ ssp.kl_div(matrix[t, :], matrix_ref[t_ref, :])
all_vs_all_dists[t_ref, t] = val_raw[np.isfinite(val_raw)].sum()
all_vs_all_dists[t_ref, t] = val_raw[
np.isfinite(val_raw)].sum()

elif method == "jsd":
val_raw = 0.5 * ssp.kl_div(matrix_ref[t_ref, :], matrix[t, :])\
val_raw = 0.5 * ssp.kl_div(
matrix_ref[t_ref, :], matrix[t, :])\
+ 0.5 * ssp.kl_div(matrix[t, :], matrix_ref[t_ref, :])
all_vs_all_dists[t_ref, t] = val_raw[np.isfinite(val_raw)].sum()
all_vs_all_dists[t_ref, t] = val_raw[
np.isfinite(val_raw)].sum()

elif method == "jef":
p = matrix_ref[t_ref, :]
Expand Down Expand Up @@ -350,7 +354,7 @@ def get_stable_topics(
def get_top_topic_words(
model: BTM,
words_num: int = 20,
topics_idx: Union[List[int], np.ndarray] = None) -> DataFrame:
topics_idx: Sequence[Any] = None) -> DataFrame:
"""Select top topic words from a fitted model.
Parameters
Expand Down Expand Up @@ -390,21 +394,21 @@ def _select_words(model, topic_id: int):


def get_top_topic_docs(
docs: Union[List[str], np.ndarray],
docs: Sequence[Any],
p_zd: np.ndarray,
docs_num: int = 20,
topics_idx: Union[List[int], np.ndarray] = None) -> DataFrame:
topics_idx: Sequence[Any] = None) -> DataFrame:
"""Select top topic docs from a fitted model.
Parameters
----------
docs : Union[List[str], np.ndarray]
docs : Sequence[Any]
Iterable of documents (e.g. list of strings).
p_zd : np.ndarray
Documents vs topics probabilities matrix.
docs_num : int = 20
The number of documents to select.
topics_idx : Union[List, numpy.ndarray] = None
topics_idx : Sequence[Any] = None
Topics indices. Meant to be used to select only stable
topics.
Expand Down

0 comments on commit a04de59

Please sign in to comment.