diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30c025f67b..b042efafe2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ Changes
 - NOT BACKWARDS COMPATIBLE!
 * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)
 * Implemented LsiModel.docs_processed attribute
+* Added LdaMallet support. Added LdaVowpalWabbit and LdaMallet examples to the notebook. Added test suites for coherencemodel and aggregation.
+  Added `topics` parameter to coherencemodel, so tokenized topics can now be provided directly to calculate their coherence value (@dsquareindia, #750)
 
 0.13.1, 2016-06-22
@@ -22,7 +24,7 @@ Changes
 * SparseMatrixSimilarity returns a sparse matrix if `maintain_sparsity` is True (@davechallis, #590)
 * added functionality for Topics of Words in document - i.e, dynamic topics. (@bhargavvader, #704)
  - also included tutorial which explains new functionalities, and document word-topic colring.
-* Made normalization an explicit transformation. Added 'l1' norm support (@squareindia, #649)
+* Made normalization an explicit transformation. Added 'l1' norm support (@dsquareindia, #649)
 * added term-topics API for most probable topic for word in vocab. (@bhargavvader, #706)
 * build_vocab takes progress_per parameter for smaller output (@zer0n, #624)
 * Control whether to use lowercase for computing word2vec accuracy. (@alantian, #607)
diff --git a/docs/notebooks/topic_coherence_tutorial.ipynb b/docs/notebooks/topic_coherence_tutorial.ipynb
index b8487b2c30..9cf4b19607 100644
--- a/docs/notebooks/topic_coherence_tutorial.ipynb
+++ b/docs/notebooks/topic_coherence_tutorial.ipynb
@@ -38,6 +38,7 @@
     "\n",
     "from gensim.models.coherencemodel import CoherenceModel\n",
     "from gensim.models.ldamodel import LdaModel\n",
+    "from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet\n",
     "from gensim.corpora.dictionary import Dictionary\n",
     "from numpy import array"
    ]
   },
@@ -632,6 +633,110 @@
     "print badcm.get_coherence()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Support for wrappers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This API also supports gensim's _ldavowpalwabbit_ and _ldamallet_ wrappers as input to the `model` parameter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model1 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=50)\n",
+    "model2 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')\n",
+    "cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-14.075813889\n",
+      "-15.1740896045\n"
+     ]
+    }
+   ],
+   "source": [
+    "print cm1.get_coherence()\n",
+    "print cm2.get_coherence()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model1 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet', corpus=corpus, num_topics=2, id2word=dictionary, iterations=50)\n",
+    "model2 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet', corpus=corpus, num_topics=2, id2word=dictionary, iterations=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "cm1 = CoherenceModel(model=model1, texts=texts, coherence='c_v')\n",
+    "cm2 = CoherenceModel(model=model2, texts=texts, coherence='c_v')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.581114877802\n",
+      "0.549865328265\n"
+     ]
+    }
+   ],
+   "source": [
+    "print cm1.get_coherence()\n",
+    "print cm2.get_coherence()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 8bfde8b082..615e4efacc 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -24,11 +24,12 @@
 from gensim.topic_coherence import (segmentation, probability_estimation,
                                     direct_confirmation_measure, indirect_confirmation_measure,
                                     aggregation)
-from gensim.corpora import Dictionary
 from gensim.matutils import argsort
-from gensim.utils import is_corpus
+from gensim.utils import is_corpus, FakeDict
 from gensim.models.ldamodel import LdaModel
-from gensim.models.wrappers import LdaVowpalWabbit
+from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet
+
+import numpy as np
 
 logger = logging.getLogger(__name__)
 
@@ -43,59 +44,80 @@ class CoherenceModel(interfaces.TransformationABC):
     1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
     2. the ``get_coherence()`` method, which returns the topic coherence.
 
+    One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
+    provided if the model does not contain a dictionary already.
     >>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass')  # tm is the trained topic model
     >>> cm.get_coherence()
 
+    Another way of using this feature is through providing tokenized topics such as:
+    >>> topics = [['human', 'computer', 'system', 'interface'],
+                  ['graph', 'minors', 'trees', 'eps']]
+    >>> cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')  # note that a dictionary has to be provided.
+    >>> cm.get_coherence()
+
     Model persistency is achieved via its load/save methods.
     """
-    def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c_v'):
+    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'):
         """
         Args:
         ----
-        model : Pre-trained topic model.
+        model : Pre-trained topic model. Should be provided if topics is not provided.
+        topics : List of tokenized topics. If provided instead of model, dictionary should be provided as well.
+                 e.g. topics = [['human', 'machine', 'computer', 'interface'],
+                                ['graph', 'trees', 'binary', 'widths']]
         texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
         corpus : Gensim document corpus.
-        dictionary : Gensim dictionary mapping of id word to create corpus.
+        dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, this is not needed.
+                     If both are provided, dictionary will be used.
         coherence : Coherence measure to be used. Supported values are:
-                    u_mass
-                    c_v
+                    'u_mass'
+                    'c_v'
+                    For 'u_mass', corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
+                    For 'c_v', texts should be provided. Corpus is not needed.
         """
+        if model is None and topics is None:
+            raise ValueError("One of model or topics has to be provided.")
+        elif topics is not None and dictionary is None:
+            raise ValueError("dictionary has to be provided if topics are to be used.")
         if texts is None and corpus is None:
             raise ValueError("One of texts or corpus has to be provided.")
+        # Check if an associated dictionary is provided.
+        if dictionary is None:
+            if isinstance(model.id2word, FakeDict):
+                raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
+                                 " should be set as the associated dictionary.")
+            else:
+                self.dictionary = model.id2word
+        else:
+            self.dictionary = dictionary
+        # Check for correct inputs for u_mass coherence measure.
         if coherence == 'u_mass':
             if is_corpus(corpus)[0]:
-                if dictionary is None:
-                    if model.id2word[0] == 0:
-                        raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
-                                         "should be set as the dictionary.")
-                    else:
-                        self.dictionary = model.id2word
-                else:
-                    self.dictionary = dictionary
                 self.corpus = corpus
             elif texts is not None:
                 self.texts = texts
-                if dictionary is None:
-                    self.dictionary = Dictionary(self.texts)
-                else:
-                    self.dictionary = dictionary
                 self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
             else:
                 raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)
-
+        # Check for correct inputs for c_v coherence measure.
         elif coherence == 'c_v':
             if texts is None:
                 raise ValueError("'texts' should be provided for %s coherence." % coherence)
             else:
                 self.texts = texts
-                self.dictionary = Dictionary(self.texts)
-                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
-
         else:
             raise ValueError("%s coherence is not currently supported."
                              % coherence)
 
         self.model = model
-        self.topics = self._get_topics()
+        if model is not None:
+            self.topics = self._get_topics()
+        elif topics is not None:
+            self.topics = []
+            for topic in topics:
+                t_i = []
+                for token in topic:
+                    t_i.append(dictionary.token2id[token])
+                self.topics.append(np.array(t_i))
         self.coherence = coherence
         # Set pipeline parameters:
         if self.coherence == 'u_mass':
@@ -116,7 +138,7 @@ def __str__(self):
 
     def _get_topics(self):
         """Internal helper function to return topics from a trained topic model."""
-        topics = []  # FIXME : Meant to work for LDAModel, LdaVowpalWabbit right now. Make it work for others.
+        topics = []
         if isinstance(self.model, LdaModel):
             for topic in self.model.state.get_lambda():
                 bestn = argsort(topic, topn=10, reverse=True)
@@ -125,6 +147,13 @@ def __str__(self):
             for topic in self.model._get_topics():
                 bestn = argsort(topic, topn=10, reverse=True)
                 topics.append(bestn)
+        elif isinstance(self.model, LdaMallet):
+            for topic in self.model.word_topics:
+                bestn = argsort(topic, topn=10, reverse=True)
+                topics.append(bestn)
+        else:
+            raise ValueError("This topic model is not currently supported. Supported topic models are "
+                             "LdaModel, LdaVowpalWabbit and LdaMallet.")
         return topics
 
     def get_coherence(self):
diff --git a/gensim/test/test_aggregation.py b/gensim/test/test_aggregation.py
new file mode 100644
index 0000000000..44e3d16f65
--- /dev/null
+++ b/gensim/test/test_aggregation.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking transformation algorithms (the models package).
+"""
+
+import logging
+import unittest
+
+from gensim.topic_coherence import aggregation
+
+class TestAggregation(unittest.TestCase):
+    def setUp(self):
+        self.confirmed_measures = [1.1, 2.2, 3.3, 4.4]
+
+    def testArithmeticMean(self):
+        """Test arithmetic_mean()"""
+        obtained = aggregation.arithmetic_mean(self.confirmed_measures)
+        expected = 2.75
+        self.assertEqual(obtained, expected)
+
+if __name__ == '__main__':
+    logging.root.setLevel(logging.WARNING)
+    unittest.main()
diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
new file mode 100644
index 0000000000..057f73d01d
--- /dev/null
+++ b/gensim/test/test_coherencemodel.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated tests for checking transformation algorithms (the models package).
+""" + +import logging +import unittest +import os +import os.path +import tempfile + +from gensim.models.coherencemodel import CoherenceModel +from gensim.models.ldamodel import LdaModel +from gensim.models.wrappers import LdaMallet +from gensim.models.wrappers import LdaVowpalWabbit +from gensim.corpora.dictionary import Dictionary + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +datapath = lambda fname: os.path.join(module_path, 'test_data', fname) + +# set up vars used in testing ("Deerwester" from the web tutorial) +texts = [['human', 'interface', 'computer'], + ['survey', 'user', 'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey']] +dictionary = Dictionary(texts) +corpus = [dictionary.doc2bow(text) for text in texts] + + +def testfile(): + # temporary data will be stored to this file + return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') + +class TestCoherenceModel(unittest.TestCase): + def setUp(self): + # Suppose given below are the topics which two different LdaModels come up with. + # `topics1` is clearly better as it has a clear distinction between system-human + # interaction and graphs. Hence both the coherence measures for `topics1` should be + # greater. + self.topics1 = [['human', 'computer', 'system', 'interface'], + ['graph', 'minors', 'trees', 'eps']] + self.topics2 = [['user', 'graph', 'minors', 'system'], + ['time', 'graph', 'survey', 'minors']] + self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=0, iterations=0) + mallet_home = os.environ.get('MALLET_HOME', None) + self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet') if mallet_home else None + if self.mallet_path: + self.malletmodel = LdaMallet(mallet_path=self.mallet_path, corpus=corpus, id2word=dictionary, num_topics=2, iterations=0) + vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None) + if not vw_path: + msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model" + logging.info(msg) + self.vw_path = None + else: + self.vw_path = vw_path + self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus, id2word=dictionary, num_topics=2, passes=0) + + def testUMass(self): + """Test U_Mass topic coherence algorithm on given topics""" + cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass') + self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) + + def testCv(self): + """Test C_v topic coherence algorithm on given topics""" + cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v') + cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v') + self.assertTrue(cm1.get_coherence() > cm2.get_coherence()) + + def testUMassLdaModel(self): + """Perform sanity check to see if u_mass coherence works with LDA Model""" + try: + cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass') + except: + raise + + def testCvLdaModel(self): + """Perform sanity check to see if c_v coherence works with LDA Model""" + try: + cm = CoherenceModel(model=self.ldamodel, texts=texts, coherence='c_v') + except: + raise + + def testUMassMalletModel(self): + 
"""Perform sanity check to see if u_mass coherence works with LDA Mallet gensim wrapper""" + if not self.mallet_path: + return + try: + cm = CoherenceModel(model=self.malletmodel, corpus=corpus, coherence='u_mass') + except: + raise + + def testCvMalletModel(self): + """Perform sanity check to see if c_v coherence works with LDA Mallet gensim wrapper""" + if not self.mallet_path: + return + try: + cm = CoherenceModel(model=self.malletmodel, texts=texts, coherence='c_v') + except: + raise + + def testUMassVWModel(self): + """Perform sanity check to see if u_mass coherence works with LDA VW gensim wrapper""" + if not self.vw_path: + return + try: + cm = CoherenceModel(model=self.vwmodel, corpus=corpus, coherence='u_mass') + except: + raise + + def testCvVWModel(self): + """Perform sanity check to see if c_v coherence works with LDA VW gensim wrapper""" + if not self.vw_path: + return + try: + cm = CoherenceModel(model=self.vwmodel, texts=texts, coherence='c_v') + except: + raise + + def testErrors(self): + """Test if errors are raised on bad input""" + # not providing dictionary + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, coherence='u_mass') + # not providing texts for c_v and instead providing corpus + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='c_v') + # not providing corpus or texts for u_mass + self.assertRaises(ValueError, CoherenceModel, topics=self.topics1, dictionary=dictionary, coherence='u_mass') + + def testPersistence(self): + fname = testfile() + model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + + def testPersistenceCompressed(self): + fname = testfile() + '.gz' + model = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass') + model.save(fname) + model2 = CoherenceModel.load(fname) + self.assertTrue(model.get_coherence() == model2.get_coherence()) + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index eaa1b66841..9a783a472a 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -44,6 +44,7 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): def log_ratio_measure(segmented_topics, per_topic_postings, num_docs): """ + Popularly known as PMI. This function calculates the log-ratio-measure which is used by coherence measures such as c_v. This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index e41cb778f1..1af0dae8e8 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -59,10 +59,14 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam ---- topics : Topics obtained from the trained topic model. segmented_topics : segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. 
-    per_topic_postings : per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
+    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics.
     measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
     gamma : Gamma value for computing W', W* vectors.
     num_docs : Total number of documents in corresponding corpus.
+
+    Returns:
+    -------
+    s_cos_sim : Array of cosine similarities of the context vectors for each segmentation.
     """
     if measure == 'nlr':
         measure = direct_confirmation_measure.normalized_log_ratio_measure
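
Below is a minimal usage sketch of the new `topics` parameter introduced by this change. It is adapted from the CoherenceModel docstring and the Deerwester-style fixtures in the new test_coherencemodel.py; the toy texts and the two hand-picked topic lists are illustrative only, and the snippet assumes a gensim installation with this patch applied.

from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Toy "Deerwester" texts, as used in the new test suite.
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'],
         ['user', 'response', 'time'],
         ['trees'],
         ['graph', 'trees'],
         ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Tokenized topics passed directly; a dictionary must accompany them.
good_topics = [['human', 'computer', 'system', 'interface'],
               ['graph', 'minors', 'trees', 'eps']]
bad_topics = [['user', 'graph', 'minors', 'system'],
              ['time', 'graph', 'survey', 'minors']]

good_cm = CoherenceModel(topics=good_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
bad_cm = CoherenceModel(topics=bad_topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')

# The better-separated topics should score a higher (less negative) u_mass coherence.
print(good_cm.get_coherence())
print(bad_cm.get_coherence())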