[MRG] Topic Coherence #750

Merged
merged 10 commits into from
Jul 5, 2016
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -7,6 +7,8 @@ Changes
- NOT BACKWARDS COMPATIBLE!
* Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)
* Implemented LsiModel.docs_processed attribute
* Added LdaMallet support to CoherenceModel. Added LdaVowpalWabbit and LdaMallet examples to the notebook. Added test suites for coherencemodel and aggregation.
Added `topics` parameter to CoherenceModel: tokenized topics can now be provided directly to calculate a coherence value (@dsquareindia, #750)

0.13.1, 2016-06-22

@@ -22,7 +24,7 @@ Changes
* SparseMatrixSimilarity returns a sparse matrix if `maintain_sparsity` is True (@davechallis, #590)
* added functionality for Topics of Words in document - i.e., dynamic topics. (@bhargavvader, #704)
- also included tutorial which explains new functionalities, and document word-topic coloring.
* Made normalization an explicit transformation. Added 'l1' norm support (@squareindia, #649)
* Made normalization an explicit transformation. Added 'l1' norm support (@dsquareindia, #649)
Add these references as hyperlinks -- we're using markdown now in CHANGELOG.
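A hedged sketch of the hyperlinked form the reviewer is asking for (the repository path in the URLs is an assumption, shown only to illustrate the markdown style):

```markdown
* Made normalization an explicit transformation. Added 'l1' norm support
  ([@dsquareindia](https://github.com/dsquareindia), [#649](https://github.com/piskvorky/gensim/pull/649))
```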

* added term-topics API for most probable topic for word in vocab. (@bhargavvader, #706)
* build_vocab takes progress_per parameter for smaller output (@zer0n, #624)
* Control whether to use lowercase for computing word2vec accuracy. (@alantian, #607)
105 changes: 105 additions & 0 deletions docs/notebooks/topic_coherence_tutorial.ipynb
Expand Up @@ -38,6 +38,7 @@
"\n",
"from gensim.models.coherencemodel import CoherenceModel\n",
"from gensim.models.ldamodel import LdaModel\n",
"from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet\n",
"from gensim.corpora.dictionary import Dictionary\n",
"from numpy import array"
]
@@ -632,6 +633,110 @@
"print badcm.get_coherence()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Support for wrappers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This API supports gensim's _LdaVowpalWabbit_ and _LdaMallet_ wrappers as input to the `model` parameter."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model1 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=50)\n",
"model2 = LdaVowpalWabbit('/home/devashish/vw-8', corpus=corpus, num_topics=2, id2word=dictionary, passes=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cm1 = CoherenceModel(model=model1, corpus=corpus, coherence='u_mass')\n",
"cm2 = CoherenceModel(model=model2, corpus=corpus, coherence='u_mass')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-14.075813889\n",
"-15.1740896045\n"
]
}
],
"source": [
"print cm1.get_coherence()\n",
"print cm2.get_coherence()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model1 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=50)\n",
"model2 = LdaMallet('/home/devashish/mallet-2.0.8RC3/bin/mallet',corpus=corpus , num_topics=2, id2word=dictionary, iterations=1)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"cm1 = CoherenceModel(model=model1, texts=texts, coherence='c_v')\n",
"cm2 = CoherenceModel(model=model2, texts=texts, coherence='c_v')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.581114877802\n",
"0.549865328265\n"
]
}
],
"source": [
"print cm1.get_coherence()\n",
"print cm2.get_coherence()"
]
},
{
"cell_type": "markdown",
"metadata": {},
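The notebook above exercises the wrapper path; the PR's other headline feature, the `topics` parameter, maps each token of a user-supplied topic to its integer dictionary id before running the pipeline. A minimal self-contained sketch of that mapping (the `token2id` dict below is a stand-in for a gensim `Dictionary.token2id` mapping, not the real class):

```python
# Stand-in for Dictionary.token2id: token -> integer id (illustrative values).
token2id = {'human': 0, 'computer': 1, 'system': 2, 'interface': 3,
            'graph': 4, 'minors': 5, 'trees': 6, 'eps': 7}

# Tokenized topics as a user would pass them via the new `topics` parameter.
topics = [['human', 'computer', 'system', 'interface'],
          ['graph', 'minors', 'trees', 'eps']]

# Convert each topic's tokens into ids, mirroring what CoherenceModel
# does internally before handing topics to the segmentation stage.
topic_ids = [[token2id[token] for token in topic] for topic in topics]
print(topic_ids)  # [[0, 1, 2, 3], [4, 5, 6, 7]]
```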
81 changes: 55 additions & 26 deletions gensim/models/coherencemodel.py
Expand Up @@ -24,11 +24,12 @@
from gensim.topic_coherence import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
from gensim.corpora import Dictionary
from gensim.matutils import argsort
from gensim.utils import is_corpus
from gensim.utils import is_corpus, FakeDict
from gensim.models.ldamodel import LdaModel
from gensim.models.wrappers import LdaVowpalWabbit
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet

import numpy as np

logger = logging.getLogger(__name__)

@@ -43,59 +44,80 @@ class CoherenceModel(interfaces.TransformationABC):
1. constructor, which initializes the four stage pipeline by accepting a coherence measure,
2. the ``get_coherence()`` method, which returns the topic coherence.

One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
provided if the model does not contain a dictionary already.
>>> cm = CoherenceModel(model=tm, corpus=corpus, coherence='u_mass') # tm is the trained topic model
>>> cm.get_coherence()

Another way of using this feature is through providing tokenized topics such as:
>>> topics = [['human', 'computer', 'system', 'interface'],
... ['graph', 'minors', 'trees', 'eps']]
>>> cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass') # note that a dictionary has to be provided.
>>> cm.get_coherence()

Model persistency is achieved via its load/save methods.
"""
def __init__(self, model, texts=None, corpus=None, dictionary=None, coherence='c_v'):
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, coherence='c_v'):
"""
Args:
----
model : Pre-trained topic model.
model : Pre-trained topic model. Should be provided if topics is not provided.
topics : List of tokenized topics. If this is preferred over model, dictionary should be provided.
e.g. topics = [['human', 'machine', 'computer', 'interface'],
['graph', 'trees', 'binary', 'widths']]
texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator.
corpus : Gensim document corpus.
dictionary : Gensim dictionary mapping of id word to create corpus.
dictionary : Gensim dictionary mapping of id to word, used to create the corpus. Not needed if model.id2word is present.
If both are provided, dictionary will be used.
coherence : Coherence measure to be used. Supported values are:
u_mass
c_v
'u_mass'
'c_v'
For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
For 'c_v' texts should be provided. Corpus is not needed.
"""
if model is None and topics is None:
raise ValueError("One of model or topics has to be provided.")
elif topics is not None and dictionary is None:
raise ValueError("dictionary has to be provided if topics are to be used.")
if texts is None and corpus is None:
raise ValueError("One of texts or corpus has to be provided.")
# Check if associated dictionary is provided.
if dictionary is None:
if isinstance(model.id2word, FakeDict):
raise ValueError("The associated dictionary should be provided with the corpus, or the topic model's 'id2word'"
" attribute should be set as the associated dictionary.")
else:
self.dictionary = model.id2word
else:
self.dictionary = dictionary
# Check for correct inputs for u_mass coherence measure.
if coherence == 'u_mass':
if is_corpus(corpus)[0]:
if dictionary is None:
if model.id2word[0] == 0:
raise ValueError("The associated dictionary should be provided with the corpus or 'id2word' for topic model"
"should be set as the dictionary.")
else:
self.dictionary = model.id2word
else:
self.dictionary = dictionary
self.corpus = corpus
elif texts is not None:
self.texts = texts
if dictionary is None:
self.dictionary = Dictionary(self.texts)
else:
self.dictionary = dictionary
self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
else:
raise ValueError("Either 'corpus' with 'dictionary' or 'texts' should be provided for %s coherence." % coherence)

# Check for correct inputs for c_v coherence measure.
elif coherence == 'c_v':
if texts is None:
raise ValueError("'texts' should be provided for %s coherence." % coherence)
else:
self.texts = texts
self.dictionary = Dictionary(self.texts)
self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]

else:
raise ValueError("%s coherence is not currently supported." % coherence)

self.model = model
self.topics = self._get_topics()
if model is not None:
self.topics = self._get_topics()
elif topics is not None:
self.topics = []
for topic in topics:
t_i = np.array([self.dictionary.token2id[token] for token in topic])
self.topics.append(t_i)
self.coherence = coherence
# Set pipeline parameters:
if self.coherence == 'u_mass':
@@ -116,7 +138,7 @@ def __str__(self):

def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
topics = [] # FIXME : Meant to work for LDAModel, LdaVowpalWabbit right now. Make it work for others.
topics = []
if isinstance(self.model, LdaModel):
for topic in self.model.state.get_lambda():
bestn = argsort(topic, topn=10, reverse=True)
@@ -125,6 +147,13 @@ def _get_topics(self):
for topic in self.model._get_topics():
bestn = argsort(topic, topn=10, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaMallet):
for topic in self.model.word_topics:
bestn = argsort(topic, topn=10, reverse=True)
topics.append(bestn)
else:
raise ValueError("This topic model is not currently supported. Supported topic models are "
"LdaModel, LdaVowpalWabbit and LdaMallet.")
return topics

def get_coherence(self):
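`_get_topics()` relies on `gensim.matutils.argsort(topic, topn=10, reverse=True)` to pick the ten highest-weighted word ids from each topic's weight vector. A dependency-free sketch of the same top-N selection (the function name `top_n_indices` is illustrative, not part of gensim):

```python
def top_n_indices(weights, topn):
    """Return the indices of the topn largest values, largest first."""
    return sorted(range(len(weights)), key=lambda i: weights[i], reverse=True)[:topn]

# A toy topic-word weight vector; indices play the role of word ids.
topic_weights = [0.05, 0.40, 0.10, 0.30, 0.15]
print(top_n_indices(topic_weights, 3))  # [1, 3, 4]
```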
28 changes: 28 additions & 0 deletions gensim/test/test_aggregation.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""

import logging
import unittest

from gensim.topic_coherence import aggregation

class TestAggregation(unittest.TestCase):
def setUp(self):
self.confirmed_measures = [1.1, 2.2, 3.3, 4.4]

def testArithmeticMean(self):
"""Test arithmetic_mean()"""
obtained = aggregation.arithmetic_mean(self.confirmed_measures)
expected = 2.75
self.assertEqual(obtained, expected)

if __name__ == '__main__':
logging.root.setLevel(logging.WARNING)
unittest.main()
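The aggregation step this test exercises reduces to a simple average of the per-segment confirmation scores. A minimal sketch of what `aggregation.arithmetic_mean` is expected to compute (mirroring the test's fixture, not the library source):

```python
def arithmetic_mean(confirmed_measures):
    """Average the confirmation scores into a single coherence value."""
    return sum(confirmed_measures) / float(len(confirmed_measures))

# Same fixture as TestAggregation above; rounded to sidestep float noise.
print(round(arithmetic_mean([1.1, 2.2, 3.3, 4.4]), 10))  # 2.75
```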