Phrases and Phraser allow a generator corpus #1099

Merged Jan 27, 2017 (3 commits)

Changes from 1 commit
40 changes: 31 additions & 9 deletions gensim/models/phrases.py
@@ -62,6 +62,8 @@
import logging
import warnings
from collections import defaultdict
import itertools as it
import types

from six import iteritems, string_types

@@ -70,6 +72,30 @@
logger = logging.getLogger(__name__)


def _is_single(sentence):
"""Returns a tuple consisting of the given `sentence` and a boolean.
The bool is `True` if the given iterator is a single document and `False` otherwise.
This should allow corpus inputs to be generators or generators of generators of strings."""
try:
# This should work with generators and generators of generators
if not sentence:
return sentence, True
elif isinstance(sentence, types.GeneratorType):
nxt = next(sentence)  # peek at the first element; next() works on both Python 2 and 3
sentence = it.chain([nxt], sentence)  # put the peeked element back so nothing is lost
if isinstance(nxt, string_types):
return sentence, True
# If it's not a string, assume it's an iterable of such
else:
return sentence, False
elif isinstance(sentence[0], string_types):
return sentence, True
else:
return sentence, False
except:
return sentence, False
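
As a rough usage sketch (not part of the diff; the toy tokens are made up, and it assumes the new helper is importable as gensim.models.phrases._is_single), a generator of strings is treated as a single sentence while a generator of token lists is treated as a corpus, and the peeked element is chained back so nothing is consumed:

from gensim.models.phrases import _is_single  # assumed import path for this new helper

single = (w for w in [u'human', u'interface'])
single, is_single = _is_single(single)
assert is_single                                   # first item is a string -> one sentence
assert list(single) == [u'human', u'interface']    # the peeked word was chained back

corpus = (tokens for tokens in [[u'human', u'interface'], [u'graph', u'trees']])
corpus, is_single = _is_single(corpus)
assert not is_single                               # first item is a list -> an entire corpus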


class Phrases(interfaces.TransformationABC):
"""
Detect phrases, based on collected collocation counts. Adjacent words that appear
@@ -226,6 +252,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
continue
last_bigram = False

# TODO: Modify type in docstring to indicate that generators work too
def __getitem__(self, sentence):
"""
Convert the input tokens `sentence` (=list of unicode strings) into phrase
@@ -246,10 +273,8 @@ def __getitem__(self, sentence):

"""
warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False

sentence, is_single = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
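
For context, an illustrative sketch (not part of the diff; the toy sentences and parameter values are made up) of what this change enables: indexing a trained Phrases model with a generator corpus rather than only a list of token lists:

from gensim.models.phrases import Phrases

toy = [[u'human', u'interface', u'computer'],
       [u'human', u'interface', u'survey']]
bigram = Phrases(toy, min_count=1, threshold=1)

streamed = (s for s in toy)        # the corpus itself may be a generator
for phrased in bigram[streamed]:
    print(phrased)                 # each item is a token list, detected phrases joined by u'_'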
@@ -327,7 +352,7 @@ def __init__(self, phrases_model):
logger.info('Phraser added %i phrasegrams', count)
logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams))


# TODO: Modify type in docstring to indicate that generators work too
def __getitem__(self, sentence):
"""
Convert the input tokens `sentence` (=list of unicode strings) into phrase
@@ -339,10 +364,7 @@ def __getitem__(self, sentence):
into phrases on the fly, one after another.

"""
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False
sentence, is_single = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
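
Likewise, an illustrative sketch (not part of the diff, reusing the made-up toy corpus from above): the faster Phraser wrapper accepts a generator corpus once it goes through the same _is_single check:

from gensim.models.phrases import Phrases, Phraser

toy = [[u'human', u'interface', u'computer'],
       [u'human', u'interface', u'survey']]
bigram = Phraser(Phrases(toy, min_count=1, threshold=1))

for phrased in bigram[(s for s in toy)]:   # the generator corpus is streamed lazily
    print(phrased)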
23 changes: 23 additions & 0 deletions gensim/test/test_phrases.py
@@ -38,6 +38,10 @@
unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]


def gen_sentences():
return ((w for w in sentence) for sentence in sentences)


class TestPhrasesCommon(unittest.TestCase):
""" Tests that need to be run for both Prases and Phraser classes."""
def setUp(self):
@@ -51,6 +55,11 @@ def testSentenceGeneration(self):
# test that we generate the same amount of sentences as the input
self.assertEqual(len(sentences), len(list(self.bigram_default[sentences])))

def testSentenceGenerationWithGenerator(self):
"""Test basic bigram production when corpus is a generator."""
self.assertEqual(len(list(gen_sentences())),
len(list(self.bigram_default[gen_sentences()])))

def testBigramConstruction(self):
"""Test Phrases bigram construction building."""
# with this setting we should get response_time and graph_minors
@@ -75,6 +84,20 @@ def testBigramConstruction(self):
self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]])
self.assertTrue(u'human_interface' in self.bigram[sentences[-1]])

def testBigramConstructionFromGenerator(self):
"""Test Phrases bigram construction building when corpus is a generator"""
bigram1_seen = False
bigram2_seen = False

for s in self.bigram[gen_sentences()]:
if not bigram1_seen and 'response_time' in s:

Contributor:
what would be lost if just assert 'response_time' in self.bigram[gen_sentences()]

Contributor Author:
This is a cop-out, but I just copied the format of the existing test :P

To streamline that section you could do:

assert len(set(['response_time', 'graph_minors']).intersection(set(it.chain.from_iterable(self.bigram[gen_sentences()])))) == 2

However, the current formulation in both my test and testBigramConstruction short-circuits, so it doesn't have to go through the entire input. This would allow longer test corpora in the future.

In either case, I didn't want to question or rethink the work of whoever designed the tests originally; I just wanted to make sure my changes were non-breaking. Do you want me to change the tests?

bigram1_seen = True
if not bigram2_seen and 'graph_minors' in s:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
self.assertTrue(bigram1_seen and bigram2_seen)
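
For reference (not part of the PR; the method name is hypothetical), the streamlined variant discussed in the review thread above could look roughly like this inside TestPhrasesCommon, with `import itertools as it` added at the top of the test module; unlike the merged test it consumes the whole stream instead of short-circuiting:

    def testBigramConstructionFromGeneratorStreamlined(self):
        """Hypothetical variant: no early break, walks the entire phrased corpus."""
        phrased_tokens = it.chain.from_iterable(self.bigram[gen_sentences()])
        found = {u'response_time', u'graph_minors'}.intersection(phrased_tokens)
        self.assertEqual(found, {u'response_time', u'graph_minors'})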

def testEncoding(self):
"""Test that both utf8 and unicode input work; output must be unicode."""
expected = [u'survey', u'user', u'computer', u'system', u'response_time']