Phrases and Phraser allow a generator corpus #1099

Merged Jan 27, 2017 (3 commits)

Changes from 1 commit
40 changes: 31 additions & 9 deletions gensim/models/phrases.py
@@ -62,6 +62,8 @@
import logging
import warnings
from collections import defaultdict
import itertools as it
import types

from six import iteritems, string_types

@@ -70,6 +72,30 @@
logger = logging.getLogger(__name__)


def _is_single(sentence):
"""Returns a tuple consisting of the given `sentence` and a boolean.
The bool is `True` if the given iterator is a single document and `False` otherwise.
This should allow corpus inputs to be generators or generators of generators of strings."""
try:
# This should work with generators and generators of generators
if not sentence:
return sentence, True
elif isinstance(sentence, types.GeneratorType):
nxt = next(sentence)  # peek at the first element; next() works on both Python 2 and 3
sentence = it.chain([nxt], sentence)  # put the peeked element back so nothing is lost
if isinstance(nxt, string_types):
return sentence, True
# If it's not a string, assume it's an iterable of such
else:
return sentence, False
elif isinstance(sentence[0], string_types):
return sentence, True
else:
return sentence, False
except:
return sentence, False
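
As a rough usage sketch (not part of the diff; the toy tokens are made up, and it assumes the new helper is importable as gensim.models.phrases._is_single), a generator of strings is treated as a single sentence while a generator of token lists is treated as a corpus, and the peeked element is chained back so nothing is consumed:

from gensim.models.phrases import _is_single  # assumed import path for this new helper

single = (w for w in [u'human', u'interface'])
single, is_single = _is_single(single)
assert is_single                                   # first item is a string -> one sentence
assert list(single) == [u'human', u'interface']    # the peeked word was chained back

corpus = (tokens for tokens in [[u'human', u'interface'], [u'graph', u'trees']])
corpus, is_single = _is_single(corpus)
assert not is_single                               # first item is a list -> an entire corpus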


class Phrases(interfaces.TransformationABC):
"""
Detect phrases, based on collected collocation counts. Adjacent words that appear
@@ -226,6 +252,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
continue
last_bigram = False

# TODO: Modify type in docstring to indicate that generators work too
def __getitem__(self, sentence):
"""
Convert the input tokens `sentence` (=list of unicode strings) into phrase
@@ -246,10 +273,8 @@ def __getitem__(self, sentence):

"""
warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False

sentence, is_single = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
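
For context, an illustrative sketch (not part of the diff; the toy sentences and parameter values are made up) of what this change enables: indexing a trained Phrases model with a generator corpus rather than only a list of token lists:

from gensim.models.phrases import Phrases

toy = [[u'human', u'interface', u'computer'],
       [u'human', u'interface', u'survey']]
bigram = Phrases(toy, min_count=1, threshold=1)

streamed = (s for s in toy)        # the corpus itself may be a generator
for phrased in bigram[streamed]:
    print(phrased)                 # each item is a token list, detected phrases joined by u'_'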
@@ -327,7 +352,7 @@ def __init__(self, phrases_model):
logger.info('Phraser added %i phrasegrams', count)
logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams))


# TODO: Modify type in docstring to indicate that generators work too
def __getitem__(self, sentence):
"""
Convert the input tokens `sentence` (=list of unicode strings) into phrase
@@ -339,10 +364,7 @@ def __getitem__(self, sentence):
into phrases on the fly, one after another.

"""
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False
sentence, is_single = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
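
Likewise, an illustrative sketch (not part of the diff, reusing the made-up toy corpus from above): the faster Phraser wrapper accepts a generator corpus once it goes through the same _is_single check:

from gensim.models.phrases import Phrases, Phraser

toy = [[u'human', u'interface', u'computer'],
       [u'human', u'interface', u'survey']]
bigram = Phraser(Phrases(toy, min_count=1, threshold=1))

for phrased in bigram[(s for s in toy)]:   # the generator corpus is streamed lazily
    print(phrased)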
23 changes: 23 additions & 0 deletions gensim/test/test_phrases.py
@@ -38,6 +38,10 @@
unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]


def gen_sentences():
return ((w for w in sentence) for sentence in sentences)


class TestPhrasesCommon(unittest.TestCase):
""" Tests that need to be run for both Prases and Phraser classes."""
def setUp(self):
@@ -51,6 +55,11 @@ def testSentenceGeneration(self):
# test that we generate the same amount of sentences as the input
self.assertEqual(len(sentences), len(list(self.bigram_default[sentences])))

def testSentenceGenerationWithGenerator(self):
"""Test basic bigram production when corpus is a generator."""
self.assertEqual(len(list(gen_sentences())),
len(list(self.bigram_default[gen_sentences()])))

def testBigramConstruction(self):
"""Test Phrases bigram construction building."""
# with this setting we should get response_time and graph_minors
@@ -75,6 +84,20 @@ def testBigramConstruction(self):
self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]])
self.assertTrue(u'human_interface' in self.bigram[sentences[-1]])

def testBigramConstructionFromGenerator(self):
"""Test Phrases bigram construction building when corpus is a generator"""
bigram1_seen = False
bigram2_seen = False

for s in self.bigram[gen_sentences()]:
if not bigram1_seen and 'response_time' in s:

Contributor:
what would be lost if just assert 'response_time' in self.bigram[gen_sentences()]

Contributor Author:
This is a cop-out, but I just copied the format of the existing test :P

To streamline that section you could do:

assert len(set(['response_time', 'graph_minors']).intersection(set(it.chain.from_iterable(self.bigram[gen_sentences()])))) == 2

However, the current formulation in both my test and testBigramConstruction short-circuits, so it doesn't have to go through the entire input. This would allow longer test corpora in the future.

In either case, I didn't want to question or rethink the work of whoever designed the tests originally; I just wanted to make sure my changes were non-breaking. Do you want me to change the tests?

bigram1_seen = True
if not bigram2_seen and 'graph_minors' in s:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
self.assertTrue(bigram1_seen and bigram2_seen)
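
For reference (not part of the PR; the method name is hypothetical), the streamlined variant discussed in the review thread above could look roughly like this inside TestPhrasesCommon, with `import itertools as it` added at the top of the test module; unlike the merged test it consumes the whole stream instead of short-circuiting:

    def testBigramConstructionFromGeneratorStreamlined(self):
        """Hypothetical variant: no early break, walks the entire phrased corpus."""
        phrased_tokens = it.chain.from_iterable(self.bigram[gen_sentences()])
        found = {u'response_time', u'graph_minors'}.intersection(phrased_tokens)
        self.assertEqual(found, {u'response_time', u'graph_minors'})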

def testEncoding(self):
"""Test that both utf8 and unicode input work; output must be unicode."""
expected = [u'survey', u'user', u'computer', u'system', u'response_time']