Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Phrases and Phraser allow a generator corpus #1099

Merged
merged 3 commits into from
Jan 27, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 29 additions & 10 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,39 @@
import logging
import warnings
from collections import defaultdict
import itertools as it

from six import iteritems, string_types
from six import iteritems, string_types, next

from gensim import utils, interfaces

logger = logging.getLogger(__name__)


def _is_single(obj):
"""
Check whether `obj` is a single document or an entire corpus.
Returns (is_single, new) 2-tuple, where `new` yields the same
sequence as `obj`.

`obj` is a single document if it is an iterable of strings. It
is a corpus if it is an iterable of documents.
"""
obj_iter = iter(obj)
try:
peek = next(obj_iter)
obj_iter = it.chain([peek], obj_iter)
except StopIteration:
# An empty object is a single document
return True, obj
if isinstance(peek, string_types):
# It's a document, return the iterator
return True, obj_iter
else:
# If the first item isn't a string, assume obj is a corpus
return False, obj_iter


class Phrases(interfaces.TransformationABC):
"""
Detect phrases, based on collected collocation counts. Adjacent words that appear
Expand Down Expand Up @@ -246,10 +271,8 @@ def __getitem__(self, sentence):

"""
warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False

is_single, sentence = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
Expand Down Expand Up @@ -327,7 +350,6 @@ def __init__(self, phrases_model):
logger.info('Phraser added %i phrasegrams', count)
logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams))


def __getitem__(self, sentence):
"""
Convert the input tokens `sentence` (=list of unicode strings) into phrase
Expand All @@ -339,10 +361,7 @@ def __getitem__(self, sentence):
into phrases on the fly, one after another.

"""
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False
is_single, sentence = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
Expand Down
36 changes: 36 additions & 0 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]


def gen_sentences():
    """Return the module-level `sentences` as a generator of per-document token generators."""
    token_streams = ((token for token in document) for document in sentences)
    return token_streams


class TestPhrasesCommon(unittest.TestCase):
""" Tests that need to be run for both Prases and Phraser classes."""
def setUp(self):
Expand All @@ -46,11 +50,29 @@ def setUp(self):
self.bigram_utf8 = Phrases(sentences, min_count=1, threshold=1)
self.bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1)

def testEmptyInputsOnBigramConstruction(self):
    """Test that empty inputs don't throw errors and return the expected result.

    Covers an empty list, an empty iterator, and corpora (list, iterator,
    generator) whose documents are all empty.
    """
    model = self.bigram_default
    two_empty_docs = [[], []]

    # Empty list -> empty list
    from_empty_list = list(model[[]])
    self.assertEqual(from_empty_list, [])

    # Empty iterator -> empty list
    from_empty_iterator = list(model[iter(())])
    self.assertEqual(from_empty_iterator, [])

    # List of empty list -> list of empty list
    from_list_corpus = list(model[[[], []]])
    self.assertEqual(from_list_corpus, two_empty_docs)

    # Iterator of empty list -> list of empty list
    from_iterator_corpus = list(model[iter([[], []])])
    self.assertEqual(from_iterator_corpus, two_empty_docs)

    # Iterator of empty iterator -> list of empty list
    from_generator_corpus = list(model[(iter(()) for _ in range(2))])
    self.assertEqual(from_generator_corpus, two_empty_docs)

def testSentenceGeneration(self):
    """Test basic bigram using a dummy corpus.

    The transformed corpus must contain exactly as many sentences as the input.
    """
    expected_count = len(sentences)
    transformed = list(self.bigram_default[sentences])
    self.assertEqual(expected_count, len(transformed))

def testSentenceGenerationWithGenerator(self):
    """Test basic bigram production when corpus is a generator.

    A fresh generator is used for the expected count and for the transform,
    since each one can only be consumed once.
    """
    expected_count = len(list(gen_sentences()))
    transformed = list(self.bigram_default[gen_sentences()])
    self.assertEqual(expected_count, len(transformed))

def testBigramConstruction(self):
"""Test Phrases bigram construction building."""
# with this setting we should get response_time and graph_minors
Expand All @@ -75,6 +97,20 @@ def testBigramConstruction(self):
self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]])
self.assertTrue(u'human_interface' in self.bigram[sentences[-1]])

def testBigramConstructionFromGenerator(self):
"""Test Phrases bigram construction building when corpus is a generator"""
bigram1_seen = False
bigram2_seen = False

for s in self.bigram[gen_sentences()]:
if not bigram1_seen and 'response_time' in s:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What would be lost if we just asserted 'response_time' in self.bigram[gen_sentences()]?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a cop-out, but I just copied the format of the existing test :P

To stream-line that section you could do:

assert len(set(['response_time', 'graph_minors']).intersection(set(it.chain.from_iterable(self.bigram[gen_sentences()])))) == 2

However, the current formulation in both my test and testBigramConstruction short-circuits, so it doesn't have to go through the entire input. This would allow longer test corpora in the future.

In either case, I didn't want to question or rethink the work of whoever designed the tests originally, I just wanted to make sure my changes were non-breaking. Do you want me to change the tests?

bigram1_seen = True
if not bigram2_seen and 'graph_minors' in s:
bigram2_seen = True
if bigram1_seen and bigram2_seen:
break
self.assertTrue(bigram1_seen and bigram2_seen)

def testEncoding(self):
"""Test that both utf8 and unicode input work; output must be unicode."""
expected = [u'survey', u'user', u'computer', u'system', u'response_time']
Expand Down