
Commit

Merge branch 'gojomo-load_misc' into develop
tmylk committed Aug 14, 2016
2 parents d9b5667 + 97ad095 commit 15ee57f
Showing 4 changed files with 168 additions and 8 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,7 @@
Changes
=======
-0.13.2
+0.13.2, TBD

* wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, #771)
- assigning wordtopics value of word_topics to keep backward compatibility, for now
@@ -12,6 +13,7 @@ Changes
* Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation.
Added `topics` parameter to coherencemodel. Can now provide tokenized topics to calculate coherence value (@dsquareindia, #750)
* Added a check for empty (no words) documents before starting to run the DTM wrapper if model = "fixed" is used (DIM model), as this causes an error when such documents are reached in training. (@eickho, #806)
+* New parameters `limit`, `datatype` for load_word2vec_format(); `lockf` for intersect_word2vec_format() (@gojomo, #817)

0.13.1, 2016-06-22

37 changes: 31 additions & 6 deletions gensim/models/word2vec.py
@@ -1054,7 +1054,8 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))

    @classmethod
-    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict'):
+    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
+                             limit=None, datatype=REAL):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.
@@ -1070,6 +1071,18 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.

+        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
+        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
+        file may include word tokens truncated in the middle of a multibyte unicode character
+        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
+
+        `limit` sets a maximum number of word-vectors to read from the file. The default,
+        None, means read all.
+
+        `datatype` (experimental) can coerce dimensions to a non-default float type (such
+        as np.float16) to save memory. (Such types may result in much slower bulk operations
+        or incompatibility with optimized routines.)
+
        """
        counts = None
        if fvocab is not None:
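For orientation, a minimal usage sketch of the new parameters; the file path and vector count here are hypothetical, not part of the commit:

```python
import numpy as np
from gensim.models import Word2Vec

# Read only the first 500,000 vectors of a large pretrained file, storing the
# weights as float16 to roughly halve memory use (bulk operations get slower,
# as the docstring warns). unicode_errors='replace' tolerates tokens that were
# truncated mid-multibyte-character by the original word2vec.c tool.
model = Word2Vec.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',  # hypothetical local path
    binary=True, limit=500000, datatype=np.float16, unicode_errors='replace')
print(model.syn0.shape, model.syn0.dtype)
```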
@@ -1084,8 +1097,10 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
+            if limit:
+                vocab_size = min(vocab_size, limit)
            result = cls(size=vector_size)
-            result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
+            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

            def add_word(word, weights):
                word_id = len(result.vocab)
@@ -1114,13 +1129,18 @@ def add_word(word, weights):
                        ch = fin.read(1)
                        if ch == b' ':
                            break
+                        if ch == b'':
+                            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
-                for line_no, line in enumerate(fin):
+                for line_no in xrange(vocab_size):
+                    line = fin.readline()
+                    if line == b'':
+                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
@@ -1137,14 +1157,18 @@ def add_word(word, weights):
logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
return result
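A short sketch of the failure mode these checks introduce; the damaged file name is hypothetical:

```python
from gensim.models import Word2Vec

try:
    # a file whose header claims more vectors than the file actually contains
    model = Word2Vec.load_word2vec_format('truncated-vectors.bin', binary=True)
except EOFError as err:
    print('damaged or mislabeled file:', err)  # raised by the new count checks
```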

-    def intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicode_errors='strict'):
+    def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'):
        """
        Merge the input-hidden weight matrix from the original C word2vec-tool format
        given, where it intersects with the current vocabulary. (No words are added to the
        existing vocabulary, but intersecting words adopt the file's weights, and
        non-intersecting words are left alone.)

        `binary` is a boolean indicating whether the data is in binary word2vec format.

+        `lockf` is a lock-factor value to be set for any imported word-vectors; the
+        default value of 0.0 prevents further updating of the vector during subsequent
+        training. Use 1.0 to allow further training updates of merged vectors.
        """
        overlap_count = 0
        logger.info("loading projection weights from %s" % (fname))
@@ -1170,7 +1194,7 @@ def intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicod
                    if word in self.vocab:
                        overlap_count += 1
                        self.syn0[self.vocab[word].index] = weights
-                        self.syn0_lockf[self.vocab[word].index] = 0.0  # lock it
+                        self.syn0_lockf[self.vocab[word].index] = lockf  # lock-factor: 0.0 stops further changes
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
@@ -1710,7 +1734,8 @@ def __iter__(self):
            while True:
                text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
                if text == rest:  # EOF
-                    sentence.extend(rest.split())  # return the last chunk of words, too (may be shorter/longer)
+                    words = utils.to_unicode(text).split()
+                    sentence.extend(words)  # return the last chunk of words, too (may be shorter/longer)
                    if sentence:
                        yield sentence
                    break
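This hunk appears to be Text8Corpus.__iter__ (the chunked reader for the one-line text8 corpus); a minimal usage sketch under that assumption, with a hypothetical local file:

```python
from gensim.models.word2vec import Text8Corpus

# text8 is a single huge line of space-separated tokens; after the fix above,
# the final chunk is decoded to unicode just like every earlier chunk.
for sentence in Text8Corpus('text8'):  # hypothetical local copy of the corpus
    print(sentence[:5])
    break
```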
1 change: 0 additions & 1 deletion gensim/scripts/make_wiki_online_nodebug.py

This file was deleted.

110 changes: 110 additions & 0 deletions gensim/scripts/make_wiki_online_nodebug.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.
This actually creates three files:
* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
Matrix Matrix format
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump
The output Matrix Market files can then be compressed (e.g., by bzip2) to save
disk space; gensim's corpus iterators can work with compressed input, too.
`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
removing tokens that appear in more than 10%% of all documents). Defaults to
100,000.
If you have the `pattern` package installed, this script will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .
Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
"""


import logging
import os.path
import sys

from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel


# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000


if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    if not os.path.isdir(os.path.dirname(outp)):
        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")

    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
24 changes: 24 additions & 0 deletions gensim/test/test_word2vec.py
@@ -110,6 +110,30 @@ def testPersistenceWord2VecFormat(self):
        norm_only_model.init_sims(replace=True)
        self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
        self.assertTrue(numpy.allclose(model.syn0norm[model.vocab['human'].index], norm_only_model['human']))
+        limited_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, limit=3)
+        self.assertEquals(len(limited_model.syn0), 3)
+        half_precision_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, datatype=numpy.float16)
+        self.assertEquals(binary_model.syn0.nbytes, half_precision_model.syn0.nbytes * 2)

+    def testTooShortBinaryWord2VecFormat(self):
+        tfile = testfile()
+        model = word2vec.Word2Vec(sentences, min_count=1)
+        model.init_sims()
+        model.save_word2vec_format(tfile, binary=True)
+        f = open(tfile, 'r+b')
+        f.write(b'13')  # write wrong (too-long) vector count
+        f.close()
+        self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True)
+
+    def testTooShortTextWord2VecFormat(self):
+        tfile = testfile()
+        model = word2vec.Word2Vec(sentences, min_count=1)
+        model.init_sims()
+        model.save_word2vec_format(tfile, binary=False)
+        f = open(tfile, 'r+b')
+        f.write(b'13')  # write wrong (too-long) vector count
+        f.close()
+        self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=False)

    def testPersistenceWord2VecFormatNonBinary(self):
        """Test storing/loading the entire model in word2vec non-binary format."""
