Merge branch 'gojomo-load_misc' into develop

piskvorky · Aug 14, 2016 · 15ee57f · 15ee57f
2 parents d9b5667 + 97ad095
commit 15ee57f
Show file tree

Hide file tree

Showing 4 changed files with 168 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 Changes
 =======
-0.13.2
+
+0.13.2, TBD
 
 * wordtopics has changed to word_topics in ldamallet, and fixed issue #764. (@bhargavvader, #771) 
   - assigning wordtopics value of word_topics to keep backward compatibility, for now
@@ -12,6 +13,7 @@ Changes
 * Added LdaMallet support. Added LdaVowpalWabbit, LdaMallet example to notebook. Added test suite for coherencemodel and aggregation.
   Added `topics` parameter to coherencemodel. Can now provide tokenized topics to calculate coherence value (@dsquareindia, #750)
 * Added a check for empty (no words) documents before starting to run the DTM wrapper if model = "fixed" is used (DIM model) as this    causes the an error when such documents are reached in training. (@eickho, #806)
+* New parameters `limit`, `datatype` for load_word2vec_format(); `lockf` for intersect_word2vec_format() (@gojomo, #817)
 
 0.13.1, 2016-06-22
 

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -1054,7 +1054,8 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False):
                     fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
 
     @classmethod
-    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict'):
+    def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
+                             limit=None, datatype=REAL):
         """
         Load the input-hidden weight matrix from the original C word2vec-tool format.
 
@@ -1070,6 +1071,18 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
         If you trained the C model using non-utf8 encoding for words, specify that
         encoding in `encoding`.
 
+        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
+        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
+        file may include word tokens truncated in the middle of a multibyte unicode character
+        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
+
+        `limit` sets a maximum number of word-vectors to read from the file. The default,
+        None, means read all.
+
+        `datatype` (experimental) can coerce dimensions to a non-default float type (such
+        as np.float16) to save memory. (Such types may result in much slower bulk operations
+        or incompatibility with optimized routines.)
+
         """
         counts = None
         if fvocab is not None:
@@ -1084,8 +1097,10 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
         with utils.smart_open(fname) as fin:
             header = utils.to_unicode(fin.readline(), encoding=encoding)
             vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
+            if limit:
+                vocab_size = min(vocab_size, limit)
             result = cls(size=vector_size)
-            result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
+            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)
 
             def add_word(word, weights):
                 word_id = len(result.vocab)
@@ -1114,13 +1129,18 @@ def add_word(word, weights):
                         ch = fin.read(1)
                         if ch == b' ':
                             break
+                        if ch == b'':
+                            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                         if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                             word.append(ch)
                     word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                     weights = fromstring(fin.read(binary_len), dtype=REAL)
                     add_word(word, weights)
             else:
-                for line_no, line in enumerate(fin):
+                for line_no in xrange(vocab_size):
+                    line = fin.readline()
+                    if line == b'':
+                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                     parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                     if len(parts) != vector_size + 1:
                         raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
@@ -1137,14 +1157,18 @@ def add_word(word, weights):
         logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
         return result
 
-    def intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicode_errors='strict'):
+    def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'):
         """
         Merge the input-hidden weight matrix from the original C word2vec-tool format
         given, where it intersects with the current vocabulary. (No words are added to the
         existing vocabulary, but intersecting words adopt the file's weights, and
         non-intersecting words are left alone.)
 
         `binary` is a boolean indicating whether the data is in binary word2vec format.
+
+        `lockf` is a lock-factor value to be set for any imported word-vectors; the
+        default value of 0.0 prevents further updating of the vector during subsequent
+        training. Use 1.0 to allow further training updates of merged vectors.
         """
         overlap_count = 0
         logger.info("loading projection weights from %s" % (fname))
@@ -1170,7 +1194,7 @@ def intersect_word2vec_format(self, fname, binary=False, encoding='utf8', unicod
                     if word in self.vocab:
                         overlap_count += 1
                         self.syn0[self.vocab[word].index] = weights
-                        self.syn0_lockf[self.vocab[word].index] = 0.0  # lock it
+                        self.syn0_lockf[self.vocab[word].index] = lockf  # lock-factor: 0.0 stops further changes
             else:
                 for line_no, line in enumerate(fin):
                     parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
@@ -1710,7 +1734,8 @@ def __iter__(self):
             while True:
                 text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
                 if text == rest:  # EOF
-                    sentence.extend(rest.split())  # return the last chunk of words, too (may be shorter/longer)
+                    words = utils.to_unicode(text).split()
+                    sentence.extend(words)  # return the last chunk of words, too (may be shorter/longer)
                     if sentence:
                         yield sentence
                     break

diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py
diff --git a/gensim/scripts/make_wiki_online_nodebug.py b/gensim/scripts/make_wiki_online_nodebug.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
+# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+
+"""
+USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
+
+Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
+bz2-compressed dump of Wikipedia articles, in XML format.
+
+This creates the following files:
+
+* `OUTPUT_PREFIX_wordids.txt.bz2`: mapping between words and their integer ids
+* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
+  Matrix Market format
+* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
+* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump
+
+The output Matrix Market files can then be compressed (e.g., by bzip2) to save
+disk space; gensim's corpus iterators can work with compressed input, too.
+
+`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
+removing tokens that appear in more than 10%% of all documents). Defaults to
+100,000.
+
+If you have the `pattern` package installed, this script will use a fancy
+lemmatization to get a lemma of each token (instead of plain alphabetic
+tokenizer). The package is available at https://github.com/clips/pattern .
+
+Example: python -m gensim.scripts.make_wiki_online_nodebug ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en
+"""
+
+
+import logging
+import os.path
+import sys
+
+from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
+from gensim.models import TfidfModel
+
+
+# Wiki is first scanned for all distinct word types (~7M). The types that
+# appear in more than 10% of articles are removed and from the rest, the
+# DEFAULT_DICT_SIZE most frequent types are kept.
+DEFAULT_DICT_SIZE = 100000
+
+
+if __name__ == '__main__':
+    program = os.path.basename(sys.argv[0])
+    logger = logging.getLogger(program)
+
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
+    logging.root.setLevel(level=logging.INFO)
+    logger.info("running %s" % ' '.join(sys.argv))
+
+    # check and process input arguments
+    if len(sys.argv) < 3:
+        print(globals()['__doc__'] % locals())
+        sys.exit(1)
+    inp, outp = sys.argv[1:3]
+
+    if not os.path.isdir(os.path.dirname(outp)):
+        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")
+
+    if len(sys.argv) > 3:
+        keep_words = int(sys.argv[3])
+    else:
+        keep_words = DEFAULT_DICT_SIZE
+    online = 'online' in program
+    lemmatize = 'lemma' in program
+    debug = 'nodebug' not in program
+
+    if online:
+        dictionary = HashDictionary(id_range=keep_words, debug=debug)
+        dictionary.allow_update = True # start collecting document frequencies
+        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
+        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
+        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
+        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
+        dictionary.save_as_text(outp + '_wordids.txt.bz2')
+        wiki.save(outp + '_corpus.pkl.bz2')
+        dictionary.allow_update = False
+    else:
+        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
+        # only keep the most frequent words (out of total ~8.2m unique tokens)
+        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
+        # save dictionary and bag-of-words (term-document frequency matrix)
+        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
+        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
+        # load back the id->word mapping directly from file
+        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
+        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
+    del wiki
+
+    # initialize corpus reader and word->id mapping
+    mm = MmCorpus(outp + '_bow.mm')
+
+    # build tfidf, ~50min
+    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
+    tfidf.save(outp + '.tfidf_model')
+
+    # save tfidf vectors in matrix market format
+    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
+    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
+
+    logger.info("finished running %s" % program)
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -110,6 +110,30 @@ def testPersistenceWord2VecFormat(self):
         norm_only_model.init_sims(replace=True)
         self.assertFalse(numpy.allclose(model['human'], norm_only_model['human']))
         self.assertTrue(numpy.allclose(model.syn0norm[model.vocab['human'].index], norm_only_model['human']))
+        limited_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, limit=3)
+        self.assertEquals(len(limited_model.syn0), 3)
+        half_precision_model = word2vec.Word2Vec.load_word2vec_format(testfile(), binary=True, datatype=numpy.float16)
+        self.assertEquals(binary_model.syn0.nbytes, half_precision_model.syn0.nbytes * 2)
+
+    def testTooShortBinaryWord2VecFormat(self):
+        tfile = testfile()
+        model = word2vec.Word2Vec(sentences, min_count=1)
+        model.init_sims()
+        model.save_word2vec_format(tfile, binary=True)
+        f = open(tfile, 'r+b')
+        f.write(b'13')  # write wrong (too-long) vector count
+        f.close()
+        self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=True)
+
+    def testTooShortTextWord2VecFormat(self):
+        tfile = testfile()
+        model = word2vec.Word2Vec(sentences, min_count=1)
+        model.init_sims()
+        model.save_word2vec_format(tfile, binary=False)
+        f = open(tfile, 'r+b')
+        f.write(b'13')  # write wrong (too-long) vector count
+        f.close()
+        self.assertRaises(EOFError, word2vec.Word2Vec.load_word2vec_format, tfile, binary=False)
 
     def testPersistenceWord2VecFormatNonBinary(self):
         """Test storing/loading the entire model in word2vec non-binary format."""