-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'gojomo-load_misc' into develop
- Loading branch information
Showing
4 changed files
with
168 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz> | ||
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
|
||
""" | ||
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE] | ||
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a | ||
bz2-compressed dump of Wikipedia articles, in XML format. | ||
This actually creates four files: | ||
* `OUTPUT_PREFIX_wordids.txt.bz2`: mapping between words and their integer ids | ||
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in | ||
Matrix Market format | ||
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation | ||
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump | ||
The output Matrix Market files can then be compressed (e.g., by bzip2) to save | ||
disk space; gensim's corpus iterators can work with compressed input, too. | ||
`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after | ||
removing tokens that appear in more than 10%% of all documents). Defaults to | ||
100,000. | ||
If you have the `pattern` package installed, this script will use a fancy | ||
lemmatization to get a lemma of each token (instead of plain alphabetic | ||
tokenizer). The package is available at https://github.com/clips/pattern . | ||
Example: python -m gensim.scripts.make_wikicorpus ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki_en | ||
""" | ||
|
||
|
||
import logging | ||
import os.path | ||
import sys | ||
|
||
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus | ||
from gensim.models import TfidfModel | ||
|
||
|
||
# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and from the rest, the
# most frequent types are kept: VOCABULARY_SIZE of them when that argument is
# given on the command line, DEFAULT_DICT_SIZE otherwise.
DEFAULT_DICT_SIZE = 100000


if __name__ == '__main__':
    # Script entry point: convert a Wikipedia dump into a bag-of-words corpus,
    # a word<->id mapping, a TF-IDF model and a TF-IDF corpus, all written
    # under the OUTPUT_PREFIX given on the command line.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        # the module docstring doubles as the usage message; it interpolates
        # %(program)s from the names defined above
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    # A bare prefix such as "wiki_en" has an empty dirname, which means "the
    # current directory"; fall back to os.curdir so it is not rejected.
    out_dir = os.path.dirname(outp) or os.curdir
    if not os.path.isdir(out_dir):
        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")

    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # The processing mode is selected by the name the script was invoked under.
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        # honour the requested vocabulary size rather than always using the default
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens);
        # honour the requested vocabulary size rather than always using the default
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s", program)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters