-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
vw-doc2lda
69 lines (59 loc) · 1.89 KB
/
vw-doc2lda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/perl -w
# vim: ts=4 sw=4 et
#
# Convert one document to one line that can be fed as one example to vw LDA
#
use Getopt::Std;
use vars qw($opt_f);
# Command line: an optional -f flag followed by the document to convert.
getopts('f') || die "Usage: $0 [-f] <file>\n";
# Without a file name the shell redirection below would be malformed,
# so fail early with the usage message instead.
(defined $ARGV[0] && length $ARGV[0]) || die "Usage: $0 [-f] <file>\n";

# The file name is spliced into a shell command line, so wrap it in single
# quotes (escaping any embedded single quote as '\'') to keep names that
# contain spaces or shell metacharacters from being split or interpreted.
(my $quoted_file = $ARGV[0]) =~ s/'/'\\''/g;
$quoted_file = "'$quoted_file'";

# We always need to ignore non-words and lowercase everything:
# the first tr puts every run of alphabetic characters on its own line,
# the second tr lowercases the result.
my $PRECMD = q{tr -cs '[:alpha:]' '\012' < } . $quoted_file .
             q{ |tr '[:upper:]' '[:lower:]'};

# This optional step '-f' converts the list of words in the order
# of their appearance in the doc, into a sorted BY-FREQUENCY
# list with counts. Pass -f to get it.
my $POSTCMD = $opt_f ? '|sort |uniq -c |sort -n' : '';
#
# Remove stop-words & convert output from the shell pipe-line:
# <count1> <word1>
# <count2> <word2>
# ...
# into one line per doc for VW consumption:
# | <word1>:<count1> <word2>:<count2> ...
#
# Stop-word list: words that are dropped from the output. The list is kept
# as one whitespace-separated string and split into individual words.
my $StopWords = "the be to of and a in that have i it for not on
with he as you do at this but his by from they we say her she or
an will my one all would there their what so up out if about who
get which go me when make can like time no just him know take
into your some could them see other than then its how our any
these us are is has using used also such may";
my @StopWords = split(' ', $StopWords);

# Lookup table keyed by stop-word. Every key maps to 1, so both
# `exists $StopWord{$w}` and a plain truth test on `$StopWord{$w}` work.
# (A hash-slice assignment of a single 1 would set only the first key
# and leave all the others undef.)
my %StopWord = map { $_ => 1 } @StopWords;
# skip_word($token)
#
# Decide whether a token should be left out of the output.
# Returns 1 when the token is a key of %StopWord or is shorter than
# two characters (all 1-char words count as stop-words); returns 0
# otherwise.
sub skip_word($) {
    my ($token) = @_;
    return (exists($StopWord{$token}) || length($token) < 2) ? 1 : 0;
}
# Run the shell pipeline and print the whole document as a single line
# on stdout: a leading '|' followed by " <word>" or " <word>:<count>"
# for every word that is not skipped, then a newline.
#
# Each line read from the pipeline is either "<count> <word>" or a bare
# "<word>"; lines of any other shape are ignored.
my $pipeline = "$PRECMD $POSTCMD";

# The command is a shell pipeline by design, so it is handed over as one
# string; the three-arg '-|' form makes the read-from-command mode explicit.
# Open before printing anything so a failure does not leave a dangling '|'.
open(my $inp, '-|', $pipeline)
    or die "$0: cannot run '$pipeline': $!\n";

print '|';
while (my $line = <$inp>) {
    my ($word, $count);
    if ($line =~ /^\s*(\d+)\s+(\S+)\s*$/) {
        ($count, $word) = ($1, $2);
    }
    elsif ($line =~ /^(\S+)$/) {
        ($word, $count) = ($1, 1);
    }
    else {
        next;
    }
    next if skip_word($word);

    # A count of 1 is left implicit; only larger counts are written out.
    if ($count > 1) {
        printf " %s:%s", $word, $count;
    }
    else {
        printf " %s", $word;
    }
}

# close() on a pipe returns false when the command exits non-zero
# (status in $?) or when the close itself fails (reason in $!).
# Report it, but still terminate the output line.
unless (close($inp)) {
    warn $! ? "$0: error closing pipeline: $!\n"
            : "$0: pipeline exited with status " . ($? >> 8) . "\n";
}
print "\n";