forked from kno10/WikipediaEntities
-
Notifications
You must be signed in to change notification settings - Fork 1
/
subset-recommended.py
68 lines (59 loc) · 1.8 KB
/
subset-recommended.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python
import gzip, re, sys
# Minimum phrase length (characters)
minlen = 3
# Minimum number of occurrences
mincount = 50
# Minimum trust value (percent) when the best candidate has no exact matches
mintrust = 90
# Minimum trust value (percent) when the best candidate has exact matches
mintrustexact = 80
# Results with exact matches only
exactonly = True
# Minimum contrast, i.e. the second candidate may have at most
# trust < besttrust - mincontrast
mincontrast = 20
# Skip phrases that do not start with a letter
start_alpha_only = True
# Only output the phrase itself, not the corresponding top entity
phrase_only = True
# Match the percentage at the end only:
pat = re.compile(r"^(.*?):[0-9:]+:([0-9]+):([0-9]+)%$")
# Output to stdout:
ou = sys.stdout


def recommend(fields):
    """Return the output record for one input line, or None to skip it.

    *fields* is a decoded input line split on tabs.  Column 0 is the
    phrase, column 2 is the usage count compared against ``mincount``,
    column 3 is the best candidate and column 4 (optional) the runner-up;
    both candidates must match ``pat``.

    The returned record is ``phrase + "\\t" + entity + "\\n"``, where the
    entity part is empty when ``phrase_only`` is set.

    Raises ValueError when the line is malformed: too few columns, a
    non-numeric usage count, or a candidate that does not match ``pat``.
    Candidates are only parsed for lines that pass the cheap count and
    phrase filters, so filtered-out lines are never reported as malformed.
    """
    if len(fields) < 3:
        raise ValueError("expected at least three tab-separated fields")
    phrase = fields[0]
    used = int(fields[2])  # raises ValueError on a non-numeric count
    if used < mincount or len(phrase) < minlen:
        return None
    # Slicing instead of indexing keeps an empty phrase from raising when
    # minlen is configured as 0; "".isalpha() is simply False.
    if start_alpha_only and not phrase[:1].isalpha():
        return None

    best = pat.match(fields[3]) if len(fields) > 3 else None
    if not best:
        raise ValueError("best candidate does not match the pattern")
    trust = float(best.group(3))
    # The second-to-last number of the candidate is non-zero for candidates
    # that are treated as having exact matches.
    isexact = best.group(2) != '0'
    if trust < (mintrustexact if isexact else mintrust):
        return None
    if exactonly and not isexact:
        return None

    if len(fields) > 4:
        second = pat.match(fields[4])
        if not second:
            raise ValueError("second candidate does not match the pattern")
        # Require a clear margin between the best and the second candidate.
        if float(second.group(3)) >= trust - mincontrast:
            return None

    entity = "" if phrase_only else best.group(1)
    # The tab is written even in phrase-only mode to keep the output format
    # identical to what existing consumers of this script receive.
    return phrase + "\t" + entity + "\n"


def filter_lines(lines, out, err):
    """Write the record of every recommended phrase in *lines* to *out*.

    *lines* yields UTF-8 encoded byte strings, one per input line.
    Malformed lines are reported on *err* and skipped instead of aborting
    the whole run.
    """
    for raw in lines:
        fields = raw.decode('utf8').split('\t')
        try:
            record = recommend(fields)
        except ValueError:
            # Plain write() instead of the Python 2 only "print >>" form.
            err.write("Did not match: %r\n" % (fields,))
            continue
        if record is not None:
            out.write(record)


def main(argv=None):
    """Filter the gzip-compressed file named by the first argument.

    *argv* defaults to ``sys.argv``.  Results go to ``ou``, diagnostics to
    ``sys.stderr``.  Exits with a usage message when no file is given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else "subset-recommended.py"
        sys.exit("usage: %s INPUT.gz" % prog)
    with gzip.open(argv[1]) as infile:
        filter_lines(infile, ou, sys.stderr)


if __name__ == "__main__":
    main()