forked from mhagiwara/cc-kedict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
validate.py
65 lines (43 loc) · 2.06 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import yaml
# Keys that every entry must contain.
MANDATORY_FIELDS = [
    'word',
    'pos',
    'defs',
]

# Keys an entry is allowed to contain in addition to the mandatory ones.
OPTIONAL_FIELDS = [
    'index',
    'romaja',
    'hanja',
    'conj',
    'notes',
    'tags',
    'syns',
    'ants',
    'rels',
    'cf',
    'ders',
]

# Accepted values for an entry's `pos` field.
POS = [
    'n',
    'v',
    'a',
    'pron',
    'propn',
    'intj',
    'det',
    'part',
    'adv',
    'num',
    'abbrev',
    'suf',
    'pref',
]

# Accepted items inside an entry's `tags` list.
TAGS = [
    'topik1',
    'ik-b1',
]
def validate_entry(entry):
    """Check a single dictionary entry against the expected schema.

    The checks run in a fixed order and stop at the first failure: entry
    type, mandatory fields, unknown fields, `pos` value, `conj` type,
    `tags` type, and finally each individual tag.

    Args:
        entry: One item of the parsed dictionary data.

    Raises:
        ValueError: On the first check that fails.
    """
    # TODO: Use JSON Schema instead to validate
    if not isinstance(entry, dict):
        raise ValueError('Entry {!r} is not a dict'.format(entry))

    for required in MANDATORY_FIELDS:
        if required in entry:
            continue
        raise ValueError('Entry {!r} does not have the mandatory `{}` field'.format(entry, required))

    # Whatever remains after removing every known field name is unknown.
    leftover = set(entry.keys()) - set(MANDATORY_FIELDS) - set(OPTIONAL_FIELDS)
    if leftover:
        raise ValueError('Entry {!r} has unrecognized field(s): {!r}'.format(entry, leftover))

    pos = entry['pos']
    if pos not in POS:
        raise ValueError('Unrecognized POS tag: {}'.format(pos))

    if 'conj' in entry and not isinstance(entry['conj'], list):
        raise ValueError('Conjugation {!r} is not a list'.format(entry['conj']))

    # `tags` is optional; nothing more to check when it is absent.
    if 'tags' not in entry:
        return

    tags = entry['tags']
    if not isinstance(tags, list):
        raise ValueError('Tags {!r} is not a list'.format(tags))
    for label in tags:
        if label not in TAGS:
            raise ValueError('Unrecognized tag: {}'.format(label))
def validate_dictionary(data):
    """Validate the whole dictionary: every entry, then the word ordering.

    Args:
        data: The parsed dictionary data; must be a list of entries.

    Raises:
        ValueError: If `data` is not a list, if any entry is rejected by
            `validate_entry`, if two neighbouring words cannot be compared
            with each other, or if a word sorts before its predecessor.
    """
    if not isinstance(data, list):
        raise ValueError('The dictionary data is not a list')

    for entry in data:
        validate_entry(entry)

    # Compare each entry with its successor. The ordering is non-strict, so
    # consecutive entries may share the same word.
    for entry1, entry2 in zip(data, data[1:]):
        word1, word2 = entry1['word'], entry2['word']
        try:
            in_order = word1 <= word2
        except TypeError:
            # Words of mismatched types (e.g. a str next to an int) cannot be
            # ordered in Python 3; report that as a validation failure
            # instead of letting a bare TypeError escape.
            raise ValueError('word: {!r} and {!r} cannot be compared'.format(word1, word2))
        if not in_order:
            raise ValueError('word: {} and {} are not in an alphabetical order'.format(word1, word2))
def main():
    """Load `kedict.yml`, validate it, and report the result on stdout.

    Raises:
        ValueError: Propagated from `validate_dictionary` when the loaded
            data is invalid.
    """
    # Open in binary mode so PyYAML detects the file encoding itself instead
    # of the text being decoded with the platform's locale default, which
    # breaks on non-ASCII content where that default is not UTF-8.
    with open('kedict.yml', 'rb') as f:
        data = yaml.load(f, Loader=yaml.FullLoader)

    validate_dictionary(data)

    print('{} entries found.'.format(len(data)))
    print('Validation: PASSED')


if __name__ == '__main__':
    main()