-
Notifications
You must be signed in to change notification settings - Fork 1
/
util.py
105 lines (90 loc) · 3.3 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Generates feature vectors for logical images in the dataset.
import json
import math
import nltk
import numpy as np
import os
import random
import sys
import tensorflow as tf
from scipy.misc import imread
# NOTE(review): not referenced anywhere in the visible part of this file;
# its exact meaning should be confirmed against the callers that import it.
MAX_NUM_ITEMS = 8
# Special vocabulary tokens. vocab() reserves ids 0, 1 and 2 for them.
# End-of-sentence marker appended to every tokenized sentence.
EOS_TOK = "_EOS"
# Padding token used to fill sentences up to max_len.
NIL_TOK = "_NIL"
# Stand-in for tokens that are missing from the vocabulary.
UNK_TOK = "_UNK"
def img_path_dict(root):
    """Map image codes to the sub-directory of ``root`` that contains them.

    Every immediate sub-directory of ``root`` is scanned for ``.png`` files.
    Filenames are expected to look like ``split-x-y-z.png``; the code is the
    ``x-y`` part (presentation id and position), ``z`` being the permutation
    number.  The first directory in which a code is seen is the one recorded.

    Args:
        root: directory whose sub-directories hold the images.

    Returns:
        dict mapping ``"x-y"`` code strings to directory paths.
    """
    code_to_dir = {}
    for entry in os.listdir(root):
        subdir = os.path.join(root, entry)
        if not os.path.isdir(subdir):
            continue
        for png_name in os.listdir(subdir):
            if not png_name.endswith(".png"):
                continue
            # Fields 1 and 2 of the dash-separated name form the code.
            name_fields = png_name.split("-")
            code_to_dir.setdefault("-".join(name_fields[1:3]), subdir)
    return code_to_dir
class example():
    """One training/evaluation instance: a sentence paired with an image.

    Attributes are stored exactly as given.  load_examples() fills them with
    the token-id sequence, the sentence length before padding, a 0/1 label
    and the image array.
    """

    def __init__(self, sentence, sentence_length, label, image):
        self.sentence, self.sentence_length = sentence, sentence_length
        self.label, self.image = label, image
def _candidate_image_paths(image_dir, filename):
    """Yield the paths to probe for ``filename``, in lookup order.

    The first candidate lives in ``image_dir`` itself.  When the last
    component of ``image_dir`` is an integer, the same filename is then tried
    in the sibling directories numbered one higher and one lower.
    """
    yield os.path.join(image_dir, filename)
    parent, dir_name = os.path.split(image_dir)
    try:
        dir_number = int(dir_name)
    except ValueError:
        # Non-numeric directory name: there are no numbered neighbours to try.
        return
    for offset in (1, -1):
        yield os.path.join(parent, str(dir_number + offset), filename)


def load_examples(lines, tok_to_id, max_len, img_path_dict, split_name, how_many = 6):
    """Build ``example`` objects (sentence ids + image) for each input line.

    Args:
        lines: iterable of dicts with the keys ``"identifier"``,
            ``"sentence"`` and ``"label"``.
        tok_to_id: dict mapping tokens to integer ids; must contain UNK_TOK
            if any sentence has a token outside the vocabulary.
        max_len: sentences no longer than this (EOS included) are padded with
            NIL_TOK up to this length.  Longer sentences are left as they
            are, neither padded nor truncated.
        img_path_dict: dict mapping an identifier to the directory expected
            to hold its images.
        split_name: filename prefix of the images (``split_name-code-i.png``).
        how_many: number of image permutations (``i`` in ``0..how_many-1``)
            to load per line.

    Returns:
        list of ``example`` objects, one per image that was found.  Images
        that cannot be located are reported on stdout and skipped.

    Raises:
        KeyError: if a line's identifier is missing from ``img_path_dict``.
    """
    # NOTE(review): `imread` is imported from `scipy.misc` at file level;
    # recent SciPy releases no longer provide it -- confirm the pinned version.
    examples = [ ]
    for line in lines:
        code = line["identifier"]
        sentence = nltk.word_tokenize(line["sentence"].lower()) + [EOS_TOK]

        # The stored length is the real one (with EOS), before any padding.
        sent_length = len(sentence)
        if sent_length <= max_len:
            sentence += [NIL_TOK] * (max_len - sent_length)

        # Out-of-vocabulary tokens fall back to the UNK id.
        id_seq = [tok_to_id[tok] if tok in tok_to_id else tok_to_id[UNK_TOK]
                  for tok in sentence]

        label = 1 if line["label"] == "true" else 0

        image_dir = img_path_dict[code]
        for i in range(how_many):
            filename = split_name + "-" + code + "-" + str(i) + ".png"
            full_path = None
            for full_path in _candidate_image_paths(image_dir, filename):
                if os.path.exists(full_path):
                    # Drop the last channel of the image (presumably alpha).
                    image_data = imread(full_path)[:, :, :-1]
                    # All images of one line share the same id_seq list.
                    examples.append(example(id_seq, sent_length, label, image_data))
                    break
            else:
                # No candidate existed; report the last path that was tried.
                print("Can't find full image path " + str(full_path))
    return examples
def vocab(lines):
    """Build the vocabulary from the ``"sentence"`` field of every line.

    Sentences are lower-cased and tokenized with ``nltk.word_tokenize``.
    The special tokens EOS_TOK, NIL_TOK and UNK_TOK always occupy ids 0, 1
    and 2; other tokens receive ids in order of first appearance.

    Args:
        lines: iterable of dicts that each contain a ``"sentence"`` string.

    Returns:
        A tuple ``(tok_to_id, id_to_tok, max_len)``: the token -> id dict,
        the id -> token list, and the length of the longest tokenized
        sentence plus one (room for the EOS_TOK that load_examples appends).
    """
    id_to_tok = [EOS_TOK, NIL_TOK, UNK_TOK]
    tok_to_id = dict((tok, idx) for idx, tok in enumerate(id_to_tok))
    longest = 0
    for entry in lines:
        words = nltk.word_tokenize(entry["sentence"].lower())
        longest = max(longest, len(words))
        for word in words:
            if word in tok_to_id:
                continue
            # The next free id is the current size of the id -> token list.
            tok_to_id[word] = len(id_to_tok)
            id_to_tok.append(word)
    return tok_to_id, id_to_tok, longest + 1