-
Notifications
You must be signed in to change notification settings - Fork 0
/
tfrecord_converter
98 lines (75 loc) · 3.62 KB
/
tfrecord_converter
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 15 03:25:06 2017
"""
from random import shuffle
import glob, sys
from PIL import Image
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
class TfConverter:
    """Turn a folder of cat/dog images into TFRecord files.

    Typical flow: ``prepare`` collects paths and labels,
    ``train_test_valid_set`` splits them 60/20/20, and ``create_records``
    writes one TFRecord file per split.
    """

    def __init__(self):
        pass

    def prepare(self, training_set_dir, shuffle_data=True):
        """Collect image paths and derive a binary label for each one.

        Args:
            training_set_dir: glob pattern (e.g. ``'train/*.jpg'``) that
                matches the image files.
            shuffle_data: when true, shuffle paths and labels together so
                that every path keeps its own label.

        Returns:
            ``(addrs, labels)``. Both are tuples when a shuffle actually
            happened and lists otherwise. A label is 0 when the path
            contains the substring ``'cat'`` and 1 otherwise.
        """
        images_loc = glob.glob(training_set_dir)
        # NOTE(review): the substring test runs against the whole path, so a
        # parent directory containing 'cat' labels every file 0. Kept as-is
        # because per-class directory layouts may rely on it.
        labels = [0 if 'cat' in img_loc else 1 for img_loc in images_loc]

        # Skip the shuffle for an empty match: zip(*[]) yields nothing and
        # the two-name unpacking below would raise ValueError.
        if shuffle_data and images_loc:
            pairs = list(zip(images_loc, labels))
            shuffle(pairs)
            addrs, labels = zip(*pairs)
        else:
            addrs = images_loc
        return addrs, labels

    def train_test_valid_set(self, addrs, labels):
        """Split paths and labels into 60% train, 20% validation, 20% test.

        Args:
            addrs: sequence of image paths.
            labels: sequence of labels, parallel to ``addrs``.

        Returns:
            ``(train_set, valid_set, test_set)`` where each set is an
            ``(addrs_slice, labels_slice)`` pair.

        Raises:
            ValueError: if ``addrs`` and ``labels`` differ in length.
        """
        total = len(addrs)
        if total != len(labels):
            raise ValueError(
                'addrs and labels must have the same length '
                '({} != {})'.format(total, len(labels)))

        # Compute the cut points once so both sequences are sliced at
        # exactly the same indices.
        train_end = int(0.6 * total)
        valid_end = int(0.8 * total)

        train_set = (addrs[:train_end], labels[:train_end])
        valid_set = (addrs[train_end:valid_end], labels[train_end:valid_end])
        test_set = (addrs[valid_end:], labels[valid_end:])
        return train_set, valid_set, test_set

    def load_image(self, image_loc, dim):
        """Read an image and resize it to ``dim`` (a 2-tuple) with LANCZOS.

        Returns the resized image as a NumPy array. No colour-mode
        conversion is applied, so the array layout follows the source image.
        """
        # The context manager closes the underlying file as soon as the
        # resized copy exists instead of waiting for garbage collection.
        with Image.open(image_loc) as im:
            resized = im.resize(dim, Image.LANCZOS)
        return np.asarray(resized)

    def create_records(self, dataset, filename, dim=(224, 224)):
        """Write every (image, label) pair of ``dataset`` to a TFRecord file.

        Args:
            dataset: tuple with the image paths at index 0 and the matching
                labels at index 1.
            filename: path of the TFRecord file to create.
            dim: 2-tuple every image is resized to before being written.

        Each record has an int64 ``'label'`` feature and a bytes ``'image'``
        feature holding the raw array bytes. Shape and dtype are not stored
        in the record.
        """
        addrs, labels = dataset[0], dataset[1]
        total = len(addrs)

        # The with-block closes the writer even if an image fails to load.
        with self._open_writer(filename) as writer:
            for i, addr in enumerate(addrs):
                # Progress report every 1000 images.
                if not i % 1000:
                    print('writing data: {}/{}'.format(i, total))
                    sys.stdout.flush()

                img = self.load_image(addr, dim)
                print('size: '+str(np.shape(img)))
                label = labels[i]

                feature = {
                    'label': self._int64_feature(label),
                    # tobytes() replaces the deprecated ndarray.tostring().
                    'image': self._bytes_feature(
                        tf.compat.as_bytes(img.tobytes())),
                }
                # Build an Example protocol buffer and append it serialized.
                example = tf.train.Example(
                    features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())
        sys.stdout.flush()

    def _open_writer(self, filename):
        """Return a TFRecordWriter for ``filename`` on old and new TF."""
        # tf.python_io is absent in TF 2.x, so prefer tf.io when it exists.
        io_module = getattr(tf, 'io', None)
        writer_cls = getattr(io_module, 'TFRecordWriter', None)
        if writer_cls is None:
            writer_cls = tf.python_io.TFRecordWriter
        return writer_cls(filename)

    def _int64_feature(self, value):
        """Wrap a single integer in an int64-list ``tf.train.Feature``."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _bytes_feature(self, value):
        """Wrap a single bytes value in a bytes-list ``tf.train.Feature``."""
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))