tutorial12-tensorflowdatasets.py
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # silence TF's C++ info/warning logs

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Allocate GPU memory on demand rather than all at once; guarded so the
# script also runs on CPU-only machines.
physical_devices = tf.config.list_physical_devices("GPU")
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
(ds_train, ds_test), ds_info = tfds.load(
    "mnist",
    split=["train", "test"],
    shuffle_files=True,
    as_supervised=True,  # return (image, label) tuples instead of a dict
    with_info=True,  # also return DatasetInfo metadata about the dataset
)
# fig = tfds.show_examples(ds_train, ds_info, rows=4, cols=4)
# print(ds_info)
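
# Optional sanity checks (a hedged sketch, not part of the original script):
# DatasetInfo exposes split sizes and feature specs, which is useful to
# confirm before building the input pipeline.
# print(ds_info.splits["train"].num_examples)  # 60000 for MNIST
# print(ds_info.features["image"].shape)  # (28, 28, 1)
# print(ds_info.features["label"].num_classes)  # 10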
def normalize_img(image, label):
    """Normalize images from uint8 [0, 255] to float32 [0, 1]."""
    return tf.cast(image, tf.float32) / 255.0, label
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 128
# Setup for train dataset: map -> cache -> shuffle -> batch -> prefetch
ds_train = ds_train.map(normalize_img, num_parallel_calls=AUTOTUNE)
ds_train = ds_train.cache()  # cache after the cheap map so later epochs skip it
ds_train = ds_train.shuffle(ds_info.splits["train"].num_examples)  # full shuffle
ds_train = ds_train.batch(BATCH_SIZE)
ds_train = ds_train.prefetch(AUTOTUNE)  # overlap preprocessing with training
# Setup for test dataset (no shuffle needed for evaluation)
ds_test = ds_test.map(normalize_img, num_parallel_calls=AUTOTUNE)
ds_test = ds_test.batch(BATCH_SIZE)
ds_test = ds_test.prefetch(AUTOTUNE)
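
# Optional shape check (a hedged sketch, not in the original tutorial): pull
# one batch to confirm the pipeline emits the expected shapes.
# for images, labels in ds_train.take(1):
#     print(images.shape, labels.shape)  # (128, 28, 28, 1) (128,)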
model = keras.Sequential(
    [
        keras.Input((28, 28, 1)),
        layers.Conv2D(32, 3, activation="relu"),
        layers.Flatten(),
        layers.Dense(10, activation="softmax"),
    ]
)
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)
model.fit(ds_train, epochs=5, verbose=2)
model.evaluate(ds_test)
(ds_train, ds_test), ds_info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    shuffle_files=True,
    as_supervised=True,  # return (text, label) tuples instead of a dict
    with_info=True,  # also return DatasetInfo metadata about the dataset
)
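
# Note: tfds.features.text was deprecated in later TFDS releases; there the
# same classes live under tfds.deprecated.text (an assumption about your
# installed version; adjust the module path if needed).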
tokenizer = tfds.features.text.Tokenizer()
def build_vocabulary():
    vocabulary = set()
    for text, _ in ds_train:
        vocabulary.update(tokenizer.tokenize(text.numpy().lower()))
    return vocabulary
vocabulary = build_vocabulary()
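
# Optional check (a hedged sketch, not in the original script):
# print(f"Vocabulary size: {len(vocabulary)}")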
encoder = tfds.features.text.TokenTextEncoder(
    list(vocabulary), oov_token="<UNK>", lowercase=True, tokenizer=tokenizer
)
def my_enc(text_tensor, label):
    # Runs eagerly (via tf.py_function below), so .numpy() is available here.
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label
def encode_map_fn(text, label):
    # py_function doesn't set the shape of the returned tensors.
    encoded_text, label = tf.py_function(
        my_enc, inp=[text, label], Tout=(tf.int64, tf.int64)
    )
    # `tf.data.Dataset`s work best if all components have a shape set,
    # so set the shapes manually:
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label
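
# Hedged sketch (not in the original script): eagerly encode one raw example
# to see the token ids the encoder produces.
# for text, label in ds_train.take(1):
#     print(encoder.encode(text.numpy())[:10], int(label.numpy()))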
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE)
ds_train = ds_train.cache()
ds_train = ds_train.shuffle(1000)
ds_train = ds_train.padded_batch(32, padded_shapes=([None], ()))
ds_train = ds_train.prefetch(AUTOTUNE)
ds_test = ds_test.map(encode_map_fn, num_parallel_calls=AUTOTUNE)
ds_test = ds_test.padded_batch(32, padded_shapes=([None], ()))
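
# Hedged check (not in the original script): padded_batch pads each batch to
# the longest sequence in that batch, so the time dimension varies per batch.
# for texts, labels in ds_train.take(2):
#     print(texts.shape)  # (32, longest_sequence_in_this_batch)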
model = keras.Sequential(
    [
        layers.Masking(mask_value=0),  # treat padded positions (id 0) as masked
        layers.Embedding(input_dim=len(vocabulary) + 2, output_dim=32),  # +2: padding id and <UNK>
        layers.GlobalAveragePooling1D(),
        layers.Dense(64, activation="relu"),
        layers.Dense(1),  # single logit; pairs with from_logits=True below
    ]
)
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.Adam(3e-4, clipnorm=1),
    metrics=["accuracy"],
)
model.fit(ds_train, epochs=15, verbose=2)
model.evaluate(ds_test)
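
# Hedged inference sketch (not part of the original script): the model outputs
# a single logit, so apply a sigmoid to get P(positive review). The sample
# sentence is made up for illustration.
# sample_ids = encoder.encode("this movie was surprisingly good")
# logit = model.predict(tf.constant([sample_ids]))[0, 0]
# print(tf.sigmoid(logit).numpy())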