"""
By David Peer.
Implementation of conflicting bundles. See also https://arxiv.org/abs/2011.02956
Usage:
import conflicting_bundles as cb
[...]
cb = cb.bundle_entropy(model, ds, train_batch_size=64, train_learning_rate=1e-3, num_classes=10)
"""
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import math
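

# The functions below assume that the model's forward pass records the
# a^(l) activations into `model.cb`. A minimal sketch of such a model
# (illustrative only; `ProbeModel` and its layer sizes are assumptions,
# not part of this module):
#
#   class ProbeModel(tf.keras.Model):
#       def __init__(self):
#           super().__init__()
#           self.hidden = tf.keras.layers.Dense(128, activation="relu")
#           self.out = tf.keras.layers.Dense(10)
#
#       def call(self, x, y=None, training=False):
#           a_1 = self.hidden(x)
#           a_2 = self.out(a_1)
#           # Record (a^(l), layer) pairs so that bundle_entropy can read them.
#           self.cb = [(a_1, self.hidden), (a_2, self.out)]
#           return a_2

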
def bundle_entropy(model, train_ds, train_batch_size, train_lr,
                   num_classes, evaluation_size=512, all_layers=False):
    """ Given a dataset train_ds and a model, this function returns for
        each a^l the number of bundles and the bundle entropy at time
        step t.

        Limitation: This implementation currently supports only a single
        GPU, i.e. you can train your model with multiple GPUs, but should
        evaluate cb on a single GPU.

        param: model - The model that should be evaluated. Note: We assume that model.cb exists.
               model.cb is a list of tuples with (a, layer) pairs.
               E.g.: model.cb = [(a_1, hidden_layer_1), (a_2, hidden_layer_2)]
        param: train_ds - Training dataset. It is important NOT to use the test set, as we want to
               check how the training itself was negatively influenced. See https://arxiv.org/abs/2011.02956
        param: num_classes - Number of classes, needed to calculate the bundle entropy
        param: train_batch_size - The batch size that was used for training. See https://arxiv.org/abs/2011.02956
        param: train_lr - The learning rate that was used for training. See https://arxiv.org/abs/2011.02956
        param: evaluation_size - Subset size of train_ds to use for the bundle calculation
        param: all_layers - False to evaluate only a^L, otherwise a^1 to a^L are evaluated
        returns: [[num_bundles_1, bundle_entropy_1], ... [num_bundles_L, bundle_entropy_L]]
    """
    train_batch_size = float(train_batch_size)
    layer_eval = 0 if all_layers else -1
    A, Y = [], []

    # Execute in eager mode to get access to model.cb
    def inference(x, y):
        model(x, y, training=False)

    for x, y in train_ds:
        # Stop once evaluation_size samples were collected
        # (len(Y) counts the collected batches)
        if len(Y) * train_batch_size >= evaluation_size:
            break

        inference(x, y)
        if not hasattr(model, 'cb'):
            print("(Warning) The provided model has no cb attribute set.")
            print("Please set a^(l) values in the array cb to measure the bundle entropy.")
            return None

        cb = model.cb[layer_eval:]
        layers = [c[1] for c in cb]
        A.append([c[0] for c in cb])
        Y.append(y)

    Y = tf.concat(Y, axis=0)

    # If needed we return the conflicts for each layer. For evaluation only
    # a^{(L)} is needed, but e.g. to use it with auto-tune all conflicting
    # layers are needed. Therefore, if all layers are evaluated, the
    # complexity is O(L * |X|)
    res = []
    A = zip(*A)
    for i, a in enumerate(A):
        a = tf.cast(tf.concat(a, axis=0), tf.float32)
        a = tf.reshape(a, [tf.shape(a)[0], -1])

        # As written in the paper, we do not compare a_i and a_j directly in
        # order to also consider floating point representations during
        # backpropagation. Instead of doing an inequality check using \gamma,
        # we do an equality check after subtracting the values from the
        # maximum weights, which is equivalent. Note that this is not possible
        # if gamma should be larger than the floating point resolution.
        weights_amplitude = _get_weight_amplitude(layers[i])
        equality_check = weights_amplitude - a * train_lr * (1.0 / train_batch_size)
        equality_check = tf.reshape(equality_check, [tf.shape(equality_check)[0], -1])

        num_bundle, bundle_entropy = _calculate_bundles(equality_check, Y, num_classes)
        res.append([num_bundle, bundle_entropy])
        print("%d, %d, %d, %.5f" % (int(i/2), i % 2, num_bundle, bundle_entropy), flush=True)

    return res


def _get_weight_amplitude(layer):
    """ With this function we approximate the weight amplitude of a layer.
        A layer could consist of multiple sub-layers (e.g. a vgg block with
        multiple layers). Therefore, we take the maximum amplitude over all
        weights in trainable_weights.

        param: layer - Layer for which the amplitude should be known
        return: Single floating point value of the max. weight amplitude over all sub-layers
    """
    if not hasattr(layer, 'trainable_weights') or len(layer.trainable_weights) <= 0:
        return 0.0

    ret = []
    for weights in layer.trainable_weights:
        ret.append(tf.reduce_max(tf.abs(weights)))
    return tf.reduce_max(ret)


def _calculate_bundles(X, Y, num_classes):
    """ Calculates all bundles of X and the bundle entropy using the label
        information from Y. Iterates over X, and therefore the complexity
        is O(|X|) if _calculate_single_bundle can be fully vectorized.
    """
    # Create an array with one entry per sample and assign a unique bundle
    # number (> 0) to each; 0 means "not yet assigned to a bundle"
    bundle = tf.zeros(tf.shape(X)[0], dtype=tf.float32)
    bundle_entropy = tf.constant(0.0, dtype=tf.float32)
    for i in range(len(X)):
        if bundle[i] != 0:
            continue
        bundle, bundle_entropy = _calculate_single_bundle(
            i, bundle, X, Y, bundle_entropy, num_classes)

    num_bundles = tf.reduce_max(bundle)
    return num_bundles, bundle_entropy / float(tf.shape(X)[0])
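
# Worked example (illustrative): for X = [[1., 2.], [1., 2.], [3., 4.]],
# Y = [0, 1, 0] and num_classes = 2, rows 0 and 1 are equal and form one
# bundle with class probabilities [0.5, 0.5], i.e. an entropy of ~log(2);
# row 2 forms a singleton bundle with entropy 0. The function therefore
# returns num_bundles = 2 and bundle_entropy ~ (2 * log(2)) / 3 ~ 0.46.
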
def _calculate_single_bundle(i, bundle, X, Y, bundle_entropy, num_classes):
    """ This function calculates the bundle which contains all x that are
        equal to X[i], using vectorization such that only O(|X|) is needed to
        calculate the bundles of one layer. The idea is to use equality
        operators, mask all others out, set the bundle id and reuse this
        information vector for the next samples. As the vector contains all
        equivalent samples, the bundle entropy at time step t can also be
        calculated immediately.
    """
    dim_X = tf.cast(tf.shape(X)[1], tf.float32)
    next_bundle_id = tf.reduce_max(bundle) + tf.constant(1.0, dtype=tf.float32)
    num_samples = tf.shape(X)[0]
    x = X[i]

    # Ignore all samples that are already bundled (bundle > 0)
    zero_out = tf.cast(tf.less_equal(bundle, 0.0), tf.float32)
    # Set the bundle id for the current X[i]
    bundle += tf.one_hot(i, num_samples, dtype=tf.float32) * next_bundle_id * zero_out
    # Also ignore the current X[i] from now on
    zero_out = tf.cast(tf.less_equal(bundle, 0.0), tf.float32)

    # Get all equivalent components (a single component only of the possibly
    # n-dimensional inputs)
    equal_components = tf.cast(tf.equal(x, X), tf.float32)
    # All components must be equivalent, therefore check (using the sum and
    # dim_X) whether this is the case
    num_equal_components = tf.reduce_sum(equal_components, axis=-1)
    same_bundle = tf.cast(tf.greater_equal(num_equal_components, dim_X), tf.float32)
    # And assign all equal samples to the same bundle
    bundle += same_bundle * next_bundle_id * zero_out

    # Calculate the bundle entropy for the current bundle (same_bundle)
    # using the entropy of the class distribution within the bundle
    bundle_only = tf.cast(tf.boolean_mask(Y, tf.cast(same_bundle, tf.bool)), tf.int32)
    bundle_class_prob = tf.math.bincount(
        bundle_only, minlength=num_classes, maxlength=num_classes, dtype=tf.float32)
    bundle_class_prob /= tf.reduce_sum(bundle_class_prob)
    bundle_size = tf.cast(tf.reduce_sum(same_bundle), tf.float32)
    entropy = -tf.reduce_sum(bundle_class_prob * tf.math.log(bundle_class_prob + 1e-5), axis=-1)
    bundle_entropy += tf.maximum(0.0, entropy) * bundle_size

    return bundle, bundle_entropy
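
# Illustrative trace: with bundle = [0., 0., 0.], i = 0 and X = [[1.], [1.], [2.]],
# zero_out starts as [1., 1., 1.], so row 0 receives the new bundle id 1. Rows
# equal to X[0] (here row 1) are then assigned the same id via same_bundle,
# giving bundle = [1., 1., 0.]; row 2 is handled by a later call.
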
def conflicts_integral(conflicts_int, conflicts, epochs):
    """ Accumulates the per-epoch conflicts into an integral (the average
        over all epochs) per layer.
    """
    if conflicts_int is None:
        return [[c[0], c[1] / float(epochs)] for c in conflicts]

    assert len(conflicts_int) == len(conflicts), \
        "Dimension of the conflict integral does not match."

    ret = []
    for c in range(len(conflicts)):
        ret.append([
            conflicts[c][0],
            conflicts_int[c][1] + (conflicts[c][1] / float(epochs))
        ])
    return ret
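

# Hypothetical end-to-end usage (the names `model`, `train_ds`, `epochs` and
# `train_one_epoch` are assumptions for illustration): conflicts_integral
# accumulates the per-epoch conflicts into an average over the whole training.
#
#   conflicts_int = None
#   for epoch in range(epochs):
#       train_one_epoch(model, train_ds)
#       conflicts = bundle_entropy(model, train_ds, train_batch_size=64,
#                                  train_lr=1e-3, num_classes=10,
#                                  all_layers=True)
#       conflicts_int = conflicts_integral(conflicts_int, conflicts, epochs)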