-
Notifications
You must be signed in to change notification settings - Fork 10
/
attack.py
378 lines (338 loc) · 16.8 KB
/
attack.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim
import numpy as np
import argparse
from train_vgg19 import vgg19
from utils import Noisy,transform,random_label
import os
class L3Attack(torch.autograd.Function):
    """Autograd wrapper around ``L3_function`` with a pass-through gradient.

    The forward pass returns whatever ``L3_function`` produces for ``img``.
    The backward pass hands the incoming gradient to ``img`` unchanged and
    reports no gradient for any other argument.
    """

    @staticmethod
    def forward(ctx, model, img, target_lable, dataset, allstep, sink_lr, s_radius):
        """Run ``L3_function`` on ``img``; ``sink_lr`` is forwarded as its ``lr``."""
        # ``ctx`` is the autograd context; nothing is saved on it because
        # backward() does not need any tensor from the forward pass.
        return L3_function(model, img, target_lable, dataset=dataset,
                           allstep=allstep, lr=sink_lr, s_radius=s_radius)

    @staticmethod
    def backward(ctx, grad_output):
        """Return one gradient slot per forward() input; only ``img`` gets one."""
        # Slots: model, img, target_lable, dataset, allstep, sink_lr, s_radius.
        return None, grad_output, None, None, None, None, None
class L4Attack(torch.autograd.Function):
    """Autograd wrapper around ``L4_function`` with a pass-through gradient.

    The forward pass returns whatever ``L4_function`` produces for ``img``.
    The backward pass hands the incoming gradient to ``img`` unchanged and
    reports no gradient for any other argument.
    """

    @staticmethod
    def forward(ctx, model, img, dataset, allstep, sink_lr, u_radius):
        """Run ``L4_function`` on ``img``; ``sink_lr`` is forwarded as its ``lr``."""
        # ``ctx`` is the autograd context; nothing is saved on it because
        # backward() does not need any tensor from the forward pass.
        return L4_function(model, img, dataset=dataset, allstep=allstep,
                           lr=sink_lr, u_radius=u_radius)

    @staticmethod
    def backward(ctx, grad_output):
        """Return one gradient slot per forward() input; only ``img`` gets one."""
        # Slots: model, img, dataset, allstep, sink_lr, u_radius.
        return None, grad_output, None, None, None, None
def L3_function(model,
                img,
                target_lable,
                dataset,
                allstep,
                lr,
                s_radius,
                margin=20,
                use_margin=False):
    """Return the variable used for the L3 function specified in the paper.

    Starting from a copy of ``img``, takes ``allstep`` gradient-descent steps
    on a targeted loss. After every step the result is clamped to [0, 1] and
    then projected back into the l-infinity ball of radius ``s_radius``
    around ``img``.

    [model] classifier applied to ``transform(x, dataset=dataset)``
    [img] starting image; the working copy is moved to CUDA
    [target_lable] tensor holding the target class index
    [dataset] forwarded to ``transform``
    [allstep] specifies the number of attack steps
    [lr] specifies the learning rate of the attack algorithm A
    [s_radius] specifies the maximum l infinity distance between the original
        [img] and the returned image
    [margin] margin of the hinge loss, only used when ``use_margin`` is true
    [use_margin] use a margin (hinge) loss instead of cross entropy; this
        path indexes ``output[0]`` and so assumes a batch of one image
    """
    # Detached CUDA copy that becomes the leaf tensor being optimised.
    x_var = img.detach().clone().cuda().requires_grad_(True)

    target_idx = None
    if use_margin:
        # Resolve the class index once, up front. Rebinding ``target_lable``
        # inside the loop would break on the second step (an int cannot be
        # indexed), and the margin loss needs a plain index.
        target_idx = int(target_lable[0].item())

    # Gradients must be recorded even when the caller runs with grad mode off.
    with torch.enable_grad():
        for _ in range(allstep):
            x_var.grad = None  # drop the gradient left by the previous step
            output = model(transform(x_var, dataset=dataset))
            if use_margin:
                top2 = output.detach().cpu().topk(2)[1]
                # Strongest competing class: top-1, unless top-1 is the target.
                rival = int(top2[0][0].item())
                if rival == target_idx:
                    rival = int(top2[0][1].item())
                loss = (output[0][rival] - output[0][target_idx] + margin).clamp(min=0)
            else:
                loss = F.cross_entropy(output, target_lable)
            loss.backward()
            # The update is written through .data, so no graph is needed here.
            with torch.no_grad():
                stepped = torch.clamp(x_var - lr * x_var.grad, min=0, max=1)
                x_var.data = torch.clamp(stepped - img, min=-s_radius, max=s_radius) + img
    return x_var
def L4_function(model,
                img,
                dataset,
                allstep,
                lr,
                u_radius,
                margin=20,
                use_margin=False):
    """Return the variable used for the L4 function specified in the paper.

    Starting from a copy of ``img``, takes ``allstep`` gradient steps that
    push the prediction away from the label the model initially assigns.
    After every step the result is clamped to [0, 1] and then projected back
    into the l-infinity ball of radius ``u_radius`` around ``img``.

    [model] classifier applied to ``transform(x, dataset=dataset)``
    [img] starting image; the working copy is moved to CUDA
    [dataset] forwarded to ``transform``
    [allstep] specifies the number of attack steps
    [lr] specifies the learning rate of the attack algorithm A
    [u_radius] specifies the maximum l infinity distance between the original
        [img] and the returned image
    [margin] margin of the hinge loss, only used when ``use_margin`` is true
    [use_margin] use a margin (hinge) loss instead of negated cross entropy;
        this path indexes ``output[0]`` and so assumes a batch of one image
    """
    # Detached CUDA copy that becomes the leaf tensor being optimised.
    x_var = img.detach().clone().cuda().requires_grad_(True)

    # Only the arg-max index is needed here, so no graph has to be recorded.
    with torch.no_grad():
        clean_logits = model(transform(x_var.clone(), dataset=dataset))
    true_label = clean_logits.max(1, keepdim=True)[1][0].item()

    # Loop-invariant label tensor for the cross-entropy path.
    label_tensor = None
    if not use_margin:
        label_tensor = torch.LongTensor([true_label]).cuda()

    # Gradients must be recorded even when the caller runs with grad mode off.
    with torch.enable_grad():
        for _ in range(allstep):
            x_var.grad = None  # drop the gradient left by the previous step
            output = model(transform(x_var, dataset=dataset))
            if use_margin:
                top2 = output.detach().cpu().topk(2)[1]
                # Strongest competing class: top-1, unless top-1 is the
                # initially predicted label.
                rival = int(top2[0][0].item())
                if rival == true_label:
                    rival = int(top2[0][1].item())
                loss = (output[0][true_label] - output[0][rival] + margin).clamp(min=0)
            else:
                # Descending on the negated loss ascends the cross entropy.
                loss = -F.cross_entropy(output, label_tensor)
            loss.backward()
            # The update is written through .data, so no graph is needed here.
            with torch.no_grad():
                stepped = torch.clamp(x_var - lr * x_var.grad, min=0, max=1)
                x_var.data = torch.clamp(stepped - img, min=-u_radius, max=u_radius) + img
    return x_var
def noisy_img(img, n_radius):
    """Return ``img`` plus standard-normal noise scaled by ``n_radius``.

    The noise is drawn with the same shape, dtype and device as ``img``;
    ``img`` itself is not modified.
    """
    gaussian = torch.randn_like(img)
    perturbation = n_radius * gaussian
    return img + perturbation
def cross_entropy(pred, target):
    """Return the probability-match cross entropy.

    [pred] unnormalised scores (logits) with the class axis at dim 1
    [target] target probability distribution with the same shape as ``pred``

    Computes ``-sum(target * log_softmax(pred))`` over dim 1 and averages
    the result over the remaining elements.
    """
    # Name the class axis explicitly: the implicit-dim form is deprecated and
    # emits a warning. It matches the dim=1 used by the sum below.
    log_probs = F.log_softmax(pred, dim=1)
    return torch.mean(torch.sum(-target * log_probs, dim=1))
def target_distribution(original_softmax, target_label):
    """Return a copy of ``original_softmax`` with two entries of row 0 swapped.

    The entry at the arg-max of row 0 and the entry at ``target_label`` trade
    places. The input tensor is left untouched and only row 0 is altered.
    """
    _, top_idx = original_softmax.max(1, keepdim=True)
    src = int(top_idx[0].item())
    dst = int(target_label)

    swapped = original_softmax.clone()
    # Both reads come from the unmodified input, so no temporary is needed.
    swapped[0, src] = original_softmax[0, dst]
    swapped[0, dst] = original_softmax[0, src]
    return swapped
def PGD(model,
        img,
        dataset='imagenet',
        allstep=30,
        lr=0.03,
        radius=0.1,
        lbd=2,
        setting='white',
        noise_radius=0.1,
        targeted_lr=0.005,
        targeted_radius=0.03,
        untargeted_lr=0.1,
        untargeted_radius=0.03):
    """Return the best effort whitebox or gray box PGD attack.

    [model] classifier applied to ``transform(x, dataset=dataset)``; it is
        switched to eval mode
    [img] clean image; the working copy is moved to CUDA
    [dataset] forwarded to ``transform`` and ``random_label``
    [allstep] specifies the number of steps
    [lr] specifies the learning rate (of the Adam optimiser)
    [radius] specifies the maximum l infinity distance between the original
        [img] and the returned image
    [lbd] specifies the weight of the target-distribution loss (``loss1``)
        in the white-box objective
    [setting] can be 'white' or 'gray'
    [noise_radius] forwarded to ``noisy``
    [targeted_lr], [targeted_radius] step size and radius of the one-step
        targeted attack (``t_attack``)
    [untargeted_lr], [untargeted_radius] step size and radius of the one-step
        untargeted attack (``u_attack``)

    Raises ValueError when ``setting`` is neither 'white' nor 'gray'.

    Relies on the module-level callables ``noisy``, ``t_attack`` and
    ``u_attack``.
    """
    # Fail fast with a real exception: raising a bare string is itself a
    # TypeError in Python 3.
    if setting not in ('white', 'gray'):
        raise ValueError("attack setting is not supported")

    model.eval()
    x_var = img.detach().clone().cuda().requires_grad_(True)

    # One graph-free forward pass gives both the predicted label and the
    # clean softmax.
    with torch.no_grad():
        clean_logits = model(transform(x_var.clone(), dataset=dataset))
    true_label = clean_logits.max(1, keepdim=True)[1][0].item()
    original_softmax = F.softmax(clean_logits, dim=1)

    optimizer = optim.Adam([x_var], lr=lr)
    target_label = random_label(true_label, dataset=dataset)
    target_l = torch.LongTensor([target_label]).cuda()
    target_dist = target_distribution(original_softmax, target_label)

    for _ in range(allstep):
        optimizer.zero_grad()
        output_ori = model(transform(x_var, dataset=dataset))
        # loss of original image, should descend
        loss1 = cross_entropy(output_ori, target_dist)
        if setting == 'white':
            total_loss = lbd * loss1

            # l1(noisy_img - origin_img), should descend
            noise_var = noisy(x_var, noise_radius)
            output_noise = model(transform(noise_var, dataset=dataset))
            loss2 = torch.norm(F.softmax(output_noise, dim=1) - F.softmax(output_ori, dim=1), 1)
            total_loss = total_loss + loss2

            # 1 step of targeted attack image, should be new_target, descend
            new_target = torch.LongTensor([random_label(target_label, dataset=dataset)]).cuda()
            t_attack_var = t_attack(model, x_var, new_target, dataset, 1, targeted_lr, targeted_radius)
            output_t_attack = model(transform(t_attack_var, dataset=dataset))
            loss3 = F.cross_entropy(output_t_attack, new_target)
            total_loss = total_loss + loss3

            # 1 step of untargeted attack, should be away from target_l, ascend.
            # If you want to do white box attack for inception, then you will
            # need to pass untargeted_lr=3 instead of 0.1.
            u_attack_var = u_attack(model, x_var, dataset, 1, untargeted_lr, untargeted_radius)
            output_u_attack = model(transform(u_attack_var, dataset=dataset))
            loss4 = F.cross_entropy(output_u_attack, target_l)
            total_loss = total_loss - loss4
        else:  # setting == 'gray'
            total_loss = loss1

        total_loss.backward()
        optimizer.step()
        # Clamp to the valid pixel range, then project back into the
        # l-infinity ball around the clean image.
        x_var.data = torch.clamp(torch.clamp(x_var, min=0, max=1) - img, min=-radius, max=radius) + img
    return x_var
def CW(model,
       img,
       dataset='imagenet',
       allstep=30,
       lr=0.03,
       radius=0.1,
       margin=20.0,
       lbd=2,
       setting='white',
       noise_radius=0.1,
       targeted_lr=0.005,
       targeted_radius=0.03,
       untargeted_lr=0.1,
       untargeted_radius=0.03):
    """Return the best effort whitebox or gray box CW attack.

    [model] classifier applied to ``transform(x, dataset=dataset)``; it is
        switched to eval mode
    [img] clean image; the working copy is moved to CUDA
    [dataset] forwarded to ``transform`` and ``random_label``
    [allstep] specifies the number of steps
    [lr] specifies the learning rate (of the Adam optimiser)
    [radius] specifies the maximum l infinity distance between the original
        [img] and the returned image
    [margin] margin used by every hinge loss in this function
    [lbd] specifies the weight of the margin loss on the attacked image
        (``loss1``) in the white-box objective
    [setting] can be 'white' or 'gray'
    [noise_radius] forwarded to ``noisy``
    [targeted_lr], [targeted_radius] step size and radius of the one-step
        targeted attack (``t_attack``)
    [untargeted_lr], [untargeted_radius] step size and radius of the one-step
        untargeted attack (``u_attack``)

    Raises ValueError when ``setting`` is neither 'white' nor 'gray'.

    All margin losses index row 0 of the logits, so a batch of one image is
    assumed. Relies on the module-level callables ``noisy``, ``t_attack``
    and ``u_attack``.
    """
    # Fail fast with a real exception: raising a bare string is itself a
    # TypeError in Python 3.
    if setting not in ('white', 'gray'):
        raise ValueError("attack setting is not supported")

    model.eval()
    x_var = img.detach().clone().cuda().requires_grad_(True)

    # Only the arg-max index is needed here, so no graph has to be recorded.
    with torch.no_grad():
        clean_logits = model(transform(x_var.clone(), dataset=dataset))
    true_label = clean_logits.max(1, keepdim=True)[1][0].item()

    optimizer = optim.Adam([x_var], lr=lr)
    target_label = random_label(true_label, dataset=dataset)

    for _ in range(allstep):
        optimizer.zero_grad()
        output_ori = model(transform(x_var, dataset=dataset))

        # Strongest competing class: top-1, unless top-1 is the target.
        _, top2_1 = output_ori.data.cpu().topk(2)
        argmax11 = top2_1[0][0].item()
        if argmax11 == target_label:
            argmax11 = top2_1[0][1].item()
        loss1 = (output_ori[0][argmax11] - output_ori[0][target_label] + margin).clamp(min=0)

        if setting == 'white':
            # loss of original image, should descend
            total_loss = lbd * loss1

            # l1(noisy_img - origin_img), should descend
            noise_var = noisy(x_var, noise_radius)
            output_noise = model(transform(noise_var, dataset=dataset))
            loss2 = torch.norm(F.softmax(output_noise, dim=1) - F.softmax(output_ori, dim=1), 1)
            total_loss = total_loss + loss2

            # 1 step of targeted attack image, should be new_target, descend
            new_tl = random_label(target_label, dataset=dataset)
            new_target = torch.LongTensor([new_tl]).cuda()
            t_attack_var = t_attack(model, x_var, new_target, dataset, 1, targeted_lr, targeted_radius)
            output_t_attack = model(transform(t_attack_var, dataset=dataset))
            _, top2_3 = output_t_attack.data.cpu().topk(2)
            argmax13 = top2_3[0][0].item()
            if argmax13 == new_tl:
                argmax13 = top2_3[0][1].item()
            loss3 = (output_t_attack[0][argmax13] - output_t_attack[0][new_tl] + margin).clamp(min=0)
            total_loss = total_loss + loss3  # loss of sink image, should descend

            # 1 step of untargeted attack, should be away from target_l, ascend.
            # If you want to do white box attack for inception, then you will
            # need to pass untargeted_lr=3 instead of 0.1.
            u_attack_var = u_attack(model, x_var, dataset, 1, untargeted_lr, untargeted_radius)
            output_u_attack = model(transform(u_attack_var, dataset=dataset))
            _, top2_4 = output_u_attack.data.cpu().topk(2)
            # NOTE(review): unlike loss1/loss3 this starts from the runner-up
            # class and falls back to top-1 only when the runner-up is the
            # target. Kept as found; confirm this asymmetry is intended.
            argmax14 = top2_4[0][1].item()
            if argmax14 == target_label:
                argmax14 = top2_4[0][0].item()
            loss4 = (output_u_attack[0][argmax14] - output_u_attack[0][target_label] + margin).clamp(min=0)
            total_loss = total_loss - loss4
        else:  # setting == 'gray'
            total_loss = loss1

        total_loss.backward()
        optimizer.step()
        # Clamp to the valid pixel range, then project back into the
        # l-infinity ball around the clean image.
        x_var.data = torch.clamp(torch.clamp(x_var, min=0, max=1) - img, min=-radius, max=radius) + img
    return x_var
# ---------------------------------------------------------------------------
# Script body: parse the command line, build the model and dataset, then save
# every correctly classified image in the requested index range together with
# its PGD and CW adversarial versions.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='PyTorch White Box Adversary Generation')
parser.add_argument('--real_dir', type=str, required=True, help='directory to store images correctly classified')
parser.add_argument('--adv_dir', type=str, required=True, help='directory to store adversarial images')
# ``required=True`` made the declared default unreachable; the flag is now
# optional so that the default actually applies.
parser.add_argument('--name', type=str, default='_demo_', help='the name of the adversarial example')
parser.add_argument('--dataset', type=str, default='imagenet', help='dataset, imagenet or cifar')
parser.add_argument('--setting', type=str, default='white', help='attack, white or gray')
parser.add_argument('--allstep', type=int, default=50, help='number of steps to run an iterative attack')
parser.add_argument('--base', type=str, default="resnet", help='model, vgg for cifar and resnet/inception for imagenet')
parser.add_argument('--lowbd', type=int, default=0, help='index of the first adversarial example to load')
parser.add_argument('--upbd', type=int, default=1000, help='index of the last adversarial example to load')
parser.add_argument('--radius', type=float, default=0.1, help='adversarial radius')
args = parser.parse_args()

# Module-level aliases used inside PGD() and CW().
t_attack = L3Attack.apply
u_attack = L4Attack.apply
noisy = Noisy.apply

real_d = os.path.join(args.real_dir, args.base)
adv_d = os.path.join(args.adv_dir, args.base)

# Reject unknown datasets before touching the file system.
if args.dataset not in ('imagenet', 'cifar'):
    raise Exception('Not supported dataset.')

for out_dir in (adv_d, os.path.join(adv_d, 'pgd'), os.path.join(adv_d, 'cw'), real_d):
    os.makedirs(out_dir, exist_ok=True)

if args.dataset == 'imagenet':
    data_dir = './imagenetdata/'
    noise_radius = 0.1  # specifies the noise radius in l1 norm detection, used in C1 to measure robustness.
    targeted_lr = 0.005  # specifies the learning rate for targeted attack detection criterion, used in C2t
    targeted_radius = 0.03  # specifies the radius of targeted attack detection criterion, used in C2t
    untargeted_radius = 0.03  # specifies the radius of untargeted attack detection criterion, used in C2u
    #### use ImageFolder to load images, need to map label correct with target_transform
    testset = torchvision.datasets.ImageFolder(
        root=data_dir,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.Resize(256),
            torchvision.transforms.CenterCrop(224),
            torchvision.transforms.ToTensor(),
        ]),
    )
    if args.base == 'resnet':
        model = models.resnet101(pretrained=True)
        untargeted_lr = 0.1  # specifies the learning rate for untargeted attack detection criterion, used in C2u
    elif args.base == 'inception':
        model = models.inception_v3(pretrained=True, transform_input=False)
        untargeted_lr = 3  # specifies the learning rate for untargeted attack detection criterion, used in C2u
    else:
        raise Exception('No such model predefined.')
    model = torch.nn.DataParallel(model).cuda()
else:  # args.dataset == 'cifar'
    data_dir = './cifardata/'
    noise_radius = 0.01  # specifies the noise radius in l1 norm detection, used in C1 to measure robustness.
    targeted_lr = 0.0005  # specifies the learning rate for targeted attack detection criterion, used in C2t
    targeted_radius = 0.5  # specifies the radius of targeted attack detection criterion, used in C2t
    untargeted_radius = 0.5  # specifies the radius of untargeted attack detection criterion, used in C2u
    untargeted_lr = 1  # specifies the learning rate for untargeted attack detection criterion, used in C2u
    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True,
        transform=torchvision.transforms.Compose([torchvision.transforms.ToTensor()]))
    if args.base == "vgg":
        model = vgg19()
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
        checkpoint = torch.load('./vgg19model/model_best.pth.tar')  # save directory for vgg19 model
    else:
        raise Exception('No such model predefined.')
    model.load_state_dict(checkpoint['state_dict'])

model.eval()
title = args.name + str(args.allstep)
numcout = 0  # number of correctly classified images saved so far

# Never index past the end of the dataset when --upbd exceeds its size.
last_index = min(args.upbd, len(testset))
for i in range(args.lowbd, last_index):
    view_data, view_data_label = testset[i]
    view_data = view_data.unsqueeze(0).cuda()
    view_data_label = view_data_label * torch.ones(1).cuda().long()
    model.eval()
    # Only the predicted index is needed, so skip graph construction.
    with torch.no_grad():
        logits = model(transform(view_data.clone(), dataset=args.dataset))
    predicted_label = logits.max(1, keepdim=True)[1][0]
    if predicted_label != view_data_label:
        continue  # note that only load images that were classified correctly

    torch.save(view_data, os.path.join(real_d, str(numcout) + '_img.pt'))
    torch.save(view_data_label, os.path.join(real_d, str(numcout) + '_label.pt'))

    torch.save(PGD(model,
                   view_data,
                   dataset=args.dataset,
                   allstep=args.allstep,
                   radius=args.radius,
                   setting=args.setting,
                   noise_radius=noise_radius,
                   targeted_lr=targeted_lr,
                   targeted_radius=targeted_radius,
                   untargeted_lr=untargeted_lr,
                   untargeted_radius=untargeted_radius),
               os.path.join(os.path.join(adv_d, 'pgd'), str(numcout) + title + '.pt'))
    torch.save(CW(model,
                  view_data,
                  dataset=args.dataset,
                  allstep=args.allstep,
                  radius=args.radius,
                  setting=args.setting,
                  noise_radius=noise_radius,
                  targeted_lr=targeted_lr,
                  targeted_radius=targeted_radius,
                  untargeted_lr=untargeted_lr,
                  untargeted_radius=untargeted_radius),
               os.path.join(os.path.join(adv_d, 'cw'), str(numcout) + title + '.pt'))
    numcout += 1
print('Finish generating white box adversaries')