-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_reader.py
303 lines (270 loc) · 14.5 KB
/
data_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import numpy as np
import pandas as pd
import torch

from constants import *
class DataReader:
"""
The class to read data set from the given file
"""
def __init__(self, data_set=DEFAULT_SET, label_column=LABEL_COL, batch_size=BATCH_SIZE,
             distribution=DEFAULT_DISTRIBUTION, reserved=0):
    """
    Load the selected data set onto DEVICE and pre-compute shuffled batch indices.

    :param data_set: which data set to load (PURCHASE100, CIFAR_10, LOCATION30
        or TEXAS100); any other value leaves self.data/self.labels unset and
        the code below will fail — TODO confirm intended behaviour
    :param label_column: column index holding the labels (csv loaders only)
    :param batch_size: number of samples per full batch
    :param distribution: batch sampling scheme; only None (uniform random
        permutation) is handled here
    :param reserved: number of reserved "cover" samples; NOTE(review): this
        argument is unconditionally overwritten by RESERVED_SAMPLE below,
        so passing it has no effect
    """
    # --- load the raw samples and labels for the requested data set ---
    if data_set == PURCHASE100:
        path = PURCHASE100_PATH
        data_frame = pd.read_csv(path, header=None)
        # extract the label column; labels are 1-based in the csv, shift to 0-based
        self.labels = torch.tensor(data_frame[label_column].to_numpy(), dtype=torch.int64).to(DEVICE)
        self.labels -= 1
        data_frame.drop(label_column, inplace=True, axis=1)
        # remaining columns are the features
        self.data = torch.tensor(data_frame.to_numpy(), dtype=torch.float).to(DEVICE)
    elif data_set == CIFAR_10:
        # four csv shards train0..train3; last column is the label
        samples = np.vstack(
            [np.genfromtxt(CIFAR_10_PATH+"train{}.csv".format(x), delimiter=',') for x in range(4)]
        )
        self.data = torch.tensor(samples[:, :-1], dtype=torch.float).to(DEVICE)
        self.labels = torch.tensor(samples[:, -1], dtype=torch.int64).to(DEVICE)
    elif data_set == LOCATION30:
        path = LOCATION30_PATH
        data_frame = pd.read_csv(path, header=None)
        # labels are 1-based in the csv, shift to 0-based
        self.labels = torch.tensor(data_frame[label_column].to_numpy(), dtype=torch.int64).to(DEVICE)
        self.labels -= 1
        data_frame.drop(label_column, inplace=True, axis=1)
        self.data = torch.tensor(data_frame.to_numpy(), dtype=torch.float).to(DEVICE)
    elif data_set == TEXAS100:
        # npz archive with one-hot 'labels' and raw 'features' arrays
        path = TEXAS100_PATH
        self.data = np.load(path)
        self.labels = self.data['labels']
        self.data = self.data['features']
        # collapse the one-hot labels to class indices
        self.labels = np.argmax(self.labels, axis=1)
        self.labels = torch.tensor(self.labels, dtype=torch.int64).to(DEVICE)
        self.data = torch.tensor(self.data, dtype=torch.float).to(DEVICE)
    # redundant for the branches above (already on DEVICE) but kept as a
    # final guarantee that both tensors live on DEVICE
    self.data = self.data.to(DEVICE)
    self.labels = self.labels.to(DEVICE)
    # if the constants module does not define RESERVED_SAMPLE, fall back to 0
    # (note: this overwrites the `reserved` argument in every case)
    try:
        reserved = RESERVED_SAMPLE
    except NameError:
        reserved = 0
    # if the constants module does not define FL_TRUST_SET, fall back to 0
    try:
        fl_trust_samples = FL_TRUST_SET
    except NameError:
        fl_trust_samples = 0
    # initialize the training and testing batch-index containers
    self.train_set = None
    self.test_set = None
    self.train_set_last_batch = None
    self.test_set_last_batch =None
    overall_size = self.labels.size(0)
    if distribution is None:
        # reserve the last 16 shuffled samples as the cover/reserve set
        # NOTE(review): 16 is hard-coded and unrelated to `reserved` — confirm
        overall_size -= 16
        rand_perm = torch.randperm(self.labels.size(0)).to(DEVICE)
        self.reserve_set = rand_perm[overall_size:]
        print("cover dataset size is {}".format(reserved))
        # carve out the FLTrust root set from the tail of the permutation
        # NOTE(review): the [overall_size+reserved:] slice appears to overlap
        # the reserve set taken above — confirm this is intended
        overall_size -= fl_trust_samples
        self.fl_trust = rand_perm[overall_size+reserved:]
        print("FL TRUST dataset size is {}".format(fl_trust_samples))
        # drop the trailing remainder so every batch is exactly batch_size;
        # the remainder becomes the "last batch" pool
        all_size = overall_size
        overall_size -= overall_size % batch_size
        rand_perm_last = rand_perm[overall_size:all_size]
        rand_perm = rand_perm[:overall_size]
        print("cover dataset size is {}".format(reserved))
        # NOTE(review): the last-batch pool is set to the reserve set, not to
        # rand_perm_last computed just above — confirm which was intended
        self.last_batch_indices = self.reserve_set
        self.batch_indices = rand_perm.reshape((-1, batch_size)).to(DEVICE)
        self.train_test_split()
        # NOTE(review): prints DEFAULT_SET rather than the data_set argument
        print("Data set "+DEFAULT_SET+
              " has been loaded, overall {} records, batch size = {}, testing batches: {}, training batches: {}"
              .format(overall_size, batch_size, self.test_set.size(0), self.train_set.size(0)))
def train_test_split(self, ratio=TRAIN_TEST_RATIO, batch_training=BATCH_TRAINING):
    """
    Partition the data into training and testing portions by the given ratio.

    :param ratio: tuple (train_weight, test_weight) controlling the split
    :param batch_training: True splits the pre-computed whole batches,
        False splits a fresh random permutation of individual sample indices
    :return: None
    """
    train_fraction = ratio[0] / sum(ratio)
    if not batch_training:
        # sample-level split over a new shuffle of all indices
        sample_total = self.data.size(0)
        cut = round(sample_total * train_fraction)
        shuffled = torch.randperm(sample_total).to(DEVICE)
        self.train_set = shuffled[:cut].to(DEVICE)
        self.test_set = shuffled[cut:].to(DEVICE)
        return
    # batch-level split: first `cut` full batches train, the rest test
    cut = round(self.batch_indices.size(0) * train_fraction)
    # the first 8 leftover indices go to the training side, the rest to test
    leftover_cut = 8
    self.train_set = self.batch_indices[:cut].to(DEVICE)
    self.test_set = self.batch_indices[cut:].to(DEVICE)
    self.train_set_last_batch = self.last_batch_indices[:leftover_cut].to(DEVICE)
    self.test_set_last_batch = self.last_batch_indices[leftover_cut:].to(DEVICE)
def get_train_set(self, participant_index=0):
    """
    Return the training batches allocated to one participant.

    :param participant_index: index of the participant, must be smaller
        than NUMBER_OF_PARTICIPANTS
    :return: tensor[batches_per_participant, BATCH_SIZE] of training indices
    """
    share = self.train_set.size(0) // NUMBER_OF_PARTICIPANTS
    start = share * participant_index
    return self.train_set[start: start + share]
def get_test_set(self, participant_index=0):
    """
    Return the test batches allocated to one participant.

    :param participant_index: index of the participant, must be smaller
        than NUMBER_OF_PARTICIPANTS
    :return: tensor[batches_per_participant, BATCH_SIZE] of test indices
    """
    share = self.test_set.size(0) // NUMBER_OF_PARTICIPANTS
    start = share * participant_index
    return self.test_set[start: start + share]
def get_last_train_batch(self):
    """
    Return the leftover (non-full) training batch indices.

    :return: tensor of training-sample indices (indices only — not a
        (data, labels) tuple; use get_batch to materialize the samples)
    """
    return self.train_set_last_batch
def get_last_test_batch(self):
    """
    Flatten and return the leftover (non-full) test batch indices.

    Note: the flattened tensor is also stored back on
    self.test_set_last_batch (side effect preserved from the original).

    :return: 1-D tensor of test-sample indices
    """
    flattened = self.test_set_last_batch.reshape((-1))
    self.test_set_last_batch = flattened
    return flattened
def get_batch(self, batch_indices):
    """
    Materialize the data rows and labels for the given sample indices.

    :param batch_indices: tensor (or scalar tensor) of sample indices
    :return: tuple (data, labels) selected by batch_indices
    """
    samples = self.data[batch_indices]
    targets = self.labels[batch_indices]
    return samples, targets
def get_honest_node_member(self, participant_index=0):
    """
    Collect the member (training) sample indices belonging to one participant,
    in the order they appear in the flattened global training set.

    :param participant_index: index of the participant, must be smaller
        than NUMBER_OF_PARTICIPANTS
    :return: 1-D tensor of the participant's training sample indices
    """
    train_flatten = self.train_set.flatten().to(DEVICE)
    # Hoist the participant's slice out of the loop and use a Python set for
    # O(1) membership tests: the original re-sliced self.train_set and ran a
    # tensor `in` scan for every element (O(n^2)), and also fetched an unused
    # (data, label) pair via get_batch for every hit.
    owned = set(self.get_train_set(participant_index).flatten().tolist())
    member_list = [sample for sample in train_flatten if int(sample) in owned]
    member_total = torch.tensor(member_list).to(DEVICE)
    return member_total
def get_honest_node_nonmember(self, participant_index=0):
    """
    Build the combined pool of training-set and reserve-set sample indices.

    :param participant_index: index of a particular participant (currently
        unused, kept for interface compatibility with get_honest_node_member)
    :return: 1-D tensor containing all training indices followed by the
        reserve-set indices
    """
    # Original code: self.train_set = torch.concat(self.train_set,self.reserve_set)
    # That call is invalid — torch.concat takes a *sequence* of tensors and its
    # second positional argument is `dim` — and it would also have clobbered the
    # 2-D batch table with a 1-D tensor, breaking get_train_set/get_test_set.
    # Concatenate the flattened pools and return them without mutating state.
    combined = torch.cat((self.train_set.flatten(), self.reserve_set.flatten())).to(DEVICE)
    return combined
def get_black_box_batch(self, member_rate=BLACK_BOX_MEMBER_RATE, attack_batch_size=NUMBER_OF_ATTACK_SAMPLES):
    """
    Generate a shuffled mixed batch of member/non-member indices for
    black-box attack training.

    :param member_rate: fraction of the batch drawn from training (member) samples
    :param attack_batch_size: total number of indices allocated to the attacker
    :return: tuple (result, member_indices, non_member_indices) — the shuffled
        combined indices, the member indices, and the non-member indices
    """
    member_count = round(attack_batch_size * member_rate)
    non_member_count = attack_batch_size - member_count
    train_flatten = self.train_set.flatten().to(DEVICE)
    # NOTE(review): the first two test indices are excluded from the
    # non-member pool — presumably reserved for the fixed-member variants;
    # confirm before changing.
    non_member_pool = self.test_set.flatten().to(DEVICE)[2:]
    member_indices = train_flatten[torch.randperm(len(train_flatten))[:member_count]].to(DEVICE)
    # Bug fix: the permutation must be drawn over the *sliced* pool length.
    # The original permuted len(test_flatten) indices and applied them to the
    # [2:] slice, so indices >= len(pool) could be drawn -> IndexError.
    non_member_indices = non_member_pool[torch.randperm(len(non_member_pool))[:non_member_count]].to(DEVICE)
    result = torch.cat([member_indices, non_member_indices]).to(DEVICE)
    result = result[torch.randperm(len(result))].to(DEVICE)
    return result, member_indices, non_member_indices
def get_black_box_batch_fixed(self, member_rate=BLACK_BOX_MEMBER_RATE, attack_batch_size=NUMBER_OF_ATTACK_SAMPLES):
    """
    Generate a black-box attack batch with exactly 2 member samples of the
    same class (hard-coded label 2, participant 0) plus non-member samples
    drawn only from classes different from the members' class.

    :param member_rate: unused here — the member count is fixed at 2
    :param attack_batch_size: total number of indices allocated to the attacker
    :return: tuple (result, member_indices, non_member_indices)
    """
    # exactly two member samples, the rest non-members
    participant_member_count = 2
    member_list = []
    non_member_count = attack_batch_size - participant_member_count
    # NOTE(review): computed but never used below — confirm it can be removed
    participant_nonmember_count = non_member_count // NUMBER_OF_PARTICIPANTS
    train_flatten = self.train_set.flatten().to(DEVICE)
    test_flatten = self.test_set.flatten().to(DEVICE)
    # scan participant 0's training samples for two samples with label == 2;
    # stop as soon as two have been collected
    for j in range(len(train_flatten)):
        if train_flatten[j] in self.get_train_set(0):
            if len(member_list) < participant_member_count:
                member_eachx, member_eachy = self.get_batch(train_flatten[j])
                # hard-coded target class — TODO confirm 2 is intentional
                if int(member_eachy) == 2:
                    member_list.append(train_flatten[j])
            else:
                break
    member_indices = torch.tensor(member_list).to(DEVICE)
    # record the member labels so non-members can be filtered by class
    member_class_list = []
    member_x,member_y = self.get_batch(member_indices)
    for i in member_y:
        member_class_list.append(i)
    # NOTE(review): despite its name, same_class_list collects test samples
    # whose label DIFFERS from every member label
    same_class_list = []
    # `diff` is assigned but never used
    diff = 0
    for index,i in enumerate(test_flatten):
        test_x, test_y = self.get_batch(i)
        if test_y not in member_class_list:
            same_class_list.append(i)
    diff_class_test_flatten = torch.tensor(same_class_list)
    # sample the non-member indices uniformly from the different-class pool
    non_member_indices = diff_class_test_flatten[torch.randperm((len(diff_class_test_flatten)))[:non_member_count]].to(DEVICE)
    # fetched for inspection/debugging only; return values below use indices
    non_member_x,nonmember_y = self.get_batch(non_member_indices)
    result = torch.cat([member_indices, non_member_indices]).to(DEVICE)
    return result, member_indices, non_member_indices
def get_black_box_batch_fixed_balance_class(self, member_rate=BLACK_BOX_MEMBER_RATE, attack_batch_size=NUMBER_OF_ATTACK_SAMPLES):
    """
    Generate a black-box attack batch with exactly 2 member samples plus
    non-member samples drawn uniformly from the whole test set.

    NOTE(review): the summary above says "2 member samples in different
    class", but the code filters members by the same hard-coded label == 2
    as get_black_box_batch_fixed — confirm which behaviour is intended.

    :param member_rate: unused here — the member count is fixed at 2
    :param attack_batch_size: total number of indices allocated to the attacker
    :return: tuple (result, member_indices, non_member_indices)
    """
    # exactly two member samples, the rest non-members
    participant_member_count = 2
    member_list = []
    non_member_count = attack_batch_size - participant_member_count
    # NOTE(review): computed but never used below — confirm it can be removed
    participant_nonmember_count = non_member_count // NUMBER_OF_PARTICIPANTS
    train_flatten = self.train_set.flatten().to(DEVICE)
    test_flatten = self.test_set.flatten().to(DEVICE)
    # scan participant 0's training samples for two samples with label == 2;
    # stop as soon as two have been collected
    for j in range(len(train_flatten)):
        if train_flatten[j] in self.get_train_set(0):
            if len(member_list) < participant_member_count:
                member_eachx, member_eachy = self.get_batch(train_flatten[j])
                # hard-coded target class — TODO confirm 2 is intentional
                if int(member_eachy) == 2:
                    member_list.append(train_flatten[j])
            else:
                break
    member_indices = torch.tensor(member_list).to(DEVICE)
    # member labels collected but not used for filtering in this variant
    member_class_list = []
    member_x,member_y = self.get_batch(member_indices)
    for i in member_y:
        member_class_list.append(i)
    # non-members are sampled uniformly from the full test pool (no class filter)
    non_member_indices = test_flatten[torch.randperm((len(test_flatten)))[:non_member_count]].to(DEVICE)
    # fetched for inspection/debugging only; return values below use indices
    non_member_x,nonmember_y = self.get_batch(non_member_indices)
    result = torch.cat([member_indices, non_member_indices]).to(DEVICE)
    return result, member_indices, non_member_indices
def del_samples(self, index, train_set, last_batch):
    """
    Remove one sample index from a batched training set and re-batch the rest.

    :param index: the sample index (int) to delete; raises ValueError if it
        is not present (unchanged from the original behaviour)
    :param train_set: tensor[num_batches, BATCH_SIZE] of sample indices
    :param last_batch: 1-D tensor holding the leftover (non-full) batch
        indices, or None when there is no leftover batch
    :return: tuple (train_set, last_batch) rebuilt without `index`
    """
    flatten_set = train_set.flatten()
    # Bug fix: the original tested `last_batch != None`, which asks PyTorch
    # for an elementwise comparison instead of checking for a missing tensor;
    # identity comparison is the correct None test.
    if last_batch is not None:
        flatten_set = torch.cat((flatten_set, last_batch.flatten()), 0)
    remaining = flatten_set.cpu().numpy().tolist()
    remaining.remove(index)
    total = len(remaining)
    # keep only whole batches in the 2-D table; the remainder forms the new last batch
    full_size = total - total % BATCH_SIZE
    full_indices = torch.tensor(remaining[:full_size]).to(DEVICE)
    train_set = full_indices.reshape((-1, BATCH_SIZE)).to(DEVICE)
    last_batch = torch.tensor(remaining[full_size:total]).to(DEVICE)
    return train_set, last_batch