-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
222 lines (195 loc) · 8.43 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import argparse
import time
from itertools import count
from pathlib import Path
import numpy as np
import torch
import torch.nn.functional as F
import yaml
from ignite.metrics import FID
from torch.utils.tensorboard import SummaryWriter
from gms import common
# TRAINING SCRIPT
# cmd line args with their default values. models can add additional args of their own
# DG = (D)efault G, where G is a config object. Single letters C, H, F were already taken, so G. lol
# TODO: make this support default None values, would require rework. maybe like dataclass
DG = common.AttrDict()
DG.model = 'vae'
DG.bs = 64
DG.hidden_size = 256
DG.device = 'cuda'
DG.epochs = 50
DG.save_n = 5
DG.logdir = Path('./logs/')
DG.lr = 3e-4
DG.class_cond = 0
DG.binarize = 1
DG.pad32 = 0
DG.mode = 'train'
# path to a model that you want to load and keep training
DG.weights_from = Path('.')
# for computing a latent space for eval metrics
DG.autoencoder = Path('./weights/autoencoder.pt')
# for conditional generation eval metrics
DG.classifier = Path('./weights/classifier.pt')
DG.eval_heavy = 0
DG.skip_training = 0
def load_model_and_data():
"""
Parse command line arguments, load model and data.
"""
# initial parse of the args to allow user to specify model, weights_from, etc
parser = argparse.ArgumentParser()
for key, value in DG.items():
parser.add_argument(f'--{key}', type=common.args_type(value), default=value)
tempG, _ = parser.parse_known_args()
# grab the extra parameters and default argumments from the model or weights_from hps.yaml
defaults = {}
if tempG.weights_from != Path('.'):
# if we have a weights_from, load the config from there.
loaded_hp_file = tempG.weights_from.parent / 'hps.yaml'
with open(loaded_hp_file) as f:
loadedG = common.AttrDict(yaml.load(f, Loader=yaml.Loader))
for key, value in loadedG.items():
defaults[key] = value
if key not in tempG:
parser.add_argument(f'--{key}', type=type(value), default=value)
Model = common.discover_models()[loadedG.model]
else:
Model = common.discover_models()[tempG.model]
for key, value in Model.DG.items():
defaults[key] = value
if key not in tempG:
parser.add_argument(f'--{key}', type=type(value), default=value)
defaults['logdir'] = tempG.logdir / tempG.model
if 'full_cmd' in defaults:
defaults.pop('full_cmd')
# do the final parse of cmd line args for what user passes in and instantiate everything
parser.set_defaults(**defaults)
G = common.AttrDict(parser.parse_args().__dict__)
model = Model(G=G).to(G.device)
if G.weights_from != Path('.'):
model.load_state_dict(
torch.load(G.weights_from, map_location=G.device), strict=False
)
train_ds, test_ds = common.load_mnist(G.bs, binarize=G.binarize, pad32=G.pad32)
print('num_vars', common.count_vars(model))
autoencoder = torch.jit.load(G.autoencoder).to(G.device) if G.eval_heavy else None
classifier = (
torch.jit.load(G.classifier).to(G.device)
if G.eval_heavy and G.class_cond
else None
)
return model, train_ds, test_ds, autoencoder, classifier, G
@torch.inference_mode()
def eval_heavy(logger, model, test_ds, autoencoder, classifier, G):
"""
More computationally intensive eval that draws many samples from the generative model and computes
FID and precision/recall metrics.
"""
TOTAL_SAMPLES = 500 # beyond this, we started getting diminishing returns
# sample the model and get latent vectors for samples as well as test set examples
# so that we can then compare the two distributions (sampled vs. test).
sample_ct = 0
all_z_sample = []
all_z_real = []
fid_buffer = FID(num_features=64, feature_extractor=autoencoder, device=G.device)
metrics = {}
# aggregate latent vectors
if G.class_cond:
metrics['classifier_loss'] = []
all_z_cond_sample = []
for test_batch in test_ds:
test_x, test_y = test_batch[0].to(G.device), test_batch[1].to(G.device)
bs = test_x.shape[0]
if G.class_cond:
# generate class conditioned sample and evaluate it with the classifier
cond_samp = model.sample(bs, y=test_y)
preds = classifier(cond_samp)
metrics['classifier_loss'].append(F.cross_entropy(preds, test_y).item())
all_z_cond_sample.append(autoencoder(cond_samp))
samp = model.sample(bs, y=-torch.ones_like(test_y))
fid_buffer.update((samp, test_x))
all_z_real.append(autoencoder(test_x))
all_z_sample.append(autoencoder(samp))
sample_ct += bs
if sample_ct >= TOTAL_SAMPLES:
break
# evaluate metrics
metrics['ignite_fid'] = fid_buffer.compute()
z_samp = torch.cat(all_z_sample)
z_real = torch.cat(all_z_real)
metrics['fid'] = common.compute_fid(z_samp.cpu().numpy(), z_real.cpu().numpy())
metrics.update(common.precision_recall_f1(real=z_real, gen=z_samp))
if G.class_cond:
z_cond_samp = torch.cat(all_z_cond_sample)
cond_metrics = common.precision_recall_f1(real=z_real, gen=z_cond_samp)
cond_metrics['fid'] = common.compute_fid(
z_cond_samp.cpu().numpy(), z_real.cpu().numpy()
)
metrics.update(common.prefix_dict('cond_', cond_metrics))
# metrics.update(common.prefix_dict('class_conditioned.', cond_metrics))
for key, val in metrics.items():
logger[f'eval/{key}'] += [np.mean(common.to_numpy(val))]
def train(model, train_ds, test_ds, autoencoder, classifier, G):
writer = SummaryWriter(G.logdir)
logger = common.dump_logger({}, writer, 0, G)
# TRAINING AND EVAL LOOP
for epoch in count(0):
# TEST
# (eval first so we can see random/weights_from/teacher init behavior)
model.eval()
with torch.no_grad():
# if we define an explicit loss function, use it to test how we do on the test set.
if hasattr(model, 'loss'):
for test_batch in test_ds:
test_x, test_y = test_batch[0].to(G.device), test_batch[1].to(
G.device
)
_, test_metrics = model.loss(test_x, test_y)
for key in test_metrics:
prefix_key = (
f'{G.model}/test/{key}'
if not key == 'nlogp'
else f'eval/{key}'
)
logger[prefix_key] += [test_metrics[key].detach().cpu().item()]
else:
test_batch = next(iter(test_ds))
test_x, test_y = test_batch[0].to(G.device), test_batch[1].to(G.device)
# run the model specific evaluate function. usually draws samples and creates other relevant visualizations.
eval_time = time.time()
model.evaluate(writer, test_x, test_y, epoch)
logger['dt/eval'] = time.time() - eval_time
# LOGGING
logger['num_vars'] = common.count_vars(model)
if epoch % G.save_n == 0:
model.save(G.logdir, test_x, test_y)
print("SAVED MODEL", G.logdir)
if G.eval_heavy:
print("RUNNING HEAVY EVAL...")
eval_heavy_time = time.time()
eval_heavy(logger, model, test_ds, autoencoder, classifier, G)
logger['dt/eval_heavy'] = time.time() - eval_heavy_time
print("DONE HEAVY EVAL")
logger = common.dump_logger(logger, writer, epoch, G)
if epoch >= G.epochs:
break
# TRAIN
model.train()
train_time = time.time()
for batch in train_ds:
if G.skip_training:
break
train_x, train_y = batch[0].to(G.device), batch[1].to(G.device)
# TODO: see if we can just use loss and write the gan such that it works.
metrics = model.train_step(train_x, train_y)
for key in metrics:
prefix_key = (
f'{G.model}/train/{key}' if not key == 'nlogp' else f'train/{key}'
)
logger[prefix_key] += [metrics[key].detach().cpu()]
logger['dt/train'] = time.time() - train_time
if __name__ == '__main__':
model, train_ds, test_ds, autoencoder, classifier, G = load_model_and_data()
train(model, train_ds, test_ds, autoencoder, classifier, G)