-
Notifications
You must be signed in to change notification settings - Fork 1
/
policy_nn.py
97 lines (77 loc) · 3.36 KB
/
policy_nn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
"""
Optimizing: Policy Neural-Network's parameters (layers' weights and biases), for autonomous agent control.
Policy Neural Network I/O: environment state --> probability distribution over actions.
Any AI Gym environment can be chosen, as long as the relevant variables parameters
(`env_name`, `input_dims`, `n_actions`, `optimal_fit`) are changed accordingly.
Here, AI Gym's CartPole environment is chosen as an example.
"""
import numpy as np
import torch
import torch.distributions
import torch.nn.functional
import gym
max_gen_num = 25  # NOTE(review): presumably the number of optimizer generations — consumed outside this chunk
pop_size = 500  # NOTE(review): presumably the evolutionary population size — consumed outside this chunk
env_name = 'CartPole-v0'  # AI Gym environment id passed to gym.make below
input_dims = 4  # input layer size (dimension of the environment's state vector)
n_actions = 2  # output layer size (number of discrete actions)
optimal_fit = 200  # game automatically terminates after 200 time-steps
hidden_layers_units = [25, 10]  # The individual NN hidden layers
#########################################
# NN-model specific
# Walk consecutive pairs of layer widths (input -> hidden... -> output) to derive
# each weight matrix's shape and the total parameter count (weights + biases).
all_units = [input_dims] + hidden_layers_units + [n_actions]
layers_weights_shapes = []  # per-layer weight-matrix shapes: (out_units, in_units)
params_num = 0  # total number of parameters in the flat vector
for prev_units, curr_units in zip(all_units[:-1], all_units[1:]):
    layers_weights_shapes.append((curr_units, prev_units))
    params_num += curr_units * (prev_units + 1)  # +1 accounts for the bias vector
task_name = 'CartPole' + str(params_num) + 'D'
def split_model_params_vec(params_vec, shapes=None):
    """
    Split a flat NN parameter vector into per-layer (weights, bias) tensors.

    Layout assumed: for each layer in order, out*in weight entries followed by
    out bias entries.

    :param params_vec: 1-D torch tensor holding all weights and biases
    :param shapes: optional list of (out_units, in_units) weight-matrix shapes;
        defaults to the module-level `layers_weights_shapes`
    :return: params_by_layer: a list of layer-wise tuples: (layer's weight matrix, layer's bias vector)
    """
    if shapes is None:
        shapes = layers_weights_shapes
    params_by_layer = []
    end_pt = 0
    for out_units, in_units in shapes:
        # weight matrix: next out_units * in_units entries
        start_pt, end_pt = end_pt, end_pt + out_units * in_units
        weights = params_vec[start_pt:end_pt].view(out_units, in_units)
        # bias vector: next out_units entries
        start_pt, end_pt = end_pt, end_pt + out_units
        bias = params_vec[start_pt:end_pt]
        params_by_layer.append((weights, bias))
    return params_by_layer
def construct_model_and_pass_state(s, params_by_layer):
    """
    Runs a forward pass of the NN model defined by `params_by_layer`.

    Hidden layers use ReLU; the final layer's logits are turned into a
    probability distribution with softmax.

    :param s: input state (1-D torch tensor)
    :param params_by_layer: a list of layer-wise tuples: (layer's weight matrix, layer's bias vector)
    :return: probabilities over actions
    """
    x = s
    last = len(params_by_layer) - 1
    for i, (w, b) in enumerate(params_by_layer):
        x = torch.nn.functional.linear(x, w, b)
        if i == last:
            x = torch.softmax(x, dim=0)  # output layer -> action probabilities
        else:
            x = torch.relu(x)  # hidden-layer activation
    return x
#########################################
# RL specific
env = gym.make(env_name)  # shared environment instance, reused across fitness evaluations
def fitness_function(individual_params):
    """
    Evaluates an individual's fitness by playing one CartPole episode with its
    policy network and returning the accumulated reward (time-steps survived).

    :param individual_params: flat NN parameter vector (numpy array or torch tensor)
    :return: individual's fitness score
    """
    if isinstance(individual_params, np.ndarray):
        individual_params = torch.as_tensor(individual_params, dtype=torch.float32)
    layered_params = split_model_params_vec(individual_params)
    episode_over = False
    score = 0
    state = torch.from_numpy(env.reset()).float()
    while not episode_over:
        action_probs = construct_model_and_pass_state(state, layered_params)
        # sample an action from the policy's output distribution
        action = torch.distributions.Categorical(probs=action_probs).sample().item()
        next_obs, reward, episode_over, _ = env.step(action)
        state = torch.from_numpy(next_obs).float()
        score += reward
    return score