#!/usr/bin/env python
# coding: utf-8
"""
Implementation of a Soft Actor-Critic (SAC) agent using PyTorch.
See https://arxiv.org/pdf/1801.01290.pdf for algorithm details.
@author: Anjukan Kathirgamanathan 2020 (k.anjukan@gmail.com) and Kacper
Twardowski (kanexer@gmail.com)
Project for CityLearn Competition
"""
# Import Packages
import argparse
import numpy as np
import random
import itertools
import torch
from agent_SAC import SAC, RBC_Agent
from torch.utils.tensorboard import SummaryWriter
from citylearn import CityLearn
from pathlib import Path
import os, time, warnings
from PIL import Image
from torchvision.transforms import ToTensor
from algo_utils import graph_total, graph_building, tabulate_table
# Ignore the casting to float32 warnings
warnings.simplefilter("ignore", UserWarning)
"""
###################################
STEP 1: Set the Training Parameters
======
Command-line arguments control the random seed, the number of training
episodes, the number of initial random-exploration steps, and the
checkpointing interval.
"""
parser = argparse.ArgumentParser(description='PyTorch Soft Actor-Critic Args')
parser.add_argument('--seed', type=int, default=123456, metavar='N',
                    help='random seed (default: 123456)')
parser.add_argument('--num_episodes', type=int, default=100, metavar='N',
                    help='number of episodes to train for (default: 100)')
parser.add_argument('--start_steps', type=int, default=8760*1, metavar='N',
                    help='steps sampling random actions (default: 8760)')
parser.add_argument('--checkpoint_interval', type=int, default=10, metavar='N',
                    help='save a checkpoint with actor/critic weights every n episodes')
args = parser.parse_args()
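# Example invocation (assuming the CityLearn data directories described below
# are present in the working directory):
#   python train_SAC.py --seed 123456 --num_episodes 100 --start_steps 8760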
# Environment
# A single central agent controls all of the buildings in the district
climate_zone = 1
data_path = Path("data/Climate_Zone_"+str(climate_zone))
building_attributes = data_path / 'building_attributes.json'
weather_file = data_path / 'weather_data.csv'
solar_profile = data_path / 'solar_generation_1kW.csv'
building_state_actions = 'buildings_state_action_space.json'
building_ids = ["Building_1", "Building_2", "Building_3", "Building_4", "Building_5",
                "Building_6", "Building_7", "Building_8", "Building_9"]
objective_function = ['ramping','1-load_factor','average_daily_peak','peak_demand','net_electricity_consumption']
env = CityLearn(data_path, building_attributes, weather_file, solar_profile, building_ids,
                buildings_states_actions=building_state_actions, cost_function=objective_function,
                central_agent=True, verbose=0)
RBC_env = CityLearn(data_path, building_attributes, weather_file, solar_profile, building_ids,
                    buildings_states_actions=building_state_actions, cost_function=objective_function,
                    central_agent=False, verbose=0)
# These hold the lower and upper bounds of the states and actions; the agent uses them to normalize each variable to [0, 1].
# Can be obtained using observations_spaces[i].low or .high
observations_spaces, actions_spaces = env.get_state_action_spaces()
observations_spacesRBC, actions_spacesRBC = RBC_env.get_state_action_spaces()
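# A quick, illustrative way to inspect the bounds of the first building's
# observation space in the decentralized environment (each entry is a gym Box):
#   low, high = observations_spacesRBC[0].low, observations_spacesRBC[0].high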
# Provides information on Building type, Climate Zone, Annual DHW demand, Annual Cooling Demand, Annual Electricity Demand, Solar Capacity, and correlations among buildings
building_info = env.get_building_information()
"""
#############################################
STEP 2: Determine the size of the Action and State Spaces and the Number of Agents
The observation space consists of the variables enabled in the
buildings_state_action_space.json file. See
https://github.com/intelligent-environments-lab/CityLearn for more information
about the states. The central agent receives the observations of all buildings
(i.e. full communication between buildings).
Up to two continuous actions are available per building, controlling how much
to charge or discharge the cooling storage and DHW storage tanks.
"""
# Get number of buildings and agents in Environment
num_buildings = env.n_buildings
print('\nNumber of Buildings: ', num_buildings)
print('\nCentral Agent: ', env.central_agent)
# Print the size of the state observation space
print('\nSize of State: ', observations_spaces)
print('\n')
# Store the weights and scores in a new directory
parent_dir = "alg/sac_{}/".format(time.strftime("%Y%m%d-%H%M%S")) # apprends the timedate
os.makedirs(parent_dir, exist_ok=True)
# Create the final dir
final_dir = parent_dir+"final/"
os.makedirs(final_dir, exist_ok=True)
# Tensorboard writer object
writer = SummaryWriter(log_dir=parent_dir+'tensorboard/')
print("Logging to {}\n".format(parent_dir+'tensorboard/'))
# Set seeds (TODO: check that two runs with the same seed give identical results)
torch.manual_seed(args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)
# Get the Rule-Based Controller (RBC) baseline actions
agent = RBC_Agent(actions_spacesRBC)
state = RBC_env.reset()
state_list = []
action_list = []
doneRBC = False
while not doneRBC:
    # The RBC selects actions based only on the current hour of the day
    action = agent.select_action([list(RBC_env.buildings.values())[0].sim_results['hour'][RBC_env.time_step]])
    action_list.append(action)
    state_list.append(state)
    next_stateRBC, rewardsRBC, doneRBC, _ = RBC_env.step(action)
    state = next_stateRBC
RBC_action_base = np.array(action_list)
RBC_state_base = np.array(state_list)
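# Daily-peak baseline: net_electric_consumption holds one value per hourly step
# (8,759 for the simulated year), so a trailing zero pads it to 8,760 = 365 * 24
# entries before reshaping into days and taking each day's maximum.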
RBC_24h_peak = [day.max() for day in np.append(RBC_env.net_electric_consumption,0).reshape(-1, 24)]
"""
###################################
STEP 3: Create the SAC Agent from the Agent Class in agent_SAC.py
======
A SAC agent is initialized with the environment, the flattened state size, the
action space and the parsed command-line arguments. Additional flags constrain
the action space (central-agent mode only), encourage smooth actions, and
toggle evaluation mode and continued training from a saved model.
"""
# Agent
agent = SAC(env, env.observation_space.shape[0], env.action_space, args,
            constrain_action_space=env.central_agent, smooth_action_space=True,
            evaluate=False, continue_training=True)
"""
###################################
STEP 4: Run the SAC Training Sequence
During SAC training the agent learns, over repeated episodes of behaviour, to
map states to actions that maximize the rewards received through interaction
with the environment. Each episode involves the following:
(1) Reset the environment at the beginning of the episode.
(2) Observe the current state of the environment, s(t).
(3) Perform an action, a(t), in the environment given s(t).
(4) Observe the reward received and the state of the environment at
    time t+1 (i.e., s(t+1)).
(5) Add the transition to the replay buffer and update the networks
    (agent.add_to_buffer / agent.update_parameters).
(6) Update the episode score (total reward received) and set s(t) -> s(t+1).
(7) If the episode is done, repeat from (1); otherwise repeat from (3).
"""
# Training Loop
total_numsteps = 0
updates = 0
best_cost = 1.2  # lowest normalized total cost seen so far (lower is better); initial save threshold
# Measure the time taken for training
start_timer = time.time()
# The list of scores and rewards
score_list = []
reward_list = []
for i_episode in itertools.count(1):
    # Initialise episode rewards
    episode_reward = 0
    episode_peak_reward = 0
    episode_day_reward = 0
    episode_night_reward = 0
    episode_smooth_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    # For every step
    while not done:
        # If learning hasn't started yet, sample a random action
        if args.start_steps > total_numsteps:
            action = env.action_space.sample()
            agent.action_tracker.append(action)
        # Else sample an action from the policy
        else:
            action = agent.select_action(state)
        if len(agent.memory) > agent.batch_size:
            # Update the parameters of all the networks
            if total_numsteps % agent.update_interval == 0:
                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(total_numsteps)
                # Tensorboard: log policy metrics
                writer.add_scalar('loss/critic_1', critic_1_loss, total_numsteps)
                writer.add_scalar('loss/critic_2', critic_2_loss, total_numsteps)
                writer.add_scalar('loss/policy', policy_loss, total_numsteps)
                writer.add_scalar('loss/entropy_loss', ent_loss, total_numsteps)
                writer.add_scalar('entropy_temperature/alpha', alpha, total_numsteps)
        # Step the environment
        next_state, reward, done, _ = env.step(action)
        # Append the transition to memory (returns the shaped reward components)
        reward, r_peak, r_day, r_night, r_smooth = agent.add_to_buffer(state, action, reward, next_state, done)
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward
        episode_peak_reward += r_peak
        episode_day_reward += r_day
        episode_night_reward += r_night
        episode_smooth_reward += r_smooth
        state = next_state
    # Tensorboard: log reward values
    writer.add_scalar('Reward/Total', episode_reward, total_numsteps)
    writer.add_scalar('Reward/Peak', episode_peak_reward, total_numsteps)
    writer.add_scalar('Reward/Day_Charging', episode_day_reward, total_numsteps)
    writer.add_scalar('Reward/Night_Charging', episode_night_reward, total_numsteps)
    writer.add_scalar('Reward/Smooth_Actions', episode_smooth_reward, total_numsteps)
    # Tensorboard: log the CityLearn cost function components
    writer.add_scalar("Scores/ramping", env.cost()['ramping'], total_numsteps)
    writer.add_scalar("Scores/1-load_factor", env.cost()['1-load_factor'], total_numsteps)
    writer.add_scalar("Scores/average_daily_peak", env.cost()['average_daily_peak'], total_numsteps)
    writer.add_scalar("Scores/peak_demand", env.cost()['peak_demand'], total_numsteps)
    writer.add_scalar("Scores/net_electricity_consumption", env.cost()['net_electricity_consumption'], total_numsteps)
    writer.add_scalar("Scores/total", env.cost()['total'], total_numsteps)
    # Append the total score/reward to the lists
    score_list.append(env.cost()['total'])
    reward_list.append(episode_reward)
    # Log how much storage is utilised by taking the absolute sum of the
    # episode's actions (TODO: check that this works with multiple buildings)
    episode_actions = np.array(agent.action_tracker[-8759:])
    cooling = sum(abs(episode_actions[:, 0]))
    writer.add_scalar("Action/Cooling", cooling, total_numsteps)
    if agent.act_size[0] == 2:
        dhw = sum(abs(episode_actions[:, 1]))
        writer.add_scalar("Action/DHW", dhw, total_numsteps)
    writer.add_histogram("Action/Tracker", np.array(agent.action_tracker), total_numsteps)
    print("Episode: {}, total numsteps: {}, total cost: {}, reward: {}".format(
        i_episode, total_numsteps, round(env.cost()['total'], 5), round(episode_reward, 2)))
    # Periodically save the actor/critic networks as a checkpoint if this is the best model so far
    if i_episode % args.checkpoint_interval == 0:
        if env.cost()['total'] < best_cost:
            best_cost = env.cost()['total']
            print("Saving new best model to {}".format(parent_dir))
            agent.save_model(parent_dir)
    # Stop once the requested number of training episodes is complete
    if i_episode > args.num_episodes - 1:
        break
env.close()
timer = time.time() - start_timer
"""
###################################
STEP 5: POSTPROCESSING
"""
# Building to plot results for
building_number = building_ids[0]
# Save the score list to a .npy file
os.makedirs(f'{parent_dir}scores/', exist_ok=True)
with open(f'{parent_dir}scores/{args.seed}.npy', 'wb') as f:
    np_score = np.array(score_list)
    np.save(f, np_score)
# Save the reward list to a .npy file
os.makedirs(f'{parent_dir}rewards/', exist_ok=True)
with open(f'{parent_dir}rewards/{args.seed}.npy', 'wb') as f:
    np_reward = np.array(reward_list)
    np.save(f, np_reward)
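# The saved arrays can be reloaded later with np.load, e.g.:
#   scores = np.load(f'{parent_dir}scores/{args.seed}.npy')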
# Plot District level power consumption
graph_total(env=env, RBC_env=RBC_env, agent=agent, parent_dir=final_dir,
            start_date='2017-09-01', end_date='2017-09-10')
# Downscale the saved graphs to a quarter of their original size before logging
divide_lambda = lambda x: int(x / 4)
district_graph = Image.open(parent_dir + "final/" + "district.jpg")
district_graph = district_graph.resize(tuple(map(divide_lambda, district_graph.size)))
writer.add_image("Graph for District/Elec_Consumption", ToTensor()(district_graph))
district_RBC_comp_graph = Image.open(parent_dir + "final/" + "district_RBC_comp_daily_peak.jpg")
district_RBC_comp_graph = district_RBC_comp_graph.resize(tuple(map(divide_lambda, district_RBC_comp_graph.size)))
writer.add_image("Graph for District/Daily_Peak", ToTensor()(district_RBC_comp_graph))
action_index = 0
# Plot individual building power consumption and agent actions
for i, building in enumerate(building_ids):
    # Graph the building's energy consumption and the agent's behaviour
    graph_building(building_number=building, env=env, RBC_env=RBC_env, agent=agent,
                   parent_dir=final_dir, start_date='2017-09-01', end_date='2017-09-10',
                   action_index=action_index)
    # Add these graphs to the tensorboard
    train_graph = Image.open(parent_dir + "final/" + "train{}.jpg".format(building[-1]))
    train_graph = train_graph.resize(tuple(map(divide_lambda, train_graph.size)))
    action_graph = Image.open(parent_dir + "final/" + "actions{}.jpg".format(building[-1]))
    action_graph = action_graph.resize(tuple(map(divide_lambda, action_graph.size)))
    writer.add_image("Graph for {}/Train".format(building), ToTensor()(train_graph))
    writer.add_image("Graph for {}/Actions".format(building), ToTensor()(action_graph))
    # Advance the offset into the flattened central-agent action vector
    action_index += agent.act_size[i]
# Tabulate the run parameters in the training log
tabulate_table(env=env, timer=timer, algo="SAC", agent=agent, climate_zone=climate_zone,
               building_ids=building_ids, building_attributes=building_attributes,
               parent_dir=final_dir, num_episodes=i_episode, episode_scores=[episode_reward])