Module AssetAllocator.algorithms.SAC.agent
Expand source code
import math
import random
import sys
sys.path.append('../')
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
from .SAC import *
class SACAgent:
"""This is the agent class for the SAC Agent.
Original paper can be found at https://arxiv.org/abs/1802.09477
This implementation was adapted from https://github.com/higgsfield/RL-Adventure-2/blob/master/7.soft%20actor-critic.ipynb
"""
def __init__(self, env, hidden_dim = 256, value_lr = 3e-4, soft_q_lr = 3e-4, policy_lr = 3e-4,
gamma=0.99, mean_lambda=1e-3, std_lambda=1e-3, z_lambda=0.0, soft_tau=1e-2,
replay_buffer_size = 1_000_000, batch_size = 128, device = 'cpu'):
"""Initializes the TD3 Agent
Args:
env ([type]): Gym environment for the agent to interact with
hidden_dim (int, optional): Size of hidden layer neurons. Defaults to 256.
device (str, optional): One of cuda or cpu. Defaults to 'cuda'.
memory_dim ([type], optional): Size of replay buffer. Defaults to 100_000.
max_action (int, optional): Action scaling factor. Defaults to 1.
discount (float, optional): Reward discount factor. Defaults to 0.99.
update_freq (int, optional): Number of times to update targets networks. Defaults to 2.
tau (float, optional): Polyak averaging soft updates factor. Defaults to 0.005.
policy_noise_std (float, optional): Standard deviation of noise. Defaults to 0.2.
policy_noise_clip (float, optional): Clip value of noise. Defaults to 0.5.
actor_lr ([type], optional): Actor's learning rate. Defaults to 1e-3.
critic_lr ([type], optional): Critic's learning rate. Defaults to 1e-3.
batch_size (int, optional): Batch size for replay buffer and networks. Defaults to 128.
"""
self.env = env
self.action_dim = self.env.action_space.shape[0]
self.state_dim = self.env.observation_space.shape[0]
self.hidden_dim = hidden_dim
self.device = device
self.value_lr = value_lr
self.soft_q_lr = soft_q_lr
self.policy_lr = policy_lr
self.replay_buffer_size = replay_buffer_size
self.gamma = gamma
self.mean_lambda = mean_lambda
self.std_lambda = std_lambda
self.z_lambda = z_lambda
self.soft_tau = soft_tau
self.value_net = ValueNetwork(self.state_dim, self.hidden_dim).to(self.device)
self.target_value_net = ValueNetwork(self.state_dim, hidden_dim).to(self.device)
self.soft_q_net = SoftQNetwork(self.state_dim, self.action_dim, self.hidden_dim).to(self.device)
self.policy_net = PolicyNetwork(self.state_dim, self.action_dim,
self.hidden_dim, device = self.device).to(self.device)
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(param.data)
self.value_criterion = nn.MSELoss()
self.soft_q_criterion = nn.MSELoss()
self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.value_lr)
self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=self.soft_q_lr)
self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=self.policy_lr)
self.replay_buffer = ReplayBuffer(self.replay_buffer_size)
self.batch_size = batch_size
    def soft_q_update(self):
        """Performs a single SAC update step: samples a batch from the replay buffer,
        computes the soft Q, state-value, and policy losses, takes one optimizer step
        for each network, and soft-updates the target value network."""
state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
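        # Current Q(s, a) and V(s) estimates for the batch, plus a fresh action
        # (and its log-probability) sampled from the current policy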
expected_q_value = self.soft_q_net(state, action)
expected_value = self.value_net(state)
new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
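        # Soft Bellman target for the Q-network: r + gamma * (1 - done) * V_target(s')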
target_value = self.target_value_net(next_state)
next_q_value = reward + (1 - done) * self.gamma * target_value
q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
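        # The value network regresses toward Q(s, a_new) - log pi(a_new | s),
        # i.e. the soft value of the state under the current policy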
expected_new_q_value = self.soft_q_net(state, new_action)
next_value = expected_new_q_value - log_prob
value_loss = self.value_criterion(expected_value, next_value.detach())
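        # Policy loss in the likelihood-ratio form used by the referenced implementation,
        # plus small L2 penalties on the policy mean, log-std, and pre-tanh activation z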
log_prob_target = expected_new_q_value - expected_value
policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
mean_loss = self.mean_lambda * mean.pow(2).mean()
std_loss = self.std_lambda * log_std.pow(2).mean()
z_loss = self.z_lambda * z.pow(2).sum(1).mean()
policy_loss += mean_loss + std_loss + z_loss
self.soft_q_optimizer.zero_grad()
q_value_loss.backward()
self.soft_q_optimizer.step()
self.value_optimizer.zero_grad()
value_loss.backward()
self.value_optimizer.step()
self.policy_optimizer.zero_grad()
policy_loss.backward()
self.policy_optimizer.step()
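        # Soft (Polyak) update of the target value network with factor soft_tau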
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau
)
def learn(self, timesteps, print_every = 100):
"""Helper method to train agent
Args:
total_steps (int): Total steps the agent has taken
timesteps (int): Total timesteps the agent has interacted for
print_every (int): Verbosity control iteration (int): Number of training iterations
"""
idx = 0
flag = False
count_of_dones = 0
while idx < timesteps:
state = self.env.reset()
ep_reward = 0
done = False
while not done:
action = self.policy_net.get_action(state)
next_state, reward, done, _ = self.env.step(action)
self.replay_buffer.push(state, action, reward, next_state, done)
if len(self.replay_buffer) > self.batch_size:
self.soft_q_update()
state = next_state
ep_reward += reward
idx += 1
if done:
count_of_dones += 1
flag = True
if flag and count_of_dones % print_every == 0:
print(f'Score at timestep {idx}: {ep_reward}.')
flag = False
if idx > timesteps:
break
def predict(self, state):
"""Returns agent's action based on a given state
Args:
state (array_like): Current environment state
Returns:
action (array_like): Agent's action
"""
action = self.policy_net.get_action(state)
return action
def save(self, filename):
"""
Saves trained model
Params
=====
        filename (str) : path prefix for the saved checkpoint files
"""
torch.save(self.value_net.state_dict(), filename + '_value_net')
torch.save(self.value_optimizer.state_dict(), filename + '_value_optimizer')
torch.save(self.soft_q_net.state_dict(), filename + '_soft_q_net')
torch.save(self.soft_q_optimizer.state_dict(), filename + '_soft_q_optimizer')
torch.save(self.policy_net.state_dict(), filename + '_policy_net')
torch.save(self.policy_optimizer.state_dict(), filename + '_policy_optimizer')
def load(self, filename):
"""
Loads trained model
Params
=====
        filename (str) : path prefix of a previously saved checkpoint
"""
self.value_net.load_state_dict(torch.load(filename + '_value_net'))
self.value_optimizer.load_state_dict(torch.load(filename + '_value_optimizer'))
self.soft_q_net.load_state_dict(torch.load(filename + '_soft_q_net'))
self.soft_q_optimizer.load_state_dict(torch.load(filename + '_soft_q_optimizer'))
self.policy_net.load_state_dict(torch.load(filename + '_policy_net'))
self.policy_optimizer.load_state_dict(torch.load(filename + '_policy_optimizer'))
self.target_value_net.load_state_dict(self.value_net.state_dict())
Classes
class SACAgent (env, hidden_dim=256, value_lr=0.0003, soft_q_lr=0.0003, policy_lr=0.0003, gamma=0.99, mean_lambda=0.001, std_lambda=0.001, z_lambda=0.0, soft_tau=0.01, replay_buffer_size=1000000, batch_size=128, device='cpu')
-
Agent class for the Soft Actor-Critic (SAC) algorithm. The original paper can be found at https://arxiv.org/abs/1801.01290 and this implementation was adapted from https://github.com/higgsfield/RL-Adventure-2/blob/master/7.soft%20actor-critic.ipynb
Initializes the SAC Agent
Args
env : gym.Env
    Gym environment for the agent to interact with
hidden_dim : int, optional
    Size of hidden layer neurons. Defaults to 256.
value_lr : float, optional
    Learning rate for the value network. Defaults to 3e-4.
soft_q_lr : float, optional
    Learning rate for the soft Q-network. Defaults to 3e-4.
policy_lr : float, optional
    Learning rate for the policy network. Defaults to 3e-4.
gamma : float, optional
    Reward discount factor. Defaults to 0.99.
mean_lambda : float, optional
    Weight of the policy-mean regularization term. Defaults to 1e-3.
std_lambda : float, optional
    Weight of the log-std regularization term. Defaults to 1e-3.
z_lambda : float, optional
    Weight of the pre-tanh activation regularization term. Defaults to 0.0.
soft_tau : float, optional
    Polyak averaging factor for target network soft updates. Defaults to 1e-2.
replay_buffer_size : int, optional
    Maximum size of the replay buffer. Defaults to 1_000_000.
batch_size : int, optional
    Batch size for replay buffer sampling and network updates. Defaults to 128.
device : str, optional
    One of 'cuda' or 'cpu'. Defaults to 'cpu'.
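A minimal usage sketch (the environment id and checkpoint prefix are illustrative; any continuous-action Gym environment with Box observation and action spaces should work):

import gym
from AssetAllocator.algorithms.SAC.agent import SACAgent

# Classic Gym API (reset returns the state, step returns a 4-tuple), matching this implementation
env = gym.make('Pendulum-v1')

agent = SACAgent(env, hidden_dim=256, batch_size=128, device='cpu')
agent.learn(timesteps=10_000, print_every=10)   # print the episode score every 10 completed episodes

state = env.reset()
action = agent.predict(state)                   # action for a single state
agent.save('sac')                               # writes sac_value_net, sac_policy_net, ... etc.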
Methods
def learn(self, timesteps, print_every=100)
-
Trains the agent by interacting with the environment
Args
timesteps : int
    Total number of environment steps to train for
print_every : int, optional
    Print the episode score every `print_every` completed episodes. Defaults to 100.
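Gradient updates start once the replay buffer holds more than batch_size transitions; a usage sketch (values illustrative):

agent.learn(timesteps=50_000, print_every=10)   # report the episode score every 10 completed episodes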
def load(self, filename)
-
Loads a trained model
filename (str) : path prefix of a previously saved checkpoint; the target value network is re-initialized from the loaded value network
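load expects the same filename prefix that was passed to save. A sketch (prefix illustrative):

agent = SACAgent(env)   # network sizes must match the saved checkpoint
agent.load('sac')       # reads sac_value_net, sac_soft_q_net, sac_policy_net and their optimizers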
def predict(self, state)
-
Returns agent's action based on a given state
Args
state : array_like
    Current environment state
Returns
action : array_like
    Agent's action
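A short evaluation sketch built on predict, using the same Gym step/reset convention as the training loop:

state = env.reset()
done, episode_reward = False, 0.0
while not done:
    action = agent.predict(state)
    state, reward, done, _ = env.step(action)
    episode_reward += reward
print(f'Evaluation episode reward: {episode_reward}')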
def save(self, filename)
-
Saves the trained model
filename (str) : path prefix for the saved checkpoint files
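save treats filename as a prefix and writes six files, one per network and optimizer; for example (prefix illustrative):

agent.save('sac')
# -> sac_value_net, sac_value_optimizer, sac_soft_q_net,
#    sac_soft_q_optimizer, sac_policy_net, sac_policy_optimizer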
def soft_q_update(self)
-
Performs a single SAC update step: samples a batch from the replay buffer, computes the soft Q, state-value, and policy losses, takes one optimizer step for each network, and soft-updates the target value network via Polyak averaging.