Module AssetAllocator.algorithms.REINFORCE.agent
import argparse
import math
import os
import numpy as np
import gym
from gym import wrappers
import torch
from torch.autograd import Variable
import torch.nn.utils as utils
from .reinforce_continuous import REINFORCE
from .normalized_actions import check_and_normalize_box_actions
default_device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
def softmax(x, axis=0):
# Use the LogSumExp Trick
max_val = np.amax(x, axis=axis, keepdims=True)
x = x - max_val
# Softmax
num = np.exp(x)
denum = num.sum(axis=axis, keepdims=True)
softmax = num/denum
return softmax
class REINFORCEAgent:
"""
Helper class to manage and train a REINFORCE agent
"""
def __init__(self, env, device=default_device):
"""Initializes the REINFORCE Agent
Args:
env (gym object): Gym environment for the agent to interact with
device (torch.device, optional): Device to run the model on. Defaults to CUDA when available, otherwise CPU.
"""
torch.manual_seed(env.seed)
np.random.seed(env.seed)
hidden_size = 128
self.env = env
self.gamma = 0.99
self.agent = REINFORCE(
hidden_size, env.observation_space.shape[0], env.action_space)
def train(self, timesteps, print_every):
"""Helper method to train the agent
Args:
timesteps (int): Total number of timesteps to train the agent for
print_every (int): Verbosity control; the episode score is printed every print_every completed episodes
"""
reward_history = [] # tracks the reward per episode
best_score = -np.inf
epochs = timesteps//self.env.episode_length + 1  # episodes needed to cover the requested timesteps
total_steps = 0
flag = False  # set when an episode has just finished, to gate progress printing
count_of_dones = 0  # number of completed episodes
for epoch in range(epochs):
done = False
state = torch.Tensor([self.env.reset()])
entropies = []
log_probs = []
rewards = []
ep_reward = 0
while not done:
action, log_prob, entropy = self.agent.select_action(state)
action = action.cpu()
action = softmax(action.numpy()[0])
next_state, reward, done, _ = self.env.step(action)
entropies.append(entropy)
log_probs.append(log_prob)
rewards.append(reward)
ep_reward += reward
state = torch.Tensor([next_state])
total_steps += 1
if done:
count_of_dones += 1
flag = True
if flag and count_of_dones % print_every == 0:
print(f'Score at timestep {total_steps}: {ep_reward}.')
flag = False
if total_steps >= timesteps:
break
self.agent.update_parameters(
rewards, log_probs, entropies, self.gamma)
self.env.close()
def learn(self, timesteps, print_every=100):
"""
Trains the agent
Params
======
timesteps (int): Number of timesteps the agent should interact with the environment
print_every (int): Verbosity control
"""
self.agent.model.train()
self.train(timesteps, print_every)
def predict(self, state):
"""Returns agent's action based on a given state
Args:
state (array_like): Current environment state
Returns:
action (array_like): Agent's action
"""
self.agent.model.eval()
state = torch.from_numpy(state).float()
action, log_prob, entropy = self.agent.select_action(state)
normalized_action = softmax(action.cpu().numpy())
return normalized_action
# return action
def save(self, file_name):
"""
Saves trained model
Params
=====
file_name (str): Path to save the model weights to
"""
torch.save(self.agent.model.state_dict(), file_name)
def load(self, file_name):
"""
Loads trained model
Params
=====
file_name (str): Path to load the model weights from
"""
self.agent.model.load_state_dict(torch.load(file_name))
Functions
def softmax(x, axis=0)
-
Computes a numerically stable softmax of x along the given axis by subtracting the per-axis maximum before exponentiating (the log-sum-exp trick).
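A minimal usage sketch showing why the max-subtraction matters: a naive np.exp(x) would overflow for these inputs, while the stabilized version returns valid weights.

import numpy as np
from AssetAllocator.algorithms.REINFORCE.agent import softmax

x = np.array([1000.0, 1001.0, 1002.0])  # np.exp(x) alone would overflow to inf
weights = softmax(x)
print(weights)        # approximately [0.090, 0.245, 0.665]
print(weights.sum())  # 1.0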
Classes
class REINFORCEAgent (env, device=default_device)
-
Helper class to manage and train a REINFORCE agent
Initializes the REINFORCE Agent
Args
env (gym object): Gym environment for the agent to interact with
device (torch.device, optional): Device to run the model on. Defaults to CUDA when available, otherwise CPU.
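A construction sketch; make_portfolio_env is a hypothetical factory standing in for whichever environment you use. The environment must expose seed and episode_length attributes in addition to the usual gym observation_space, action_space, reset, step, and close interface.

from AssetAllocator.algorithms.REINFORCE.agent import REINFORCEAgent

env = make_portfolio_env(seed=42)  # hypothetical gym-style environment factory
agent = REINFORCEAgent(env)        # picks CUDA automatically when available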
Methods
def learn(self, timesteps, print_every=100)
-
Trains the agent
Params
timesteps (int): Number of timesteps the agent should interact with the environment
print_every (int): Verbosity control
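A short usage sketch, continuing from the construction example above; the timestep and verbosity values are illustrative.

agent.learn(timesteps=50_000, print_every=10)  # print the score every 10 finished episodes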
def load(self, file_name)
-
Loads trained model
Params
file_name (str): Path to load the model weights from
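A usage sketch; the checkpoint path is hypothetical and should point at a file previously written by save().

agent.load('checkpoints/reinforce.pt')  # hypothetical path written earlier by save()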
def predict(self, state)
-
Returns agent's action based on a given state
Args
state (array_like): Current environment state
Returns
action (array_like): Agent's action, softmax-normalized so its entries sum to 1
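A usage sketch, continuing from the construction example and assuming the environment returns a 1-D numpy observation; the returned weights are softmax-normalized.

import numpy as np

state = env.reset()  # env from the construction sketch above
weights = agent.predict(np.asarray(state, dtype=np.float32))
print(weights.sum())  # 1.0 -- the entries form a valid allocation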
def save(self, file_name)
-
Saves trained model
Params
file_name (str): Path to save the model weights to
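A usage sketch; the path is hypothetical, and its parent directory must already exist since torch.save does not create folders.

agent.save('checkpoints/reinforce.pt')  # hypothetical checkpoint path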
def train(self, timesteps, print_every)
-
Helper method to train the agent
Args
timesteps (int): Total number of timesteps to train the agent for
print_every (int): Verbosity control; the episode score is printed every print_every completed episodes
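For reference, the episode budget is derived from the requested timesteps and the environment's episode_length, with the loop also stopping early once total_steps reaches timesteps; a small sketch of that arithmetic with illustrative numbers:

timesteps = 50_000
episode_length = 252                      # illustrative, e.g. one trading year per episode
epochs = timesteps // episode_length + 1  # 199 episodes scheduled
print(epochs)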