Module AssetAllocator.algorithms.PPO.agent

Source code
import numpy as np
import time
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.distributions import MultivariateNormal
import torch.nn.functional as F

class PPOAgent:
        """
                This is the agent class for the Proximal Policy Optimization Algorithm.
  
        Original paper can be found at https://arxiv.org/abs/1707.06347

        This implementation was adapted from https://github.com/ericyangyu/PPO-for-Beginners
    
        """
        def __init__(self, env, device = 'cuda', timesteps_per_batch = 50_000, max_timesteps_per_episode = 2_000,
                n_updates_per_iteration = 5, lr = 0.005, gamma = 0.95, clip = 0.2,
                render = False, seed = None):
                """
                        Initializes the PPO model, including hyperparameters.

                        Parameters:
                        ==========
                                policy_class - the policy class to use for our actor/critic networks.
                                env (PortfolioGymEnv): instance of environment
                                hyperparameters - all extra arguments passed into PPO that should be hyperparameters.

                        Returns:
                                None
                """

                # Initialize hyperparameters for training with PPO
                # Initialize default values for hyperparameters
                # Algorithm hyperparameters
                self.timesteps_per_batch = timesteps_per_batch                 # Number of timesteps to run per batch
                self.max_timesteps_per_episode = max_timesteps_per_episode          # Max number of timesteps per episode
                self.n_updates_per_iteration = n_updates_per_iteration              # No of times to update actor/critic per iteration
                self.lr = lr                                 # Learning rate of actor optimizer
                self.gamma = gamma                               # Discount factor to be applied when calculating Rewards-To-Go
                self.clip = clip                              # Recommended 0.2, helps define the threshold to clip the ratio during SGA

                # Miscellaneous parameters
                self.render = render
                self.render_every_i = 10                        # How often (in iterations) to render; default value, referenced in step()
                self.seed = seed


                # Set the seed if specified
                if self.seed is not None:
                        # Check that the seed is valid first
                        assert isinstance(self.seed, int)

                        # Set the seed
                        torch.manual_seed(self.seed)
                        print(f"Successfully set seed to {self.seed}")
                self.device = device

                # Extract environment information
                self.env = env
                self.obs_dim = env.observation_space.shape[0]
                self.act_dim = env.action_space.shape[0]

                # Initialize actor and critic networks
                self.actor = FeedForwardNN(self.obs_dim, self.act_dim, device).to(device)                                                   # ALG STEP 1
                self.critic = FeedForwardNN(self.obs_dim, 1, device).to(device)

                # Initialize optimizers for actor and critic
                self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
                self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

                # Initialize the covariance matrix used to query the actor for actions.
                # It lives on the same device as the networks so evaluate() can build the distribution directly.
                self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5).to(device)
                self.cov_mat = torch.diag(self.cov_var)

                # This logger will help us with printing out summaries of each iteration
                self.logger = {
                        'delta_t': time.time_ns(),
                        't_so_far': 0,          # timesteps so far
                        'i_so_far': 0,          # iterations so far
                        'batch_lens': [],       # episodic lengths in batch
                        'batch_rews': [],       # episodic returns in batch
                        'actor_losses': [],     # losses of actor network in current iteration
                }

        def learn(self, timesteps, print_every = 1):
                """
                        Train the actor and critic networks. Here is where the main PPO algorithm resides.

                        Parameters:
                        ===========
                                total_timesteps - the total number of timesteps to train for
                        print_every (int): Verbosity control

                        Return:
                                None
                """
                print(f"Learning... Running {self.max_timesteps_per_episode} timesteps per episode, ", end='')
                print(f"{self.timesteps_per_batch} timesteps per batch for a total of {timesteps} timesteps")
                t_so_far = 0 # Timesteps simulated so far
                i_so_far = 0 # Iterations ran so far
                self.print_every = print_every
                while t_so_far < timesteps:                                                                       # ALG STEP 2
                        # We collect our batch simulations here
                        batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.step()                     # ALG STEP 3

                        # Calculate how many timesteps we collected this batch
                        t_so_far += np.sum(batch_lens)

                        # Increment the number of iterations
                        i_so_far += 1

                        # Logging timesteps so far and iterations so far
                        self.logger['t_so_far'] = t_so_far
                        self.logger['i_so_far'] = i_so_far

                        # Calculate advantage at k-th iteration
                        V, _ = self.evaluate(batch_obs, batch_acts)
                        A_k = batch_rtgs - V.detach()                                                                       # ALG STEP 5

                        A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

                        # This is the loop where we update our network for some n epochs
                        for _ in range(self.n_updates_per_iteration):                                                       # ALG STEP 6 & 7
                                # Calculate V_phi and pi_theta(a_t | s_t)
                                V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

                                # Calculate the ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t)
                                # NOTE: we just subtract the logs, which is the same as
                                # dividing the values and then canceling the log with e^log.
                                # For why we use log probabilities instead of actual probabilities,
                                # here's a great explanation: 
                                # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms
                                # TL;DR makes gradient ascent easier behind the scenes.
                                ratios = torch.exp(curr_log_probs - batch_log_probs)

                                # Calculate surrogate losses.
                                surr1 = ratios * A_k
                                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

                                # Calculate actor and critic losses.
                                # NOTE: we take the negative min of the surrogate losses because we're trying to maximize
                                # the performance function, but Adam minimizes the loss. So minimizing the negative
                                # performance function maximizes it.
                                actor_loss = (-torch.min(surr1, surr2)).mean()
                                critic_loss = nn.MSELoss()(V, batch_rtgs)

                                # Calculate gradients and perform backward propagation for actor network
                                self.actor_optim.zero_grad()
                                actor_loss.backward(retain_graph=True)
                                self.actor_optim.step()

                                # Calculate gradients and perform backward propagation for critic network
                                self.critic_optim.zero_grad()
                                critic_loss.backward()
                                self.critic_optim.step()

                                # Log actor loss
                                self.logger['actor_losses'].append(actor_loss.detach())

                                
        def save(self):
                """
        Saves trained model
        
                Parameters:
                        None        

        """
                torch.save(self.actor, './ppo_actor.pth')
                torch.save(self.critic, './ppo_critic.pth')


        def step(self):
                """
                        This is where we collect the batch of data from simulation. 
                        Since this is an on-policy algorithm, we'll need to collect a fresh batch
                        of data each time we iterate the actor/critic networks.

                        Parameters:
                                None

                        Return:
                                batch_obs - the observations collected this batch. Shape: (number of timesteps, dimension of observation)
                                batch_acts - the actions collected this batch. Shape: (number of timesteps, dimension of action)
                                batch_log_probs - the log probabilities of each action taken this batch. Shape: (number of timesteps)
                                batch_rtgs - the Rewards-To-Go of each timestep in this batch. Shape: (number of timesteps)
                                batch_lens - the lengths of each episode this batch. Shape: (number of episodes)
                """
                # Batch data. For more details, check function header.
                batch_obs = []
                batch_acts = []
                batch_log_probs = []
                batch_rews = []
                batch_rtgs = []
                batch_lens = []

                # Episodic data. Keeps track of rewards per episode, will get cleared
                # upon each new episode
                ep_rews = []

                t = 0 # Keeps track of how many timesteps we've run so far this batch
                count_of_dones = 0 # Number of episodes completed so far this batch

                # Keep simulating until we've run more than or equal to specified timesteps per batch
                while t < self.timesteps_per_batch:
                        ep_rews = [] # rewards collected per episode

                        # Reset the environment. Note that obs is short for observation. 
                        obs = self.env.reset()
                        done = False    

                        # Run an episode for a maximum of max_timesteps_per_episode timesteps
                        for ep_t in range(self.max_timesteps_per_episode):
                                # If render is specified, render the environment
                                if self.render and (self.logger['i_so_far'] % self.render_every_i == 0) and len(batch_lens) == 0:
                                        self.env.render()

                                t += 1 # Increment timesteps ran this batch so far

                                # Track observations in this batch
                                batch_obs.append(obs)

                                # Calculate an action and make a step in the env.
                                # Note that rew is short for reward.
                                action, log_prob = self.get_action(obs)

                                # The raw Gaussian sample is what log_prob refers to, so that is what gets
                                # stored for the PPO update; the environment receives a softmax-normalized
                                # copy as portfolio weights.
                                env_action = torch.nn.Softmax(dim=0)(torch.Tensor(action)).numpy()
                                obs, rew, done, _ = self.env.step(env_action)

                                # Track recent reward, raw action, and action log probability
                                ep_rews.append(rew)
                                batch_acts.append(action)
                                batch_log_probs.append(log_prob)

                                # If the environment tells us the episode has terminated, report the score and break
                                if done:
                                        count_of_dones += 1
                                        if count_of_dones % self.print_every == 0:
                                                print(f'Score at timestep {t}: {sum(ep_rews)}.')
                                        break

                    
                        # Track episodic lengths and rewards
                        batch_lens.append(ep_t + 1)
                        batch_rews.append(ep_rews)

                # Reshape data as tensors in the shape specified in function description, before returning.
                # Everything is moved to the training device so learn() and evaluate() can use it directly.
                batch_obs = torch.tensor(np.array(batch_obs), dtype=torch.float).to(self.device)
                batch_acts = torch.tensor(np.array(batch_acts), dtype=torch.float).to(self.device)
                batch_log_probs = torch.stack(batch_log_probs).float().to(self.device)
                batch_rtgs = self.compute_rtgs(batch_rews).to(self.device)                                              # ALG STEP 4

                # Log the episodic returns and episodic lengths in this batch.
                self.logger['batch_rews'] = batch_rews
                self.logger['batch_lens'] = batch_lens

                return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens


        def load(self, env, actor_model):
                """
                        Load a trained actor model.

                        Parameters
                        ==========
                                env (PortfolioGymEnv): instance of environment, used to size the network
                                actor_model (str): path to the saved actor state dict, e.g. './ppo_actor.pth'
                """
                # Extract out dimensions of observation and action spaces
                obs_dim = env.observation_space.shape[0]
                act_dim = env.action_space.shape[0]

                # Build the policy the same way the actor is built in __init__
                policy = FeedForwardNN(obs_dim, act_dim, self.device).to(self.device)

                # Load in the actor weights saved by save() and use them as the current actor
                policy.load_state_dict(torch.load(actor_model, map_location=self.device))
                self.actor = policy


        def compute_rtgs(self, batch_rews):
                """
                        Compute the Reward-To-Go of each timestep in a batch given the rewards.

                        Parameters:
                                batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode)

                        Return:
                                batch_rtgs - the rewards to go, Shape: (number of timesteps in batch)
                """
                # The rewards-to-go (rtg) per episode per batch to return.
                # The shape will be (num timesteps in batch)
                batch_rtgs = []

                # Iterate through each episode
                for ep_rews in reversed(batch_rews):

                        discounted_reward = 0 # The discounted reward so far

                        # Iterate through all rewards in the episode. We go backwards for smoother calculation of each
                        # discounted return (think about why it would be harder starting from the beginning)
                        for rew in reversed(ep_rews):
                                discounted_reward = rew + discounted_reward * self.gamma
                                batch_rtgs.insert(0, discounted_reward)

                # Convert the rewards-to-go into a tensor
                batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float)

                return batch_rtgs

        def predict(self, state):
                """
                        Queries an action from the actor network, should be called from step.

                        Parameters:
                                state - the observation at the current timestep

                        Return:
                                action - the action to take, as a numpy array
                """
                action, _ = self.get_action(state)
                return torch.nn.Softmax(dim = 0)(torch.Tensor(action)).numpy()
    
        def get_action(self, obs):
                """
                        Queries an action from the actor network. Should be called from step.

                        Parameters:
                                obs - the observation at the current timestep

                        Return:
                                action - the action to take, as a numpy array
                                log_prob - the log probability of the selected action in the distribution
                """
                # Query the actor network for a mean action
                mean = self.actor(obs)

                # Create a distribution with the mean action and std from the covariance matrix above.
                # For more information on how this distribution works, check out Andrew Ng's lecture on it:
                # https://www.youtube.com/watch?v=JjB58InuTqM
                dist = MultivariateNormal(mean.to(self.device), self.cov_mat.to(self.device))

                # Sample an action from the distribution
                action = dist.sample()

                # Calculate the log probability for that action
                log_prob = dist.log_prob(action)

                # Return the sampled action and the log probability of that action in our distribution
                return action.detach().cpu().numpy(), log_prob.detach()

        def evaluate(self, batch_obs, batch_acts):
                """
                        Estimate the values of each observation, and the log probs of
                        each action in the most recent batch with the most recent
                        iteration of the actor network. Should be called from learn.

                        Parameters:
                                batch_obs - the observations from the most recently collected batch as a tensor.
                                                        Shape: (number of timesteps in batch, dimension of observation)
                                batch_acts - the actions from the most recently collected batch as a tensor.
                                                        Shape: (number of timesteps in batch, dimension of action)

                        Return:
                                V - the predicted values of batch_obs
                                log_probs - the log probabilities of the actions taken in batch_acts given batch_obs
                """
                # Query critic network for a value V for each batch_obs. Shape of V should be same as batch_rtgs
                V = self.critic(batch_obs).squeeze()

                # Calculate the log probabilities of batch actions using most recent actor network.
                # This segment of code is similar to that in get_action()
                mean = self.actor(batch_obs)
                dist = MultivariateNormal(mean, self.cov_mat)
                log_probs = dist.log_prob(batch_acts)

                # Return the value vector V of each observation in the batch
                # and log probabilities log_probs of each action in the batch
                return V, log_probs

class FeedForwardNN(nn.Module):
        """
                A standard in_dim-64-64-out_dim Feed Forward Neural Network.
        """
        def __init__(self, in_dim, out_dim, device = 'cuda'):
                """
                        Initialize the network and set up the layers.

                        Parameters:
                                in_dim - input dimensions as an int
                                out_dim - output dimensions as an int

                        Return:
                                None
                """
                super(FeedForwardNN, self).__init__()
                self.device = device
                self.layer1 = nn.Linear(in_dim, 64)
                self.layer2 = nn.Linear(64, 64)
                self.layer3 = nn.Linear(64, out_dim)

        def forward(self, obs):
                """
                        Runs a forward pass on the neural network.

                        Parameters:
                                obs - observation to pass as input

                        Return:
                                output - the output of our forward pass
                """
                # Convert the observation to a float tensor and move it to the network's device
                if isinstance(obs, np.ndarray):
                        obs = torch.tensor(obs, dtype=torch.float)
                obs = obs.to(self.device)

                activation1 = F.relu(self.layer1(obs))
                activation2 = F.relu(self.layer2(activation1))
                output = self.layer3(activation2)

                return output

Classes

class FeedForwardNN (in_dim, out_dim, device='cuda')

A standard in_dim-64-64-out_dim Feed Forward Neural Network.

Initialize the network and set up the layers.

Parameters

in_dim - input dimensions as an int
out_dim - output dimensions as an int

Return

None
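A minimal sketch of using the network on its own (the dimensions below are made up for illustration; the forward pass accepts either a numpy array or a tensor):

import numpy as np
from AssetAllocator.algorithms.PPO.agent import FeedForwardNN

net = FeedForwardNN(in_dim=10, out_dim=4, device='cpu')
obs = np.random.randn(10).astype(np.float32)   # one flat observation (hypothetical size)
out = net(obs)                                 # numpy input is converted to a tensor internally
print(out.shape)                               # torch.Size([4])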


Ancestors

  • torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, obs)

Runs a forward pass on the neural network.

Parameters

obs - observation to pass as input

Return

output - the output of our forward pass

class PPOAgent (env, device='cuda', timesteps_per_batch=50000, max_timesteps_per_episode=2000, n_updates_per_iteration=5, lr=0.005, gamma=0.95, clip=0.2, render=False, seed=None)

Agent class for the Proximal Policy Optimization (PPO) algorithm.

Original paper can be found at https://arxiv.org/abs/1707.06347

This implementation was adapted from https://github.com/ericyangyu/PPO-for-Beginners

Initializes the PPO model, including hyperparameters.

Parameters:

    env (PortfolioGymEnv): instance of the environment
    device (str): device to run the networks on ('cuda' or 'cpu')
    timesteps_per_batch (int): number of timesteps to collect per batch
    max_timesteps_per_episode (int): maximum number of timesteps per episode
    n_updates_per_iteration (int): number of actor/critic updates per iteration
    lr (float): learning rate of the actor and critic optimizers
    gamma (float): discount factor applied when calculating Rewards-To-Go
    clip (float): PPO clipping threshold (0.2 recommended)
    render (bool): whether to render the environment during rollouts
    seed (int): random seed for reproducibility

Returns

None
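A minimal end-to-end sketch of how the agent is intended to be used. The construction of PortfolioGymEnv is environment-specific and therefore only indicated; the training budget and hyperparameters below are illustrative:

import torch
from AssetAllocator.algorithms.PPO.agent import PPOAgent

# env = PortfolioGymEnv(...)   # any Gym-style environment with Box observation/action spaces

device = 'cuda' if torch.cuda.is_available() else 'cpu'
agent = PPOAgent(env, device=device, timesteps_per_batch=5_000,
                 max_timesteps_per_episode=500, lr=0.005, gamma=0.95)

agent.learn(timesteps=50_000, print_every=10)   # collect batches and run PPO updates
agent.save()                                    # writes ./ppo_actor.pth and ./ppo_critic.pth

obs = env.reset()
weights = agent.predict(obs)                    # softmax-normalized portfolio weights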

Methods

def compute_rtgs(self, batch_rews)

Compute the Reward-To-Go of each timestep in a batch given the rewards.

Parameters

batch_rews - the rewards in a batch, Shape: (number of episodes, number of timesteps per episode)

Return

batch_rtgs - the rewards to go, Shape: (number of timesteps in batch)
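A small worked example of the recursion used here (discounted_reward = rew + gamma * discounted_reward, applied from the last timestep backwards; the rewards are arbitrary):

gamma = 0.95
ep_rews = [1.0, 2.0, 3.0]

rtgs = []
discounted_reward = 0.0
for rew in reversed(ep_rews):
    discounted_reward = rew + gamma * discounted_reward
    rtgs.insert(0, discounted_reward)

print(rtgs)   # [5.6075, 4.85, 3.0] -- what compute_rtgs returns (as a tensor) for this one episode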

def evaluate(self, batch_obs, batch_acts)

Estimate the values of each observation, and the log probs of each action in the most recent batch with the most recent iteration of the actor network. Should be called from learn.

Parameters

batch_obs - the observations from the most recently collected batch as a tensor. Shape: (number of timesteps in batch, dimension of observation)
batch_acts - the actions from the most recently collected batch as a tensor. Shape: (number of timesteps in batch, dimension of action)

Return

V - the predicted values of batch_obs
log_probs - the log probabilities of the actions taken in batch_acts given batch_obs

def get_action(self, obs)

Queries an action from the actor network. Should be called from step.

Parameters

obs - the observation at the current timestep

Return

action - the action to take, as a numpy array
log_prob - the log probability of the selected action in the distribution
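Exploration comes from a fixed diagonal Gaussian centred on the actor's output. A standalone sketch of that sampling step, with a made-up mean vector for three assets:

import torch
from torch.distributions import MultivariateNormal

mean = torch.tensor([0.2, -0.1, 0.4])           # hypothetical actor output for 3 assets
cov_mat = torch.diag(torch.full((3,), 0.5))     # fixed diagonal covariance (variance 0.5)

dist = MultivariateNormal(mean, cov_mat)
action = dist.sample()                          # raw (un-normalized) action
log_prob = dist.log_prob(action)                # scalar log density of that sample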

def learn(self, timesteps, print_every=1)

Train the actor and critic networks. Here is where the main PPO algorithm resides.

Parameters:

    timesteps (int): the total number of environment timesteps to train for
    print_every (int): verbosity control; episode scores are printed every print_every finished episodes

Return

None
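The update step maximizes the clipped surrogate objective min(r_t * A_t, clip(r_t, 1 - clip, 1 + clip) * A_t), where r_t is the ratio of new to old action probabilities. A tiny standalone illustration with made-up numbers:

import torch

clip = 0.2
ratios = torch.tensor([0.7, 1.0, 1.5])   # pi_new / pi_old for three sampled actions
A_k = torch.tensor([1.0, -0.5, 2.0])     # (normalized) advantage estimates

surr1 = ratios * A_k
surr2 = torch.clamp(ratios, 1 - clip, 1 + clip) * A_k
actor_loss = (-torch.min(surr1, surr2)).mean()

print(actor_loss)   # tensor(-0.8667); the clamp caps the credit from the ratio-1.5 sample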

def load(self, env, actor_model)

Load a trained actor model.

Parameters

    env (PortfolioGymEnv): instance of the environment, used to size the network
    actor_model (str): path to the saved actor state dict, e.g. './ppo_actor.pth'
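A short save/load round trip, assuming an agent and environment set up as in the earlier example:

agent.save()                                   # writes ./ppo_actor.pth and ./ppo_critic.pth

new_agent = PPOAgent(env, device='cpu')
new_agent.load(env, './ppo_actor.pth')         # restores the actor weights for inference
weights = new_agent.predict(env.reset())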

def predict(self, state)

Samples an action for the given state and normalizes it into portfolio weights with a softmax. Intended for inference/evaluation rather than for collecting training rollouts.

Parameters

state - the observation at the current timestep

Return

action - the portfolio weights to apply, as a numpy array
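For example, with the agent and environment from the earlier sketch, the returned vector is softmax-normalized and can be used directly as an allocation (the weights shown are illustrative):

import numpy as np

obs = env.reset()
weights = agent.predict(obs)
print(weights, np.isclose(weights.sum(), 1.0))   # e.g. [0.31 0.18 0.27 0.24] True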

def save(self)

Saves the trained actor and critic networks to disk.

    Parameters:
            None
def step(self)

This is where we collect the batch of data from simulation. Since this is an on-policy algorithm, we'll need to collect a fresh batch of data each time we iterate the actor/critic networks.

Parameters

None

Return

batch_obs - the observations collected this batch. Shape: (number of timesteps, dimension of observation)
batch_acts - the actions collected this batch. Shape: (number of timesteps, dimension of action)
batch_log_probs - the log probabilities of each action taken this batch. Shape: (number of timesteps)
batch_rtgs - the Rewards-To-Go of each timestep in this batch. Shape: (number of timesteps)
batch_lens - the lengths of each episode this batch. Shape: (number of episodes)
