Module AssetAllocator.algorithms.REINFORCE.reinforce_continuous

import sys
import math

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.utils as utils
import torchvision.transforms as T
from torch.autograd import Variable

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

pi = Variable(torch.FloatTensor([math.pi])).to(device)


def normal(x, mu, sigma_sq):
    """Gaussian probability density of x under mean mu and variance sigma_sq."""
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi.expand_as(sigma_sq)).sqrt()
    return a*b


class Policy(nn.Module):
    """This is the policy network for the REINFORCCE Agent.

    This implementation was adapted from https://github.com/chingyaoc/pytorch-REINFORCE
    
    """
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]

        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)
        self.linear2_ = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2(x)
        sigma_sq = self.linear2_(x)

        return mu, sigma_sq


class REINFORCE:
    
    def __init__(self, hidden_size, num_inputs, action_space):
        """Initializes the REINFORCE Agent

        Args:
            hidden_size (int): Size of hidden layer neurons in the policy network
            num_inputs (int): Number of input features fed to the first layer
            action_space (gym.spaces.Box): Action space of the gym environment
        """      
        self.action_space = action_space
        self.model = Policy(hidden_size, num_inputs, action_space)
        self.model = self.model.to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        """Takes in current environment's state and returns the agent's action

        Args:
            state (array_like): Current environment state

        Returns:
            action: Agent's action
        """
        mu, sigma_sq = self.model(Variable(state).to(device))
        sigma_sq = F.softplus(sigma_sq)

        eps = torch.randn(mu.size())
        # sample an action: mu + sigma * eps (a Gaussian draw, detached from the graph)
        action = (mu + sigma_sq.sqrt()*Variable(eps).to(device)).data
        # probability density of the sampled action under N(mu, sigma_sq)
        prob = normal(action, mu, sigma_sq)
        # differential entropy of a Gaussian: 0.5 * (log(2 * pi * sigma_sq) + 1)
        entropy = 0.5*((2*pi.expand_as(sigma_sq)*sigma_sq).log()+1)

        log_prob = prob.log()
        return action, log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        """Takes in previous reward, log probabilities, entropies and noise to update the policy network's parameters

        Args:
            rewards (array_like): rewards from previous step
            log_probs
            entropies
            gamma

        Returns:
            action: Agent's action
        """
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            # discounted return from step i to the end of the episode
            R = gamma * R + rewards[i]
            # REINFORCE loss: -log_prob * return, plus a small entropy bonus
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).to(
                device)).sum() - (0.0001*entropies[i].to(device)).sum()
        loss = loss / len(rewards)

        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm_(self.model.parameters(), 40)
        self.optimizer.step()

Functions

def normal(x, mu, sigma_sq)

Computes the Gaussian probability density of x under mean mu and variance sigma_sq.
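
As a quick check (not part of the module, and assuming the package is importable under the module path above), the sketch below compares normal against torch.distributions.Normal, which is parameterised by standard deviation rather than variance:

import torch
from AssetAllocator.algorithms.REINFORCE.reinforce_continuous import normal, device

mu = torch.tensor([0.5], device=device)         # mean
sigma_sq = torch.tensor([0.25], device=device)  # variance (not standard deviation)
x = torch.tensor([0.3], device=device)

density = normal(x, mu, sigma_sq)
reference = torch.distributions.Normal(mu, sigma_sq.sqrt()).log_prob(x).exp()
print(torch.allclose(density, reference))       # expected: True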

Classes

class Policy (hidden_size, num_inputs, action_space)

This is the policy network for the REINFORCE Agent.

This implementation was adapted from https://github.com/chingyaoc/pytorch-REINFORCE

Initializes internal Module state, shared by both nn.Module and ScriptModule.


Ancestors

  • torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, inputs)

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.

For Policy, forward maps an input observation to mu, the action mean, and sigma_sq, the raw (pre-softplus) variance of the action distribution.

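A minimal usage sketch (not part of the module; the 5-feature observation and the 3-dimensional gym.spaces.Box action space are made-up sizes). Policy is normally constructed by the REINFORCE agent, but it can be exercised directly:

import torch
from gym.spaces import Box
from AssetAllocator.algorithms.REINFORCE.reinforce_continuous import Policy, device

action_space = Box(low=0.0, high=1.0, shape=(3,))   # hypothetical 3-asset weights
policy = Policy(hidden_size=64, num_inputs=5, action_space=action_space).to(device)

state = torch.rand(1, 5, device=device)             # batched observation
mu, sigma_sq = policy(state)                         # mean and raw variance, each (1, 3)
print(mu.shape, sigma_sq.shape)
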
class REINFORCE (hidden_size, num_inputs, action_space)

Initializes the REINFORCE Agent

Args

hidden_size : int
Size of hidden layer neurons in the policy network
num_inputs : int
Number of input features fed to the first layer
action_space : gym.spaces.Box
Action space of the gym environment
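
A minimal construction sketch (not part of the module; the sizes and the gym.spaces.Box space are made up for illustration):

from gym.spaces import Box
from AssetAllocator.algorithms.REINFORCE.reinforce_continuous import REINFORCE

action_space = Box(low=0.0, high=1.0, shape=(3,))   # hypothetical 3-dimensional action space
agent = REINFORCE(hidden_size=128, num_inputs=10, action_space=action_space)

The policy network is placed on the GPU automatically when one is available (see the module-level device).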

Methods

def select_action(self, state)

Samples an action from the policy for the given environment state

Args

state : torch.Tensor
Current environment state of shape (1, num_inputs)

Returns

action, log_prob, entropy
The sampled action, its log probability under the policy, and the entropy of the action distribution
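
A short sketch (not part of the module; the sizes and action space are made up). The batched state shape (1, num_inputs) matches what update_parameters later expects when it broadcasts the return over the log probabilities:

import torch
from gym.spaces import Box
from AssetAllocator.algorithms.REINFORCE.reinforce_continuous import REINFORCE

agent = REINFORCE(hidden_size=128, num_inputs=10,
                  action_space=Box(low=0.0, high=1.0, shape=(3,)))

state = torch.rand(1, 10)                      # batched observation of shape (1, num_inputs)
action, log_prob, entropy = agent.select_action(state)
print(action.shape)                            # torch.Size([1, 3])
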
def update_parameters(self, rewards, log_probs, entropies, gamma)

Updates the policy network's parameters from one episode of experience

Args

rewards : list
Rewards collected at each step of the episode
log_probs : list
Log probabilities of the actions taken at each step
entropies : list
Entropies of the action distribution at each step
gamma : float
Discount factor
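
A hedged end-to-end sketch (not part of the module): random rewards stand in for a real environment, just to show how the lists collected during an episode feed into update_parameters.

import torch
from gym.spaces import Box
from AssetAllocator.algorithms.REINFORCE.reinforce_continuous import REINFORCE

agent = REINFORCE(hidden_size=128, num_inputs=10,
                  action_space=Box(low=0.0, high=1.0, shape=(3,)))

rewards, log_probs, entropies = [], [], []
for step in range(50):                               # one hypothetical episode
    state = torch.rand(1, 10)                        # placeholder observation
    action, log_prob, entropy = agent.select_action(state)
    rewards.append(float(torch.randn(1)))            # placeholder reward from the "environment"
    log_probs.append(log_prob)
    entropies.append(entropy)

# One REINFORCE update over the whole episode with discount factor 0.99
agent.update_parameters(rewards, log_probs, entropies, gamma=0.99)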