Module AssetAllocator.algorithms.NAF.replay_buffer

import torch
import numpy as np
import random 
from collections import deque, namedtuple

class ReplayBuffer:
    """
    Fixed-size buffer to store experience tuples.
    
    Original paper can be found at https://arxiv.org/abs/1906.04594

    This implementation was adapted from https://github.com/BY571/Normalized-Advantage-Function-NAF-
    
    """

    def __init__(self, buffer_size, batch_size, device, seed, gamma):
        """Initialize a ReplayBuffer object.

        Params
        ======
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            device (str or torch.device): device on which sampled tensors are placed
            seed (int): random seed
            gamma (float): discount factor used when folding stored rewards into a return
        """
        self.device = device
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.gamma = gamma

        # Short window for multistep returns; with maxlen=1 it holds only the
        # most recent transition, so the return reduces to a single step.
        self.n_step_buffer = deque(maxlen=1)
    
    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory.

        Params
        =====
        state (array_like): current state
        action (array_like): current action
        reward (array_like): reward for current state and action pair
        next_state (array_like): next state
        done (array_like): episode termination flag for the transition
        """

        self.n_step_buffer.append((state, action, reward, next_state, done))
        # Once the n-step window is full (a single transition here), fold it
        # into a discounted return and store the resulting experience.
        if len(self.n_step_buffer) == 1:
            state, action, reward, next_state, done = self.calc_multistep_return()

            e = self.experience(state, action, reward, next_state, done)
            self.memory.append(e)
    
    def calc_multistep_return(self):
        """
        Helper method to compute the discounted multistep return.

        Because n_step_buffer holds a single transition (maxlen=1), the
        computed return is simply the immediate reward.
        """
        Return = 0
        for idx in range(len(self.n_step_buffer)):
            Return += self.gamma**idx * self.n_step_buffer[idx][2]
        
        return self.n_step_buffer[0][0], self.n_step_buffer[0][1], Return, self.n_step_buffer[-1][3], self.n_step_buffer[-1][4]
    
    
    def sample(self):
        """
        Randomly sample a batch of experiences from memory.
        """
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.stack([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
  
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """
        Return the current size of internal memory.
        """
        return len(self.memory)

Classes

class ReplayBuffer (buffer_size, batch_size, device, seed, gamma)

Fixed-size buffer to store experience tuples.

Original paper can be found at https://arxiv.org/abs/1906.04594

This implementation was adapted from https://github.com/BY571/Normalized-Advantage-Function-NAF-

Initialize a ReplayBuffer object.

Params

buffer_size (int): maximum size of buffer
batch_size (int): size of each training batch
device (str or torch.device): device on which sampled tensors are placed
seed (int): random seed
gamma (float): discount factor used when folding stored rewards into a return
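
As a quick orientation, here is a minimal construction sketch; the hyperparameter values (buffer size, batch size, seed, discount factor) are illustrative placeholders, not values prescribed by this module:

import torch
from AssetAllocator.algorithms.NAF.replay_buffer import ReplayBuffer

# Illustrative hyperparameters only; pick values to suit your experiment.
device = torch.device("cpu")
buffer = ReplayBuffer(buffer_size=100_000, batch_size=64, device=device, seed=0, gamma=0.99)
print(len(buffer))  # 0 until experiences have been added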

Methods

def add(self, state, action, reward, next_state, done)

Add a new experience to memory.

Params

state (array_like): current state
action (array_like): current action
reward (array_like): reward for current state and action pair
next_state (array_like): next state
done (array_like): episode termination flag for the transition
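
A short sketch of how add is typically driven from an environment loop; the env object, its Gym-style reset/step API, and action_dim are assumptions for illustration:

import numpy as np

state = env.reset()  # `env` and its Gym-style API are assumed for this sketch
for _ in range(1000):
    action = np.random.uniform(-1.0, 1.0, size=action_dim)  # placeholder policy
    next_state, reward, done, _ = env.step(action)
    buffer.add(state, action, reward, next_state, done)
    state = env.reset() if done else next_state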

def calc_multistep_return(self)

Helper method to compute the discounted multistep return; with the internal single-transition window this reduces to the immediate reward.
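
For reference, a standalone sketch of the same computation over a general n-step window; in this module the window is capped at one transition, so the stored return equals the raw reward:

def n_step_return(transitions, gamma):
    # transitions: iterable of (state, action, reward, next_state, done) tuples
    return sum(gamma ** idx * t[2] for idx, t in enumerate(transitions))

# With a single transition, the discounted return is just its reward:
# n_step_return([(s, a, r, s2, d)], gamma=0.99) == r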

def sample(self)

Randomly sample a batch of experiences from memory.
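
A sketch of drawing a training batch once enough transitions have been stored; the dtypes noted in the comments mirror the casts in the source (actions come back as long tensors, the rest as float):

if len(buffer) >= buffer.batch_size:
    states, actions, rewards, next_states, dones = buffer.sample()
    # states, next_states: float tensors stacked along the batch dimension
    # rewards, dones:      float column vectors of shape (batch_size, 1)
    # actions:             long tensor, per the cast in sample()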
