Module AssetAllocator.algorithms.TRPO.agent
Expand source code
from itertools import count
import signal
import sys
import os
import time
sys.path.append('..')
import numpy as np
import gym
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import scipy.optimize
import matplotlib.pyplot as plt
from .value import Value
from .policy import Policy
from .utils import *
from .trpo import trpo_step
class TRPOAgent:
"""
    This is the agent class for the Trust Region Policy Optimization algorithm.
Original paper can be found at https://arxiv.org/abs/1502.05477
This implementation was adapted from https://github.com/MEfeTiryaki/trpo/blob/master/train.py
"""
def __init__(
self,
env,
device = 'cuda',
damping=0.1,
episode_length= 2000,
fisher_ratio=1,
gamma=0.995,
l2_reg=0.001,
lambda_=0.97,
lr=0.001,
max_iteration_number=200,
max_kl=0.01,
save=False,
seed=543,
val_opt_iter=200,
value_memory=1,
value_memory_shuffle=False):
"""Initialize a TRPOAgent object.
Params
======
            env (PortfolioGymEnv): instance of the environment
            device (str): device type ('cuda' or 'cpu')
            damping (float): damping term used in the TRPO policy update (policy optimization parameter)
            episode_length (int): maximum number of steps per episode
            fisher_ratio (float): policy optimization parameter
            gamma (float): discount factor
            l2_reg (float): L2 regularization coefficient for the value-function regression
            lambda_ (float): GAE (generalized advantage estimation) parameter
            lr (float): learning rate for the value-function optimizer
            max_iteration_number (int): maximum number of policy iterations
            max_kl (float): maximum KL divergence allowed per policy update (policy optimization parameter)
            save (bool): if True, save network parameters on SIGINT
            seed (int): random seed
            val_opt_iter (int): number of iterations for value-function learning
            value_memory (int): number of previous batches kept as extra data for value-function fitting
            value_memory_shuffle (bool): if True, shuffle the retained value-fitting data
"""
self.env = env
self.device = device
self.damping = damping
self.episode_length = episode_length
self.fisher_ratio = fisher_ratio
self.gamma = gamma
self.l2_reg = l2_reg
self.lambda_ = lambda_
self.lr = lr
self.max_iteration_number = max_iteration_number
self.max_kl = max_kl
self.val_opt_iter = val_opt_iter
self.save = save
self.seed = seed
self.value_memory = value_memory
self.value_memory_shuffle = value_memory_shuffle
self.state_dim = env.observation_space.shape[0]
self.action_dim = env.action_space.shape[-1]
self.policy_net = Policy(self.state_dim, self.action_dim, 256, device).to(device)
self.value_net = Value(self.state_dim, 256).to(device)
    def signal_handler(self, sig, frame):
        """
        Signal handler that saves the networks when shutting down via Ctrl+C.
        Parameters:
        ==========
            sig: signal number
            frame: current stack frame
        """
        if self.save:
            # Note: self.load_dir must be set on the agent before saving is possible.
            valueParam = get_flat_params_from(self.value_net)
            policyParam = get_flat_params_from(self.policy_net)
            saveParameterCsv(valueParam, self.load_dir + "/ValueNet")
            saveParameterCsv(policyParam, self.load_dir + "/PolicyNet")
            print("Networks are saved in " + self.load_dir + "/")
        print('Closing!!')
        self.env.close()
        sys.exit(0)
def prepare_data(self, batch, valueBatch, previousBatch):
"""
Get the batch data and calculate value,return and generalized advantage
Parameters:
==========
batch
valueBatch
previousBatch
"""
stateList = [ torch.from_numpy(np.concatenate(x,axis=0)).to(self.device) for x in batch["states"]]
actionsList = [torch.from_numpy(np.concatenate(x,axis=0)).to(self.device) for x in batch["actions"]]
for states in stateList:
value = self.value_net.forward(states)
batch["values"].append(value)
advantagesList = []
returnsList = []
rewardsList = []
for rewards,values,masks in zip(batch["rewards"],batch["values"],batch["mask"]):
returns = torch.Tensor(len(rewards),1).to(self.device)
advantages = torch.Tensor(len(rewards),1).to(self.device)
deltas = torch.Tensor(len(rewards),1).to(self.device)
prev_return = 0
prev_value = 0
prev_advantage = 0
for i in reversed(range(len(rewards))):
                returns[i] = rewards[i] + self.gamma * prev_value * masks[i]  # TD(0) target (bootstrapped from the next state's value)
                # returns[i] = rewards[i] + self.gamma * prev_return * masks[i]  # Monte Carlo return (alternative)
deltas[i] = rewards[i] + self.gamma * prev_value * masks[i]- values.data[i]
advantages[i] = deltas[i] + self.gamma * self.lambda_* prev_advantage* masks[i]
prev_return = returns[i, 0]
prev_value = values.data[i, 0]
prev_advantage = advantages[i, 0]
returnsList.append(returns)
advantagesList.append(advantages)
rewardsList.append(torch.Tensor(rewards).to(self.device))
batch["states"] = torch.cat(stateList,0)
batch["actions"] = torch.cat(actionsList,0)
batch["rewards"] = torch.cat(rewardsList,0)
batch["returns"] = torch.cat(returnsList,0)
advantagesList = torch.cat(advantagesList,0)
batch["advantages"] = (advantagesList- advantagesList.mean()) / advantagesList.std()
valueBatch["states"] = torch.cat(( previousBatch["states"],batch["states"]),0)
valueBatch["targets"] = torch.cat((previousBatch["returns"],batch["returns"]),0)
def update_policy(self, batch):
"""
Get advantage , states and action and calls trpo step
Parameters:
batch (dict of arrays of numpy)
Returns:
"""
advantages = batch["advantages"]
states = batch["states"]
actions = batch["actions"]
trpo_step(self.policy_net, states,actions,advantages , self.max_kl, self.damping)
def update_value(self, valueBatch):
"""
Get valueBatch and run adam optimizer to learn value function
Parameters:
valueBatch (dict of arrays of numpy)
Returns:
"""
# shuffle the data
dataSize = valueBatch["targets"].size()[0]
permutation = torch.randperm(dataSize)
input = valueBatch["states"][permutation]
target = valueBatch["targets"][permutation]
iter = self.val_opt_iter
batchSize = int(dataSize/ iter)
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(self.value_net.parameters(), lr=self.lr)
for t in range(iter):
prediction = self.value_net(input[t*batchSize:t*batchSize+batchSize])
loss = loss_fn(prediction, target[t*batchSize:t*batchSize+batchSize])
            # Uncomment to print the loss every 100 iterations for debugging:
            # if t % 100 == 0:
            #     print("\t%f" % loss.data)
optimizer.zero_grad()
loss.backward()
optimizer.step()
def save_to_previousBatch(self, previousBatch, batch):
"""
Save previous batch to use in future value optimization
Parameters:
previousBatch (dict of arrays of numpy)
batch (dict of arrays of numpy)
"""
if self.value_memory<0:
print("Value memory should be equal or greater than zero")
elif self.value_memory>0:
if previousBatch["returns"].size() == 0:
previousBatch= {"states":batch["states"],
"returns":batch["returns"]}
else:
previous_size = previousBatch["returns"].size()[0]
size = batch["returns"].size()[0]
if previous_size/size == self.value_memory:
previousBatch["states"] = torch.cat([previousBatch["states"][size:],batch["states"]],0)
previousBatch["returns"] = torch.cat([previousBatch["returns"][size:],batch["returns"]],0)
else:
previousBatch["states"] = torch.cat([previousBatch["states"],batch["states"]],0)
previousBatch["returns"] = torch.cat([previousBatch["returns"],batch["returns"]],0)
if self.value_memory_shuffle:
permutation = torch.randperm(previousBatch["returns"].size()[0])
previousBatch["states"] = previousBatch["states"][permutation]
previousBatch["returns"] = previousBatch["returns"][permutation]
def calculate_loss(self, reward_sum_mean,reward_sum_std,test_number = 10):
"""
Calculate mean cummulative reward for test_nubmer of trials
Parameters:
reward_sum_mean (list): holds the history of the means.
reward_sum_std (list): holds the history of the std.
Returns:
list: new value appended means
list: new value appended stds
"""
rewardSum = []
for i in range(test_number):
state = self.env.reset()
rewardSum.append(0)
for t in range(self.episode_length):
state, reward, done, _ = self.env.step(self.policy_net.get_action(state)[0] )
state = np.transpose(state)
rewardSum[-1] += reward
if done:
break
reward_sum_mean.append(np.array(rewardSum).mean())
reward_sum_std.append(np.array(rewardSum).std())
return reward_sum_mean, reward_sum_std
def predict(self, state):
"""
        Queries an action from the actor network; intended to be called at every environment step.
Parameters:
state - the observation at the current timestep
Return:
action - the action to take, as a numpy array
"""
action = self.policy_net.get_action(state)[0]
return action
def learn(self, timesteps, print_every = 1):
"""
Trains the agent
Params
======
timesteps (int): Number of timesteps the agent should interact with the environment
print_every (int): Verbosity control
"""
signal.signal(signal.SIGINT, self.signal_handler)
time_start = time.time()
reward_sum_mean,reward_sum_std = [], []
previousBatch= {"states":torch.Tensor(0).to(self.device) ,
"returns":torch.Tensor(0).to(self.device)}
reward_sum_mean,reward_sum_std = self.calculate_loss(reward_sum_mean,reward_sum_std)
#print("Initial loss \n\tloss | mean : %6.4f / std : %6.4f"%(reward_sum_mean[-1],reward_sum_std[-1]) )
num_steps = 0
flag = False
count_of_dones = 0
max_iteration_number = timesteps//self.env.episode_length + 1
for i_episode in range(max_iteration_number):
time_episode_start = time.time()
# reset batches
batch = {"states":[] ,
"actions":[],
"next_states":[] ,
"rewards":[],
"returns":[],
"values":[],
"advantages":[],
"mask":[]}
valueBatch = {"states" :[],
"targets" : []}
# while num_steps < self.batch_size:
done = False
while not done:
state = self.env.reset()
reward_sum = 0
states,actions,rewards,next_states,masks = [],[],[],[],[]
steps = 0
for t in range(self.env.episode_length):
action = self.policy_net.get_action(state)[0] # agent
next_state, reward, done, info = self.env.step(action)
next_state = np.transpose(next_state)
mask = 0 if done else 1
masks.append(mask)
states.append(state)
actions.append(action)
next_states.append(next_state)
rewards.append(reward)
state = next_state
reward_sum += reward
num_steps+=1
if done:
count_of_dones += 1
flag = True
if flag and count_of_dones % print_every == 0:
print(f'Score at timestep {num_steps}: {reward_sum}.')
flag = False
if num_steps >= timesteps:
break
batch["states"].append(np.expand_dims(states, axis=1) )
batch["actions"].append(actions)
batch["next_states"].append(np.expand_dims(next_states, axis=1))
batch["rewards"].append(rewards)
batch["mask"].append(masks)
#num_steps += steps
self.prepare_data(batch,valueBatch,previousBatch)
            self.update_policy(batch)  # Update the policy first, before refitting the value function, to avoid overfitting
self.update_value(valueBatch)
self.save_to_previousBatch(previousBatch,batch)
#print("episode %d | total: %.4f "%( i_episode, time.time()-time_episode_start))
reward_sum_mean,reward_sum_std = self.calculate_loss(reward_sum_mean,reward_sum_std)
#print("loss | mean : %6.4f / std : %6.4f"%(reward_sum_mean[-1],reward_sum_std[-1]))
Classes
class TRPOAgent (env, device='cuda', damping=0.1, episode_length=2000, fisher_ratio=1, gamma=0.995, l2_reg=0.001, lambda_=0.97, lr=0.001, max_iteration_number=200, max_kl=0.01, save=False, seed=543, val_opt_iter=200, value_memory=1, value_memory_shuffle=False)
-
This is the agent class for the Trust Region Policy Optimization algorithm.
Original paper can be found at https://arxiv.org/abs/1502.05477
This implementation was adapted from https://github.com/MEfeTiryaki/trpo/blob/master/train.py
Initialize a TRPOAgent object.
Params
env (PortfolioGymEnv): instance of the environment
device (str): device type ('cuda' or 'cpu')
damping (float): damping term used in the TRPO policy update (policy optimization parameter)
episode_length (int): maximum number of steps per episode
fisher_ratio (float): policy optimization parameter
gamma (float): discount factor
l2_reg (float): L2 regularization coefficient for the value-function regression
lambda_ (float): GAE (generalized advantage estimation) parameter
lr (float): learning rate for the value-function optimizer
max_iteration_number (int): maximum number of policy iterations
max_kl (float): maximum KL divergence allowed per policy update (policy optimization parameter)
save (bool): if True, save network parameters on SIGINT
seed (int): random seed
val_opt_iter (int): number of iterations for value-function learning
value_memory (int): number of previous batches kept as extra data for value-function fitting
value_memory_shuffle (bool): if True, shuffle the retained value-fitting data
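A minimal usage sketch, assuming a gym-style portfolio environment is available. make_portfolio_env below is a hypothetical factory standing in for however you construct your PortfolioGymEnv; learn additionally expects the environment to expose an episode_length attribute.

import torch
from AssetAllocator.algorithms.TRPO.agent import TRPOAgent

env = make_portfolio_env()  # hypothetical: any PortfolioGymEnv instance works here
device = 'cuda' if torch.cuda.is_available() else 'cpu'

agent = TRPOAgent(env, device=device, gamma=0.995, max_kl=0.01)
agent.learn(timesteps=100_000, print_every=10)  # interact for roughly 100k environment steps

state = env.reset()
action = agent.predict(state)  # action returned as a numpy array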
Methods
def calculate_loss(self, reward_sum_mean, reward_sum_std, test_number=10)
-
Calculate the mean cumulative reward over test_number evaluation episodes.
Parameters:
reward_sum_mean (list): holds the history of the means.
reward_sum_std (list): holds the history of the standard deviations.
test_number (int): number of evaluation episodes to run.
Returns:
list: means with the new value appended
list: standard deviations with the new value appended
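The evaluation protocol boils down to the following standalone sketch (names are illustrative; the real method also appends the statistics to the running history lists it receives and transposes the observation returned by this particular environment):

import numpy as np

def evaluate(agent, env, episode_length, test_number=10):
    # Run test_number episodes with the current policy and report the
    # mean and standard deviation of the cumulative rewards.
    episode_returns = []
    for _ in range(test_number):
        state = env.reset()
        total_reward = 0.0
        for _ in range(episode_length):
            state, reward, done, _ = env.step(agent.predict(state))
            total_reward += reward
            if done:
                break
        episode_returns.append(total_reward)
    return np.mean(episode_returns), np.std(episode_returns)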
def learn(self, timesteps, print_every=1)
-
Trains the agent
Params
timesteps (int): Number of timesteps the agent should interact with the environment
print_every (int): Verbosity control
def predict(self, state)
-
Queries an action from the actor network; intended to be called at every environment step.
Parameters
state - the observation at the current timestep
Return
action - the action to take, as a numpy array
def prepare_data(self, batch, valueBatch, previousBatch)
-
Take the batch data and compute values, returns, and generalized advantage estimates.
Parameters:
batch: dict of rollout data collected by learn()
valueBatch: dict that will receive the states/targets for value-function fitting
previousBatch: dict of retained states/returns from earlier batches
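The return/advantage computation is the standard GAE(λ) backward recursion. A self-contained NumPy sketch of the same pass (function and variable names are illustrative, not part of the module):

import numpy as np

def compute_gae(rewards, values, masks, gamma=0.995, lambda_=0.97):
    # Backward pass over one trajectory:
    #   delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    #   A_t     = delta_t + gamma * lambda * mask_t * A_{t+1}
    # mask_t is 0 at terminal steps, so nothing leaks across episode ends.
    T = len(rewards)
    advantages = np.zeros(T)
    returns = np.zeros(T)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * lambda_ * masks[t] * prev_advantage
        returns[t] = rewards[t] + gamma * prev_value * masks[t]  # TD(0) target, as in prepare_data
        prev_value = values[t]
        prev_advantage = advantages[t]
    # prepare_data additionally normalizes advantages to zero mean / unit std.
    return returns, (advantages - advantages.mean()) / advantages.std()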
def save_to_previousBatch(self, previousBatch, batch)
-
Save previous batch data to reuse in future value-function optimization.
Parameters:
previousBatch (dict of numpy arrays)
batch (dict of numpy arrays)
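Conceptually, value_memory maintains a rolling window of the most recent batches as extra regression data for the value function. A simplified sketch of that bookkeeping (illustrative names; the real method also supports optional shuffling via value_memory_shuffle):

import torch
from collections import deque

value_memory = 1
history = deque(maxlen=value_memory)  # each entry: (states, returns) from one past batch

def build_value_batch(batch_states, batch_returns):
    # Combine the retained batches with the current one, then slide the window.
    old_states = [s for s, _ in history]
    old_returns = [r for _, r in history]
    states = torch.cat(old_states + [batch_states], dim=0)
    targets = torch.cat(old_returns + [batch_returns], dim=0)
    history.append((batch_states, batch_returns))
    return states, targets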
def signal_handler(self, sig, frame)
-
Signal handler that saves the networks when shutting down via Ctrl+C.
Parameters:
sig: signal number
frame: current stack frame
def update_policy(self, batch)
-
Takes the advantages, states, and actions from the batch and performs one TRPO policy step.
Parameters:
batch (dict): prepared batch containing "states", "actions", and "advantages" tensors
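For reference, trpo_step addresses the trust-region problem from the TRPO paper linked above (this is the textbook formulation; the exact solver internals live in the .trpo module):

\[
\max_{\theta}\ \mathbb{E}_{s,a\sim\pi_{\theta_{\text{old}}}}\!\left[\frac{\pi_{\theta}(a\mid s)}{\pi_{\theta_{\text{old}}}(a\mid s)}\,\hat{A}(s,a)\right]
\quad\text{s.t.}\quad
\mathbb{E}_{s}\!\left[D_{\mathrm{KL}}\big(\pi_{\theta_{\text{old}}}(\cdot\mid s)\,\big\|\,\pi_{\theta}(\cdot\mid s)\big)\right]\le \texttt{max\_kl}
\]

where \(\hat{A}\) are the normalized advantages prepared above; the damping argument is typically added to the Fisher-vector products used when solving this constrained step.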
def update_value(self, valueBatch)
-
Takes the value batch and runs the Adam optimizer to fit the value function.
Parameters:
valueBatch (dict): contains "states" and "targets" tensors for the regression
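The value update is plain minibatch regression. A standalone sketch of the same scheme (the minibatch size is data_size // val_opt_iter, so each of the val_opt_iter Adam steps sees a different contiguous slice of the shuffled data; names are illustrative):

import torch

def fit_value_function(value_net, states, targets, iters=200, lr=1e-3):
    # Shuffle once, then take one contiguous minibatch per optimizer step.
    optimizer = torch.optim.Adam(value_net.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss(reduction='sum')
    perm = torch.randperm(targets.size(0))
    states, targets = states[perm], targets[perm]
    batch_size = targets.size(0) // iters
    for t in range(iters):
        sl = slice(t * batch_size, (t + 1) * batch_size)
        loss = loss_fn(value_net(states[sl]), targets[sl])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()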