Module AssetAllocator.algorithms.PPO.eval_policy
def _log_summary(ep_len, ep_ret, ep_num):
        """
                Print to stdout a summary of the most recent episode.

                Parameters:
                        ep_len - The length of the most recent episode, in timesteps
                        ep_ret - The return (sum of rewards) of the most recent episode
                        ep_num - The index of the most recent episode

                Return:
                        None
        """
        # Round decimal places for more aesthetic logging messages
        ep_len = str(round(ep_len, 2))
        ep_ret = str(round(ep_ret, 2))

        # Print logging statements
        print(flush=True)
        print(f"-------------------- Episode #{ep_num} --------------------", flush=True)
        print(f"Episodic Length: {ep_len}", flush=True)
        print(f"Episodic Return: {ep_ret}", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)
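
For reference, calling _log_summary directly with illustrative values (the numbers below are made up) produces a block like the following:

# Illustrative values only; in this module _log_summary is called by eval_policy
_log_summary(ep_len=252, ep_ret=1.8734, ep_num=0)
# Prints (surrounded by blank lines):
# -------------------- Episode #0 --------------------
# Episodic Length: 252
# Episodic Return: 1.87
# ------------------------------------------------------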
def rollout(policy, env, render):
        """
                Returns a generator to roll out each episode given a trained policy and
                environment to test on. 
                Parameters:
                        policy - The trained policy to test
                        env - The environment to evaluate the policy on
                        render - Specifies whether to render or not
                
                Return:
                        A generator that yields the latest episodic length and return
                        on each iteration.
        """
        # Rollout until user kills process
        while True:
                obs = env.reset()
                done = False
                # number of timesteps so far
                t = 0
                # Logging data
                ep_len = 0            # episodic length
                ep_ret = 0            # episodic return
                while not done:
                        t += 1
                        # Render environment if specified, off by default
                        if render:
                                env.render()
                        # Query deterministic action from policy and run it
                        action = policy(obs).detach().numpy()
                        obs, rew, done, _ = env.step(action)
                        # Sum all episodic rewards as we go along
                        ep_ret += rew
                        
                # Track episodic length
                ep_len = t
                # returns episodic length and return in this iteration
                yield ep_len, ep_ret
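
rollout never terminates on its own, so a caller that wants a fixed-length evaluation has to cut the generator off itself. A minimal sketch, assuming a trained policy and a gym-style env are already in scope:

from itertools import islice

# Evaluate for exactly 10 episodes, then stop; rollout itself would loop forever
for ep_num, (ep_len, ep_ret) in enumerate(islice(rollout(policy, env, render=False), 10)):
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)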
def eval_policy(policy, env, render=False):
        """
                The main function to evaluate our policy with. It will iterate a generator object
                "rollout", which will simulate each episode and return the most recent episode's
                length and return. We can then log it right after. And yes, eval_policy will run
                forever until you kill the process. 
                Parameters:
                        policy - The trained policy to test, basically another name for our actor model
                        env - The environment to test the policy on
                        render - Whether we should render our episodes. False by default.
                Return:
                        None
                NOTE: To learn more about generators, look at rollout's function description
        """
        # Rollout with the policy and environment, and log each episode's data
        for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render)):
                _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
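
Putting the pieces together, a typical caller rebuilds the actor network, loads its trained weights, and passes it to eval_policy along with an environment. The network class, import path, environment id, and checkpoint path below are assumptions for illustration, not names defined in this module:

import gym
import torch

# Assumed actor class and location; substitute whatever network the policy was trained with
from AssetAllocator.algorithms.PPO.network import FeedForwardNN

env = gym.make('Pendulum-v1')                          # assumed environment
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

policy = FeedForwardNN(obs_dim, act_dim)               # assumed constructor signature
policy.load_state_dict(torch.load('./ppo_actor.pth'))  # assumed checkpoint path

eval_policy(policy=policy, env=env, render=True)       # runs until the process is killed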

Functions

- def eval_policy(policy, env, render=False)

  The main function to evaluate our policy with. It iterates the generator object rollout, which simulates each episode and yields the most recent episode's length and return so they can be logged right after. eval_policy runs until you kill the process.
- def rollout(policy, env, render)

  Returns a generator to roll out episodes given a trained policy and an environment to test on. Each iteration of the generator yields the latest episodic length and return.