Module AssetAllocator.algorithms.PPO.eval_policy

def _log_summary(ep_len, ep_ret, ep_num):
        """
                Print to stdout what we've logged so far in the most recent episode.

                Parameters:
                        ep_len - The most recent episodic length, in timesteps
                        ep_ret - The most recent episodic return
                        ep_num - The current episode number

                Return:
                        None
        """
        # Round decimal places for more aesthetic logging messages
        ep_len = str(round(ep_len, 2))
        ep_ret = str(round(ep_ret, 2))

        # Print logging statements
        print(flush=True)
        print(f"-------------------- Episode #{ep_num} --------------------", flush=True)
        print(f"Episodic Length: {ep_len}", flush=True)
        print(f"Episodic Return: {ep_ret}", flush=True)
        print(f"------------------------------------------------------", flush=True)
        print(flush=True)

def rollout(policy, env, render):
        """
                Returns a generator to roll out each episode given a trained policy and
                environment to test on. 

                Parameters:
                        policy - The trained policy to test
                        env - The environment to evaluate the policy on
                        render - Specifies whether to render or not
                
                Return:
                        A generator that yields the latest episodic length and return
                        on each iteration.
        """
        # Rollout until user kills process
        while True:
                obs = env.reset()
                done = False

                # number of timesteps so far
                t = 0

                # Logging data
                ep_len = 0            # episodic length
                ep_ret = 0            # episodic return

                while not done:
                        t += 1

                        # Render environment if specified, off by default
                        if render:
                                env.render()

                        # Query deterministic action from policy and run it
                        action = policy(obs).detach().numpy()
                        obs, rew, done, _ = env.step(action)

                        # Sum all episodic rewards as we go along
                        ep_ret += rew
                        
                # Track episodic length
                ep_len = t

                # returns episodic length and return in this iteration
                yield ep_len, ep_ret

def eval_policy(policy, env, render=False):
        """
                The main function to evaluate our policy with. It will iterate a generator object
                "rollout", which will simulate each episode and return the most recent episode's
                length and return. We can then log it right after. And yes, eval_policy will run
                forever until you kill the process. 

                Parameters:
                        policy - The trained policy to test, basically another name for our actor model
                        env - The environment to test the policy on
                        render - Whether we should render our episodes. False by default.

                Return:
                        None

                NOTE: To learn more about generators, look at rollout's function description
        """
        # Rollout with the policy and environment, and log each episode's data
        for ep_num, (ep_len, ep_ret) in enumerate(rollout(policy, env, render)):
                _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)

Functions

def eval_policy(policy, env, render=False)

The main function to evaluate our policy with. It will iterate a generator object "rollout", which will simulate each episode and return the most recent episode's length and return. We can then log it right after. And yes, eval_policy will run forever until you kill the process.

Parameters

policy - The trained policy to test, basically another name for our actor model
env - The environment to test the policy on
render - Whether we should render our episodes. False by default.

Return

None

NOTE: To learn more about generators, look at rollout's function description
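
Below is one way the function might be wired up, shown for context. This is a minimal sketch, not part of the module: the Actor class, the checkpoint name "ppo_actor.pth", and the Pendulum environment are all placeholder assumptions (the real actor model lives elsewhere in AssetAllocator.algorithms.PPO). The sketch also assumes the classic Gym step API, i.e. env.step returns (obs, rew, done, info), which is what rollout unpacks.

import gym
import torch
import torch.nn as nn

# Placeholder actor network; the real one is defined elsewhere in the package
class Actor(nn.Module):
        def __init__(self, obs_dim, act_dim):
                super().__init__()
                self.net = nn.Sequential(
                        nn.Linear(obs_dim, 64),
                        nn.ReLU(),
                        nn.Linear(64, act_dim),
                )

        def forward(self, obs):
                # rollout passes raw observations straight from the environment,
                # so convert NumPy arrays to tensors here
                if not isinstance(obs, torch.Tensor):
                        obs = torch.tensor(obs, dtype=torch.float)
                return self.net(obs)

env = gym.make("Pendulum-v1")
policy = Actor(env.observation_space.shape[0], env.action_space.shape[0])
policy.load_state_dict(torch.load("ppo_actor.pth"))  # assumed checkpoint name
policy.eval()

eval_policy(policy=policy, env=env, render=False)  # runs until you kill the process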

def rollout(policy, env, render)

Returns a generator to roll out each episode given a trained policy and environment to test on.

Parameters

policy - The trained policy to test
env - The environment to evaluate the policy on
render - Specifies whether to render or not

Return

A generator that yields the latest episodic length and return on each iteration.
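
Because rollout loops forever, a caller that wants a bounded evaluation can slice the generator directly instead of going through eval_policy. A minimal sketch, assuming policy and env are set up as in the example above:

from itertools import islice

# rollout is an infinite generator; islice stops it after 10 episodes
for ep_num, (ep_len, ep_ret) in enumerate(islice(rollout(policy, env, render=False), 10)):
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)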
