Source code for prl.agents.agents

from abc import ABC, abstractmethod
from copy import deepcopy
from functools import reduce
from operator import iadd

import numpy as np

from prl.callbacks.callbacks import CallbackHandler
from prl.environments.environments import Environment
from prl.storage import History, Memory, Storage
from prl.typing import (
    EnvironmentABC,
    FunctionApproximatorABC,
    AgentABC,
    AdvantageABC,
    State,
    Action,
)
from prl.utils import timeit, agent_logger


class Agent(AgentABC, ABC):
    """Base class for all agents"""

    def __init__(self):
        self.step_count = 0
        self.iteration_count = 0
        self.episode_count = 0
        self._actual_reward_count = 0
        self._actual_episode_length = 0

    @property
    @abstractmethod
    def id(self) -> str:
        """Agent UUID"""

    @timeit
    def train(
        self, env: Environment, n_iterations: int, callback_list: list = None, **kwargs
    ):
        """Trains the agent on the given environment and handles callbacks during
        training.

        Args:
            env: Environment to train on
            n_iterations: Maximum number of iterations to train
            callback_list: List of callbacks
            kwargs: Other arguments passed to `train_iteration`, `pre_train_setup`
                and `post_train_cleanup`
        """
        agent_logger.add("agent_step", self.step_count)
        agent_logger.add("agent_iteration", self.iteration_count)
        agent_logger.add("agent_episode", self.episode_count)
        callback_list = callback_list or []
        callback_handler = CallbackHandler(callback_list, env)

        self.pre_train_setup(env, **kwargs)
        callback_handler.on_training_begin(self)
        for i in range(n_iterations):
            self.train_iteration(env=env, **kwargs)
            self.iteration_count += 1
            agent_logger.add("agent_iteration", self.iteration_count)
            if callback_handler.on_iteration_end(self):
                break
        self.post_train_cleanup(env, **kwargs)
        callback_handler.on_training_end(self)

    @abstractmethod
    def train_iteration(self, env: Environment, **kwargs):
        """Performs a single training iteration.

        This method should contain the repeatable part of training an agent.

        Args:
            env: Environment
            **kwargs: Kwargs passed from the train() method
        """

    def pre_train_setup(self, env: Environment, **kwargs):
        """Performs pre-training setup.

        This method should handle the non-repeatable part of training an agent.

        Args:
            env: Environment
            **kwargs: Kwargs passed from the train() method
        """

    def post_train_cleanup(self, env: Environment, **kwargs):
        """Cleans up fields that are no longer needed after training, to keep the
        agent lightweight.

        Args:
            env: Environment
            **kwargs: Kwargs passed from the train() method
        """

    @abstractmethod
    def act(self, state: State) -> Action:
        """Selects an action based on the current environment state.

        Args:
            state: State from the environment.

        Returns:
            Action to execute on the environment.
        """

    @timeit
    def play_episodes(self, env: Environment, episodes: int) -> History:
        """Plays full episodes; usually used to train agents.

        Args:
            env: Environment
            episodes: Number of episodes to play.

        Returns:
            History object representing the episodes' history
        """
        history_list = []
        for i in range(episodes):
            state = env.reset()
            history: History = History(state, np.int32, env.initial_history_length)
            while True:
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                self.step_count += 1
                agent_logger.add("agent_step", self.step_count)
                history.update(action, reward, done, state)
                # TODO step callback
                if done:
                    self.episode_count += 1
                    agent_logger.add("agent_episode", self.episode_count)
                    history_list.append(history)
                    agent_logger.add(
                        "episode_total_reward", history.get_total_rewards()[-1]
                    )
                    agent_logger.add("episode_length", len(history))
                    # TODO episode callback
                    break
        return reduce(iadd, history_list)

    @timeit
    def play_steps(self, env: Environment, n_steps: int, storage: Storage) -> Storage:
        """Performs a given number of steps in the environment and appends the new
        states to the existing storage.

        Args:
            env: Environment
            n_steps: Number of steps to play
            storage: Storage (Memory, History) of the earlier games (used to
                perform the first action)

        Returns:
            Storage with appended states, actions, rewards, etc.
        """
        state = storage.get_last_state()
        for i in range(n_steps):
            action = self.act(state)
            state, reward, done, _ = env.step(action)
            self._actual_reward_count += reward
            self._actual_episode_length += 1
            self.step_count += 1
            agent_logger.add("agent_step", self.step_count)
            storage.update(action, reward, done, state)
            # TODO step callback
            if done:
                self.episode_count += 1
                agent_logger.add("agent_episode", self.episode_count)
                # TODO episode callback
                agent_logger.add("episode_total_reward", self._actual_reward_count)
                agent_logger.add("episode_length", self._actual_episode_length)
                state = env.reset()
                self._actual_reward_count = 0
                self._actual_episode_length = 0
                storage.new_state_update(state)
        return storage

    @timeit
    def test(self, env) -> History:
        """Plays one full episode, used to test the agent. The reward in the
        returned history is the true reward from the environment.

        Args:
            env: Environment

        Returns:
            History object representing the episode history
        """
        env.true_reward = True
        history = self.play_episodes(env, 1)
        agent_logger.add("test_episode_total_reward", history.get_total_rewards()[-1])
        agent_logger.add("test_episode_length", len(history))
        env.true_reward = False
        return history

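# Illustrative sketch, not part of the library: a hypothetical minimal subclass
# showing the Agent contract. Only `id`, `train_iteration` and `act` have to be
# implemented; `train()` then drives the iteration loop and the callbacks. The
# constant action and the `None` loss value are assumptions for the example.
class _ExampleConstantAgent(Agent):
    """Toy agent that always returns the same action (for illustration only)."""

    def __init__(self, action: Action, agent_id: str = "constant_agent"):
        super().__init__()
        self._id = agent_id
        self._action = action

    @property
    def id(self) -> str:
        return self._id

    def train_iteration(self, env: Environment, **kwargs):
        # Nothing is learned; just gather one episode of experience.
        history = self.play_episodes(env, 1)
        return None, history

    def act(self, state: State) -> Action:
        return self._action
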
class RandomAgent(Agent):
    """Agent performing random actions"""

    def __init__(self, agent_id: str = "random_agent", replay_buffer_size=100):
        super().__init__()
        self._id = agent_id
        self.action_space = None
        self.replay_buffer_size = replay_buffer_size

    @property
    def id(self):
        return self._id

    @timeit
    def pre_train_setup(self, env: Environment, **kwargs):
        self.action_space = env.action_space
        state = env.reset()
        self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
        # To ensure that we have the next state after doing the first step.
        self.play_steps(env, n_steps=1, storage=self.replay_buffer)

    @timeit
    def train_iteration(self, env: Environment, discount_factor: float = 1.0):
        self.play_steps(env, 1, self.replay_buffer)
        return None, self.replay_buffer

    def act(self, state: State):
        return self.action_space.sample()

class CrossEntropyAgent(Agent):
    """Agent using the cross-entropy algorithm"""

    def __init__(
        self,
        policy_network: FunctionApproximatorABC,
        agent_id: str = "crossentropy_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.action_space = None
        self.policy_network = policy_network

    @property
    def id(self):
        return self._id

    @timeit
    def train_iteration(self, env: EnvironmentABC, n_episodes=32, percentile=75):
        history = self.play_episodes(env, n_episodes)
        all_total_rewards = history.get_total_rewards()
        total_rewards = all_total_rewards[history.get_dones()]
        total_reward_bound = np.percentile(total_rewards, percentile)
        above_threshold_mask = all_total_rewards >= total_reward_bound
        states = history.get_states()[above_threshold_mask]
        actions = history.get_actions()[above_threshold_mask]
        loss = self.policy_network.train(states, actions)
        return loss, history

    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_probs = self.policy_network.predict(state)[0]
        return np.random.choice(len(act_probs), p=act_probs)

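# Illustrative sketch, not part of the library: how the percentile filter in
# CrossEntropyAgent.train_iteration selects "elite" transitions. The toy arrays
# below assume get_total_rewards() holds each episode's total reward at every
# step of that episode and get_dones() marks episode ends.
def _example_cross_entropy_filter():
    all_total_rewards = np.array([3.0, 3.0, 3.0, 8.0, 8.0, 2.0, 2.0, 2.0])
    dones = np.array([False, False, True, False, True, False, False, True])
    total_rewards = all_total_rewards[dones]  # one value per episode: [3., 8., 2.]
    bound = np.percentile(total_rewards, 75)  # 75th percentile -> 5.5
    elite_mask = all_total_rewards >= bound   # keeps only steps of the best episode
    return elite_mask                         # [F, F, F, T, T, F, F, F]
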
class REINFORCEAgent(Agent):
    """Agent using the REINFORCE algorithm"""

    def __init__(
        self, policy_network: FunctionApproximatorABC, agent_id: str = "REINFORCE_agent"
    ):
        super().__init__()
        self._id = agent_id
        self.action_space = None
        self.policy_network = policy_network

    @property
    def id(self):
        return self._id

    def pre_train_setup(
        self, env: EnvironmentABC, discount_factor: float = 1.0, **kwargs
    ):
        assert 0.0 <= discount_factor <= 1.0

    @timeit
    def train_iteration(
        self, env: EnvironmentABC, n_episodes: int = 32, discount_factor: float = 1.0
    ):
        history = self.play_episodes(env, n_episodes)
        states = history.get_states()
        actions = history.get_actions()
        returns = history.get_returns(discount_factor)
        loss = self.policy_network.train(states, actions, returns)
        return loss, history

    @timeit
    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_probs = self.policy_network.predict(state)[0]
        return np.random.choice(len(act_probs), p=act_probs)

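# Illustrative sketch, not part of the library: the discounted returns that
# history.get_returns(discount_factor) is assumed to provide to REINFORCE,
# computed by hand here for a single toy episode.
def _example_discounted_returns(rewards=(1.0, 0.0, 2.0), discount_factor=0.9):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for i in reversed(range(len(rewards))):
        running = rewards[i] + discount_factor * running  # G_t = r_t + gamma * G_{t+1}
        returns[i] = running
    return returns  # for the defaults: [2.62, 1.8, 2.0]
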
class ActorCriticAgent(Agent):
    """Basic actor-critic agent."""

    def __init__(
        self,
        policy_network: FunctionApproximatorABC,
        value_network: FunctionApproximatorABC,
        advantage: AdvantageABC,
        agent_id: str = "ActorCritic_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.policy_network = policy_network
        self.value_network = value_network
        self.advantage = advantage
        self.memory = None
        self._should_reset = True

    @property
    def id(self):
        return self._id

    @timeit
    def train_iteration(
        self, env: EnvironmentABC, n_steps: int = 32, discount_factor: float = 1.0
    ):
        if self._should_reset:
            self.memory = Memory(
                initial_state=env.reset(), action_type=np.int32, maximum_length=n_steps
            )
            self._should_reset = False
        self.play_steps(env, n_steps, self.memory)
        states = self.memory.get_states(include_last=True)
        values = self.value_network.predict(states).squeeze(axis=-1)
        states = states[:-1, ...]
        actions = self.memory.get_actions()
        rewards = self.memory.get_rewards()
        dones = self.memory.get_dones()
        advantages = self.advantage(rewards, values, dones, discount_factor)
        values = values[:-1]
        policy_loss = self.policy_network.train(states, actions, advantages)
        target_values = values + advantages
        value_loss = self.value_network.train(states, target_values)
        # TODO: entropy loss (add to PolicyGradientLoss)
        return (None, self.memory)

    @timeit
    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_probs = self.policy_network.predict(state)[0]
        return np.random.choice(len(act_probs), p=act_probs)

class A2CAgent(ActorCriticAgent):
    """Advantage Actor Critic agent."""

    def __init__(
        self,
        policy_network: FunctionApproximatorABC,
        value_network: FunctionApproximatorABC,
        agent_id: str = "A2C_agent",
    ):
        super().__init__(
            policy_network, value_network, advantage=A2CAdvantage(), agent_id=agent_id
        )

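# Illustrative sketch, not part of the library: because ActorCriticAgent takes an
# Advantage instance, a GAE-based variant can be composed the same way A2CAgent is
# built above. `policy_net`, `value_net` and the lambda value are assumptions; the
# networks stand for FunctionApproximator implementations supplied by the caller.
def _example_gae_actor_critic(policy_net, value_net):
    return ActorCriticAgent(
        policy_network=policy_net,
        value_network=value_net,
        advantage=GAEAdvantage(lambda_=0.95),
        agent_id="GAE_agent",
    )
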
class Advantage(AdvantageABC, ABC):
    """Base class for advantage functions."""

    def __call__(
        self,
        rewards: np.ndarray,
        baselines: np.ndarray,
        dones: np.ndarray,
        discount_factor: float,
    ) -> np.ndarray:
        assert (
            rewards.shape[:-1] == baselines.shape[:-1]
        ), "Incompatible shapes of rewards and baselines."
        assert (
            baselines.shape[-1] == rewards.shape[-1] + 1
        ), "Baseline sequence should be 1 longer than the reward sequence."
        assert rewards.shape == dones.shape, "Incompatible shapes of rewards and dones."
        return self.calculate_advantages(rewards, baselines, dones, discount_factor)

    @abstractmethod
    def calculate_advantages(
        self,
        rewards: np.ndarray,
        baselines: np.ndarray,
        dones: np.ndarray,
        discount_factor: float,
    ) -> np.ndarray:
        pass

class A2CAdvantage(Advantage):
    """Advantage function from Asynchronous Methods for Deep Reinforcement Learning."""

    @timeit
    def calculate_advantages(
        self,
        rewards: np.ndarray,
        baselines: np.ndarray,
        dones: np.ndarray,
        discount_factor: float,
    ) -> np.ndarray:
        advantages = np.zeros_like(rewards, dtype=np.float32)
        current_return = baselines[-1]
        for i in reversed(range(len(rewards))):
            current_return = rewards[i] + ~dones[i] * discount_factor * current_return
            advantages[i] = current_return - baselines[i]
        return advantages

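# Illustrative sketch, not part of the library: a worked A2CAdvantage call on an
# assumed 4-step rollout. Per Advantage.__call__, `baselines` is one element longer
# than `rewards`; its last entry is the bootstrap value of the final state.
def _example_a2c_advantage():
    rewards = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)
    dones = np.array([False, False, False, True])
    baselines = np.array([0.5, 0.5, 0.5, 0.5, 0.5], dtype=np.float32)
    # The terminal `done` zeroes the bootstrap term, so with gamma=0.9 the returns
    # are [3.439, 2.71, 1.9, 1.0] and the advantages subtract the 0.5 baseline:
    return A2CAdvantage()(rewards, baselines, dones, discount_factor=0.9)
    # -> approximately [2.939, 2.21, 1.4, 0.5]
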
class GAEAdvantage(Advantage):
    """Advantage function from High-Dimensional Continuous Control Using
    Generalized Advantage Estimation.
    """

    def __init__(self, lambda_: float):
        self.lambda_ = lambda_

    @timeit
    def calculate_advantages(
        self,
        rewards: np.ndarray,
        baselines: np.ndarray,
        dones: np.ndarray,
        discount_factor: float,
    ) -> np.ndarray:
        deltas = rewards + ~dones * discount_factor * baselines[1:] - baselines[:-1]
        gamma_lambda = discount_factor * self.lambda_
        advantages = np.zeros_like(rewards, dtype=np.float32)
        current_advantage = 0
        for i in reversed(range(len(rewards))):
            current_advantage = deltas[i] + ~dones[i] * gamma_lambda * current_advantage
            advantages[i] = current_advantage
        return advantages

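# Illustrative sketch, not part of the library: GAE on the same assumed toy rollout
# as the A2C example above, with gamma=0.9 and lambda=0.95. The TD residuals
# (deltas) are discounted backwards with gamma*lambda, trading a little bias for
# lower variance compared with the plain n-step advantage.
def _example_gae_advantage():
    rewards = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)
    dones = np.array([False, False, False, True])
    baselines = np.array([0.5, 0.5, 0.5, 0.5, 0.5], dtype=np.float32)
    return GAEAdvantage(lambda_=0.95)(rewards, baselines, dones, discount_factor=0.9)
    # -> approximately [2.769, 2.128, 1.378, 0.5]
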
class DQNAgent(Agent):
    """Agent using the DQN algorithm"""

    def __init__(
        self,
        q_network: FunctionApproximatorABC,
        replay_buffer_size: int = 10000,
        start_epsilon: float = 1.0,
        end_epsilon: float = 0.05,
        epsilon_decay: int = 1000,
        training_set_size: int = 64,
        target_network_copy_iter: int = 100,
        steps_between_training=10,
        agent_id: str = "DQN_agent",
    ):
        super().__init__()
        self._id = agent_id
        self.action_space = None
        self.start_epsilon = start_epsilon
        self.epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.epsilon_decay = epsilon_decay
        self.replay_buffer_size = replay_buffer_size
        self.q_network = q_network
        self.target_network = deepcopy(q_network)
        self.batch_size = training_set_size
        self.target_network_copy_iter = target_network_copy_iter
        self.steps_between_training = steps_between_training
        self.epsilon_diff = (self.start_epsilon - self.end_epsilon) / self.epsilon_decay
        self.replay_buffer = None

    @property
    def id(self):
        return self._id

    def pre_train_setup(
        self, env: EnvironmentABC, discount_factor: float = 1.0, **kwargs
    ):
        assert 0.0 <= discount_factor <= 1.0
        state = env.reset()
        self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
        # To ensure that we have the next state after doing the first step.
        self.play_steps(env, n_steps=1, storage=self.replay_buffer)

    @timeit
    def train_iteration(self, env: EnvironmentABC, discount_factor: float = 1.0):
        # Linearly anneal epsilon from start_epsilon to end_epsilon over the first
        # `epsilon_decay` iterations.
        if self.iteration_count < self.epsilon_decay:
            self.epsilon -= self.epsilon_diff
        if self.iteration_count % self.target_network_copy_iter == 0:
            self.target_network = deepcopy(self.q_network)
        self.play_steps(env, self.steps_between_training, self.replay_buffer)
        states, actions, rewards, dones, next_states = self.replay_buffer.sample_batch(
            self.replay_buffer_size, self.batch_size, next_states=True
        )
        target_vals = self.target_network.predict(next_states)
        target_ind = np.argmax(target_vals, axis=1)
        target_max = target_vals[np.arange(target_vals.shape[0]), target_ind]
        target_q = rewards + discount_factor * target_max * (~dones)
        loss = self.q_network.train(states, actions, target_q)
        return loss, self.replay_buffer

    def act(self, state: State) -> Action:
        state = state.reshape(1, *state.shape)
        act_qvals = self.q_network.predict(state)[0]
        if np.random.uniform() < self.epsilon:
            return np.random.choice(len(act_qvals))
        else:
            return np.argmax(act_qvals)
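
# Illustrative sketch, not part of the library: the bootstrapped Q-learning targets
# built in DQNAgent.train_iteration, on an assumed toy batch. `~dones` zeroes the
# bootstrap term for terminal transitions, so their target is the immediate reward.
def _example_dqn_targets(discount_factor=0.99):
    rewards = np.array([1.0, 0.0, 1.0], dtype=np.float32)
    dones = np.array([False, False, True])
    # Q-values of the next states as predicted by the frozen target network.
    target_vals = np.array([[0.2, 0.8], [0.5, 0.1], [0.3, 0.4]], dtype=np.float32)
    target_max = target_vals.max(axis=1)  # same as the argmax-then-index above
    target_q = rewards + discount_factor * target_max * (~dones)
    return target_q  # -> [1.792, 0.495, 1.0]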