Source code for

from abc import ABC, abstractmethod
from typing import Union

import numpy as np
from numba import njit

from prl.typing import HistoryABC, Action, Reward, State, MemoryABC, StorageABC
from prl.utils import timeit

@njit
def calculate_returns(
    all_rewards: np.ndarray,
    dones: np.ndarray,
    horizon: Union[int, float],
    discount_factor: float,
    _index: int,
):
    """Compute the discounted return of every step, episode by episode.

    For each step ``i`` of an episode ending at step ``T`` the result is
    ``sum(discount_factor ** (k - i) * all_rewards[k] for k in range(i, T + 1))``.
    Steps located after the last ``done`` flag are left at zero.

    Args:
        all_rewards: rewards of consecutive steps (assumed to be 1-D).
        dones: done flags of the same steps; a true value closes an episode.
        horizon: only an infinite horizon (``np.inf``) is supported.
        discount_factor: discount in the closed interval ``[0, 1]``.
        _index: number of recorded steps; ``dones[_index - 1]`` must be true,
            i.e. the most recent episode has to be finished.

    Returns:
        Array shaped and typed like ``all_rewards`` holding the returns.

    Raises:
        Exception: when no episode is finished, the last recorded step is not
            terminal, or ``horizon`` is not infinite.
        AssertionError: when ``discount_factor`` lies outside ``[0, 1]``.
    """
    # Value comparison instead of ``is``: identity of float objects is an
    # implementation detail and fails for e.g. ``float("inf")``.
    if not (np.any(dones) and (horizon == np.inf) and dones[_index - 1]):
        raise Exception(
            "Returns available only for at least one complete episode, there can't be an incomplete episode "
            "and the horizon must be np.inf"
        )
    assert 0.0 <= discount_factor <= 1.0

    episode_ends = np.nonzero(dones)[0]
    all_returns = np.zeros_like(all_rewards)
    start = 0
    for e in range(len(episode_ends)):
        end = episode_ends[e] + 1
        # Backward recurrence G_i = r_i + gamma * G_{i+1}: one pass per
        # episode instead of re-summing a discounted slice for every step.
        running = 0.0
        for i in range(end - 1, start - 1, -1):
            running = all_rewards[i] + discount_factor * running
            all_returns[i] = running
        start = end
    return all_returns
@njit
def calculate_total_rewards(all_rewards: np.ndarray, dones: np.ndarray, _index: int):
    """Broadcast each episode's reward sum onto all steps of that episode.

    Args:
        all_rewards: rewards of consecutive steps.
        dones: done flags of the same steps; a true value closes an episode.
        _index: number of recorded steps; ``dones[_index - 1]`` must be true.

    Returns:
        Array shaped like ``all_rewards`` in which every step carries the
        total reward of the episode it belongs to.

    Raises:
        Exception: when there is no finished episode or the last recorded
            step is not terminal.
    """
    if not (np.any(dones) and dones[_index - 1]):
        raise Exception(
            "Returns available only for at least one complete episode and all episodes must be done"
        )

    episode_ends = np.nonzero(dones)[0]
    totals = np.zeros_like(all_rewards)
    first = 0
    for position in range(len(episode_ends)):
        # An episode spans [first, stop); its terminal step is included.
        stop = episode_ends[position] + 1
        totals[first:stop] = np.sum(all_rewards[first:stop])
        first = stop
    return totals
class Storage(StorageABC, ABC):
    """Abstract interface shared by the step containers defined in this module."""

    @abstractmethod
    def update(self, action, reward, done, state):
        """Record the outcome of one environment step.

        Args:
            action: action executed by the agent
            reward: reward received from the environment
            done: done flag received from the environment
            state: new state returned by the environment after the action
        """

    @abstractmethod
    def new_state_update(self, state):
        """Replace the newest stored state.

        Args:
            state: state array.
        """

    @abstractmethod
    def get_states(self) -> np.ndarray:
        """Return all stored states.

        Returns:
            array of all states
        """

    @abstractmethod
    def get_last_state(self) -> np.ndarray:
        """Return the most recent state only.

        Returns:
            last state
        """

    @abstractmethod
    def get_rewards(self) -> np.ndarray:
        """Return all stored rewards.

        Returns:
            array of all rewards
        """

    @abstractmethod
    def get_actions(self) -> np.ndarray:
        """Return all stored actions.

        Returns:
            array of all actions
        """

    @abstractmethod
    def get_dones(self) -> np.ndarray:
        """Return all stored done flags.

        Returns:
            array of all done flags
        """

    @abstractmethod
    def sample_batch(
        self, replay_buffor_size: int, batch_size: int, returns: bool, next_states: bool
    ) -> tuple:
        """Draw a batch of examples from the storage.

        Args:
            replay_buffor_size: length of the replay buffer to sample from
            batch_size: number of examples to return
            returns: if True, return the returns of each step instead of
                the rewards
            next_states: if True, also return the next states (e.g. for the
                DQN algorithm)

        Returns:
            tuple of np.ndarrays in the order: states, actions, rewards,
            dones, (new_states)
        """

    @timeit
    def __getitem__(self, indicies) -> tuple:
        """Return the (states, actions, rewards, dones) entries at ``indicies``."""
        states = self.get_states()[indicies]
        actions = self.get_actions()[indicies]
        rewards = self.get_rewards()[indicies]
        dones = self.get_dones()[indicies]
        return states, actions, rewards, dones

    @abstractmethod
    def __len__(self):
        """Return the number of stored steps."""

    @abstractmethod
    def __repr__(self):
        """Return a textual dump of the stored data."""
class History(Storage, HistoryABC):
    """
    An object which is used to keep the episodes history (used within
    :py:class:`~prl.environments.environments.Environment` class and by
    some agents). Agent can use this object to keep history of past episodes,
    calculate returns, total rewards, etc. and sample batches from it.

    Object also supports indexing and slicing because it supports python
    Sequence protocol, so functions working on sequences like random.choice
    can be also used on history.

    Layout: ``states[i]`` is the state in which ``actions[i]`` was taken and
    ``states[_index]`` is the newest state, so ``states`` always holds one
    more valid entry than the other buffers.

    Args:
        initial_state: initial state from environment
        action_type: numpy type of action (e.g. np.int32)
        initial_length: initial length of a history
    """

    @timeit
    def __init__(
        self, initial_state: np.ndarray, action_type: type, initial_length: int = 512
    ):
        self._index = 0
        self.states = np.empty(
            (initial_length,) + initial_state.shape, dtype=np.float32
        )
        self.actions = np.empty((initial_length,), dtype=action_type)
        self.rewards = np.empty((initial_length,))
        # ``np.bool`` was removed in NumPy 1.24; ``np.bool_`` is the same dtype
        # and exists in every NumPy version.
        self.dones = np.empty((initial_length,), dtype=np.bool_)
        self.states[self._index] = initial_state

    @timeit
    def update(self, action: Action, reward: Reward, done: bool, state: State):
        """Append one step and store ``state`` as the newest state."""
        # Grow first so that the slot for the new state (index + 1) exists.
        if self._index == (self.states.shape[0] - 1):
            self._enlarge()
        self.actions[self._index] = action
        self.rewards[self._index] = reward
        self.dones[self._index] = done
        self._index += 1
        self.states[self._index] = state

    @timeit
    def new_state_update(self, state: State):
        """Overwrite the newest state."""
        self.states[self._index] = state

    @timeit
    def get_states(self) -> np.ndarray:
        """Return the states of all recorded steps (newest state excluded)."""
        return self.states[: self._index]

    @timeit
    def get_last_state(self) -> np.ndarray:
        """Return the newest state."""
        return self.states[self._index]

    @timeit
    def get_rewards(self) -> np.ndarray:
        """Return the rewards of all recorded steps."""
        return self.rewards[: self._index]

    @timeit
    def get_actions(self) -> np.ndarray:
        """Return the actions of all recorded steps."""
        return self.actions[: self._index]

    @timeit
    def get_dones(self) -> np.ndarray:
        """Return the done flags of all recorded steps."""
        return self.dones[: self._index]

    @timeit
    def get_returns(
        self, discount_factor: float = 1.0, horizon: float = np.inf
    ) -> np.ndarray:
        """Calculates returns for each step.

        Delegates to ``calculate_returns``.

        Returns:
            array of discounted returns for each step
        """
        return calculate_returns(
            self.get_rewards(), self.get_dones(), horizon, discount_factor, self._index
        )

    @timeit
    def get_total_rewards(self) -> np.ndarray:
        """
        Calculates sum of all rewards for each episode and reports it for each
        state, so every state in one episode has the same value of total reward.
        This can be useful for filtering states for best episodes (e.g. in
        Cross Entropy Algorithm).

        Returns:
            total reward for each state
        """
        return calculate_total_rewards(
            self.get_rewards(), self.get_dones(), self._index
        )

    @timeit
    def get_number_of_episodes(self) -> int:
        """Returns a number of full episodes in history.

        Returns:
            number of full episodes in history
        """
        return int(self.get_dones().sum())

    @timeit
    def sample_batch(
        self,
        replay_buffer_size: int,
        batch_size: int = 64,
        returns: bool = False,
        next_states: bool = False,
    ) -> tuple:
        """Sample ``batch_size`` steps uniformly from the newest steps.

        Args:
            replay_buffer_size: only this many most recent steps are eligible
            batch_size: number of returned examples
            returns: not implemented yet; True raises NotImplementedError
            next_states: if True, the following state of every sampled step
                is appended to the result

        Returns:
            tuple of np.ndarrays: states, actions, rewards, dones, (next states)

        Raises:
            NotImplementedError: when ``returns`` is True.
            Exception: when ``next_states`` is True and fewer than two steps
                are recorded.
        """
        if returns:
            raise NotImplementedError("The returns will be implemented soon")

        # Builtin ``max``: ``np.max(0, n)`` would treat ``n`` as the *axis*.
        oldest = max(0, self._index - replay_buffer_size)

        if next_states:
            if self._index < 2:
                raise Exception(
                    "Can't sample examples with next_state when the history has length 1."
                )
            # The upper bound is lowered by one so that ``indexes + 1`` stays
            # inside ``get_states()``.
            indexes = np.random.randint(oldest, self._index - 1, size=batch_size)
            return self[indexes] + (self.get_states()[indexes + 1],)

        indexes = np.random.randint(oldest, self._index, size=batch_size)
        return self[indexes]

    def get_summary(self) -> tuple:
        """Summarize the recorded episodes.

        Returns:
            tuple ``(mean total reward per episode, mean episode length,
            number of recorded steps)``
        """
        # Total rewards are repeated on every step; picking the terminal steps
        # yields exactly one value per episode.
        total_rewards_mean = self.get_total_rewards()[self.get_dones()].mean()
        mean_length = len(self) / self.get_number_of_episodes()
        return total_rewards_mean, mean_length, self._index

    @timeit
    def _enlarge(self):
        """Double the capacity of every buffer."""
        for name in ("states", "actions", "rewards", "dones"):
            buffer = getattr(self, name)
            new_shape = list(buffer.shape)
            new_shape[0] *= 2
            # np.resize pads the new tail with repeated copies of the data;
            # that tail lies beyond ``_index`` until it is overwritten.
            setattr(self, name, np.resize(buffer, new_shape))
        print("Enlarging History. New max length: ", self.dones.shape[0])

    def __add__(self, other):
        raise NotImplementedError(
            "You can only use inplace operators between History instances"
        )

    @timeit
    def __iadd__(self, other: HistoryABC):
        """Append the steps of ``other`` to this history in place."""
        # The complete buffers of ``other`` are appended (not only its valid
        # prefix), so all four buffers keep equal lengths, which ``update``
        # relies on when deciding to enlarge.
        self.states = np.concatenate([self.get_states(), other.states])
        self.actions = np.concatenate([self.get_actions(), other.actions])
        self.rewards = np.concatenate([self.get_rewards(), other.rewards])
        self.dones = np.concatenate([self.get_dones(), other.dones])
        self._index += other._index
        return self

    def __len__(self):
        return self._index

    def __repr__(self):
        parts = []
        for k, v in self.__dict__.items():
            if isinstance(v, np.ndarray):
                # Show only the recorded part of each buffer.
                parts.append("%s:\n%s\n" % (k, v[: self._index]))
            else:
                parts.append("%s:\n%s\n" % (k, v))
        return "".join(parts)
class Memory(Storage, MemoryABC):
    """
    An object to be used as replay buffer. Doesn't contain full episodes and
    acts as limited FIFO queue. Implemented as double size numpy arrays with
    duplicated data to support very fast slicing and sampling at the cost of
    higher memory usage.

    The valid window is ``[_lower_index + 1, _index)``; ``states[_index]``
    holds the newest state.

    Args:
        initial_state: initial state from environment
        action_type: numpy type of action (e.g. np.int32)
        maximum_length: maximum number of examples to keep in queue
    """

    @timeit
    def __init__(
        self, initial_state: np.ndarray, action_type, maximum_length: int = 1000
    ):
        self._maximum_length = maximum_length
        capacity = 2 * maximum_length + 2
        self.states = np.empty((capacity,) + initial_state.shape, dtype=np.float32)
        self.actions = np.empty((capacity,), dtype=action_type)
        self.rewards = np.empty((capacity,))
        # ``np.bool`` was removed in NumPy 1.24; ``np.bool_`` is the same dtype
        # and exists in every NumPy version.
        self.dones = np.empty((capacity,), dtype=np.bool_)
        self.clear(initial_state)

    @timeit
    def clear(self, initial_state):
        """Empty the queue and store ``initial_state`` as the newest state."""
        self._lower_index = 0
        self._index = 1
        self._full = False
        self.states[self._index] = initial_state

    @timeit
    def update(self, action, reward, done, state):
        """Push one step into the queue; ``state`` becomes the newest state."""
        self.actions[self._index] = action
        self.rewards[self._index] = reward
        self.dones[self._index] = done
        if self._full:
            # Mirror the step into the lower half so the data is already in
            # place when the window wraps around.
            self.actions[self._lower_index] = action
            self.rewards[self._lower_index] = reward
            self.dones[self._lower_index] = done
        self._index += 1
        if self._index > self._maximum_length + 1:
            # Queue is at capacity: slide the window start forward.
            self._lower_index += 1
            self._full = True
            self.states[self._lower_index] = state
        if self._index == 2 * self._maximum_length + 2:
            # End of the buffers reached: jump back onto the mirrored copy.
            self._index = self._maximum_length + 1
            self._lower_index = 0
        self.states[self._index] = state

    @timeit
    def new_state_update(self, state):
        """Overwrite the newest state (and its mirrored copy once full)."""
        self.states[self._index] = state
        if self._full:
            self.states[self._lower_index] = state

    @timeit
    def get_states(self, include_last=False) -> np.ndarray:
        """Return the stored states.

        Args:
            include_last: if True, the newest state is appended to the result.
        """
        index = self._index
        if include_last:
            index += 1
        return self.states[(self._lower_index + 1) : index]

    @timeit
    def get_last_state(self) -> np.ndarray:
        """Return the newest state."""
        return self.states[self._index]

    @timeit
    def get_rewards(self) -> np.ndarray:
        """Return the stored rewards."""
        return self.rewards[(self._lower_index + 1) : self._index]

    @timeit
    def get_actions(self) -> np.ndarray:
        """Return the stored actions."""
        return self.actions[(self._lower_index + 1) : self._index]

    @timeit
    def get_dones(self) -> np.ndarray:
        """Return the stored done flags."""
        return self.dones[(self._lower_index + 1) : self._index]

    @timeit
    def sample_batch(
        self,
        replay_buffor_size: int,
        batch_size: int = 64,
        returns: bool = False,
        next_states: bool = False,
    ) -> tuple:
        """Sample ``batch_size`` stored steps uniformly.

        Args:
            replay_buffor_size: not used by this implementation
            batch_size: number of returned examples
            returns: not implemented yet; True raises NotImplementedError
            next_states: if True, the following state of every sampled step
                is appended to the result

        Returns:
            tuple of np.ndarrays: states, actions, rewards, dones, (next states)

        Raises:
            NotImplementedError: when ``returns`` is True.
            Exception: when ``next_states`` is True and fewer than two steps
                are stored.
        """
        if returns:
            raise NotImplementedError("The returns will be implemented soon")

        length = len(self)

        if next_states:
            # Checked against the number of stored steps: ``_index`` starts
            # at 1, so ``_index < 2`` could not detect a one-step memory.
            if length < 2:
                raise Exception(
                    "Can't sample examples with next_state when the history has length 1."
                )
            # The newest step is excluded so that ``indicies + 1`` stays
            # inside ``get_states()``.
            indicies = np.random.randint(length - 1, size=batch_size)
            return self[indicies] + (self.get_states()[indicies + 1],)

        # Valid positions are 0 .. length - 1; drawing from
        # ``_index - _lower_index`` (== length + 1) could index past the end.
        indicies = np.random.randint(length, size=batch_size)
        return self[indicies]

    def __len__(self):
        return self._index - (self._lower_index + 1)

    def __repr__(self):
        parts = []
        for k, v in self.__dict__.items():
            if isinstance(v, np.ndarray):
                # Show only the valid window of each buffer.
                parts.append(
                    "%s:\n%s\n" % (k, v[(self._lower_index + 1) : self._index])
                )
            else:
                parts.append("%s:\n%s\n" % (k, v))
        return "".join(parts)