from collections import OrderedDict

import numpy as np
import torch

from src.rlkit.data_management.replay_buffer import ReplayBuffer, EnsembleReplayBuffer

class SimpleReplayBuffer(ReplayBuffer):
    """Fixed-size FIFO replay buffer backed by preallocated numpy arrays."""

    def __init__(
            self,
            max_replay_buffer_size,
            observation_dim,
            action_dim,
            env_info_sizes,
    ):
        self._observation_dim = observation_dim
        self._action_dim = action_dim
        self._max_replay_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size, observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size, observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, action_dim))
        # Make everything a 2D np array so other code can easily reason
        # about the shape of the data.
        self._rewards = np.zeros((max_replay_buffer_size, 1))
        # self._terminals[i] = 1 if a terminal was received at time i
        self._terminals = np.zeros((max_replay_buffer_size, 1), dtype='uint8')
        # self._env_infos[key][i] is the value of env_info[key] at time i
        self._env_infos = {}
        for key, size in env_info_sizes.items():
            self._env_infos[key] = np.zeros((max_replay_buffer_size, size))
        self._env_info_keys = env_info_sizes.keys()
        self._top = 0
        self._size = 0
    def add_sample(self, observation, action, reward, next_observation,
                   terminal, env_info, **kwargs):
        self._observations[self._top] = observation
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        self._terminals[self._top] = terminal
        self._next_obs[self._top] = next_observation
        for key in self._env_info_keys:
            self._env_infos[key][self._top] = env_info[key]
        self._advance()

    def terminate_episode(self):
        pass

    def _advance(self):
        # Ring buffer: once full, the oldest transitions are overwritten.
        self._top = (self._top + 1) % self._max_replay_buffer_size
        if self._size < self._max_replay_buffer_size:
            self._size += 1
    def random_batch(self, batch_size):
        # Sample indices uniformly at random, with replacement.
        indices = np.random.randint(0, self._size, batch_size)
        batch = dict(
            observations=self._observations[indices],
            actions=self._actions[indices],
            rewards=self._rewards[indices],
            terminals=self._terminals[indices],
            next_observations=self._next_obs[indices],
        )
        for key in self._env_info_keys:
            assert key not in batch.keys()
            batch[key] = self._env_infos[key][indices]
        return batch

    def rebuild_env_info_dict(self, idx):
        return {
            key: self._env_infos[key][idx]
            for key in self._env_info_keys
        }

    def batch_env_info_dict(self, indices):
        return {
            key: self._env_infos[key][indices]
            for key in self._env_info_keys
        }

    def num_steps_can_sample(self):
        return self._size

    def get_diagnostics(self):
        return OrderedDict([
            ('size', self._size),
        ])
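
# Illustrative usage sketch (not part of the original module): a hypothetical
# helper showing the typical add/sample cycle of SimpleReplayBuffer, assuming
# a 4-dimensional observation space, a 2-dimensional action space, and no
# extra env_info fields.
def _example_simple_buffer_usage():
    buffer = SimpleReplayBuffer(
        max_replay_buffer_size=1000,
        observation_dim=4,
        action_dim=2,
        env_info_sizes={},
    )
    # Transitions are stored one at a time; terminal is cast to uint8.
    buffer.add_sample(
        observation=np.zeros(4),
        action=np.zeros(2),
        reward=1.0,
        next_observation=np.zeros(4),
        terminal=False,
        env_info={},
    )
    # Batches come back as a dict of 2D arrays keyed by field name.
    return buffer.random_batch(batch_size=1)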

class EnsembleSimpleReplayBuffer(EnsembleReplayBuffer):
    """SimpleReplayBuffer variant that also stores a per-transition ensemble
    mask and can save/load its contents to disk."""

    def __init__(
            self,
            max_replay_buffer_size,
            observation_dim,
            action_dim,
            env_info_sizes,
            num_ensemble,
            log_dir,
    ):
        self._observation_dim = observation_dim
        self._action_dim = action_dim
        self._max_replay_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size, observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size, observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, action_dim))
        # Make everything a 2D np array so other code can easily reason
        # about the shape of the data.
        self._rewards = np.zeros((max_replay_buffer_size, 1))
        # self._terminals[i] = 1 if a terminal was received at time i
        self._terminals = np.zeros((max_replay_buffer_size, 1), dtype='uint8')
        # self._env_infos[key][i] is the value of env_info[key] at time i
        self._env_infos = {}
        for key, size in env_info_sizes.items():
            self._env_infos[key] = np.zeros((max_replay_buffer_size, size))
        self._env_info_keys = env_info_sizes.keys()
        # Per-transition ensemble mask: self._mask[i, k] selects whether
        # ensemble member k trains on transition i.
        self._mask = np.zeros((max_replay_buffer_size, num_ensemble))
        self._top = 0
        self._size = 0
        self.buffer_dir = log_dir + '/buffer/'
    def add_sample(self, observation, action, reward, next_observation,
                   terminal, mask, env_info, **kwargs):
        self._observations[self._top] = observation
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        self._terminals[self._top] = terminal
        self._next_obs[self._top] = next_observation
        self._mask[self._top] = mask
        for key in self._env_info_keys:
            self._env_infos[key][self._top] = env_info[key]
        self._advance()

    def terminate_episode(self):
        pass

    def _advance(self):
        self._top = (self._top + 1) % self._max_replay_buffer_size
        if self._size < self._max_replay_buffer_size:
            self._size += 1
    def random_batch(self, batch_size):
        indices = np.random.randint(0, self._size, batch_size)
        batch = dict(
            observations=self._observations[indices],
            actions=self._actions[indices],
            rewards=self._rewards[indices],
            terminals=self._terminals[indices],
            next_observations=self._next_obs[indices],
            masks=self._mask[indices],
        )
        for key in self._env_info_keys:
            assert key not in batch.keys()
            batch[key] = self._env_infos[key][indices]
        return batch

    def rebuild_env_info_dict(self, idx):
        return {
            key: self._env_infos[key][idx]
            for key in self._env_info_keys
        }

    def batch_env_info_dict(self, indices):
        return {
            key: self._env_infos[key][indices]
            for key in self._env_info_keys
        }

    def num_steps_can_sample(self):
        return self._size

    def get_diagnostics(self):
        return OrderedDict([
            ('size', self._size),
        ])
    def save_buffer(self, epoch):
        # self.buffer_dir already ends with '/', so just append the filename.
        path = self.buffer_dir + 'replay_%d.pt' % epoch
        payload = [
            self._observations[:self._size],
            self._actions[:self._size],
            self._rewards[:self._size],
            self._terminals[:self._size],
            self._next_obs[:self._size],
            self._mask[:self._size],
            self._size,
        ]
        torch.save(payload, path)

    def load_buffer(self, epoch):
        path = self.buffer_dir + 'replay_%d.pt' % epoch
        payload = torch.load(path)
        # Note: the saved arrays are truncated to the saved size, so a loaded
        # buffer is meant to be sampled from rather than grown further.
        self._observations = payload[0]
        self._actions = payload[1]
        self._rewards = payload[2]
        self._terminals = payload[3]
        self._next_obs = payload[4]
        self._mask = payload[5]
        self._size = payload[6]
        self._top = self._size % self._max_replay_buffer_size
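
# Illustrative sketch (not part of the original module): the per-transition
# ensemble mask is supplied by the caller of add_sample. One common choice
# for bootstrapped ensembles is an independent Bernoulli mask per member;
# the sampling probability of 0.5 below is an arbitrary example value.
def _example_ensemble_mask(num_ensemble, p=0.5):
    # 1.0 means the corresponding ensemble member trains on this transition.
    return np.random.binomial(n=1, p=p, size=num_ensemble).astype(np.float64)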

class RandomReplayBuffer(ReplayBuffer):
    """SimpleReplayBuffer variant that multiplies sampled observations by
    random factors drawn uniformly from [lower, upper].

    single_flag == 0 draws one factor per transition (shared across all
    observation dimensions); otherwise a separate factor is drawn for every
    dimension. equal_flag == 0 applies the same factors to the observation
    and the next observation; otherwise they are scaled independently.
    """

    def __init__(
            self,
            max_replay_buffer_size,
            observation_dim,
            action_dim,
            env_info_sizes,
            single_flag,
            equal_flag,
            lower,
            upper,
    ):
        self._observation_dim = observation_dim
        self._action_dim = action_dim
        self._max_replay_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size, observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size, observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, action_dim))
        # Make everything a 2D np array so other code can easily reason
        # about the shape of the data.
        self._rewards = np.zeros((max_replay_buffer_size, 1))
        # self._terminals[i] = 1 if a terminal was received at time i
        self._terminals = np.zeros((max_replay_buffer_size, 1), dtype='uint8')
        # self._env_infos[key][i] is the value of env_info[key] at time i
        self._env_infos = {}
        for key, size in env_info_sizes.items():
            self._env_infos[key] = np.zeros((max_replay_buffer_size, size))
        self._env_info_keys = env_info_sizes.keys()
        self._top = 0
        self._size = 0
        # Randomization (multiplicative observation noise) parameters.
        self.single_flag = single_flag
        self.equal_flag = equal_flag
        self.lower = lower
        self.upper = upper
    def add_sample(self, observation, action, reward, next_observation,
                   terminal, env_info, **kwargs):
        self._observations[self._top] = observation
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        self._terminals[self._top] = terminal
        self._next_obs[self._top] = next_observation
        for key in self._env_info_keys:
            self._env_infos[key][self._top] = env_info[key]
        self._advance()

    def terminate_episode(self):
        pass

    def _advance(self):
        self._top = (self._top + 1) % self._max_replay_buffer_size
        if self._size < self._max_replay_buffer_size:
            self._size += 1
    def random_batch(self, batch_size):
        indices = np.random.randint(0, self._size, batch_size)
        obs = self._observations[indices]
        next_obs = self._next_obs[indices]
        if self.single_flag == 0:
            # One scaling factor per transition, shared across dimensions.
            random_number_1 = np.random.uniform(self.lower, self.upper, obs.shape[0]).reshape(-1, 1)
            random_number_2 = np.random.uniform(self.lower, self.upper, obs.shape[0]).reshape(-1, 1)
        else:
            # An independent scaling factor for every observation dimension.
            random_number_1 = np.random.uniform(self.lower, self.upper, obs.shape[0] * obs.shape[1]).reshape(obs.shape[0], -1)
            random_number_2 = np.random.uniform(self.lower, self.upper, obs.shape[0] * obs.shape[1]).reshape(obs.shape[0], -1)
        obs = obs * random_number_1
        if self.equal_flag == 0:
            # Scale next_obs by the same factors as obs.
            next_obs = next_obs * random_number_1
        else:
            # Scale next_obs by independently drawn factors.
            next_obs = next_obs * random_number_2
        batch = dict(
            observations=obs,
            actions=self._actions[indices],
            rewards=self._rewards[indices],
            terminals=self._terminals[indices],
            next_observations=next_obs,
        )
        for key in self._env_info_keys:
            assert key not in batch.keys()
            batch[key] = self._env_infos[key][indices]
        return batch
    def rebuild_env_info_dict(self, idx):
        return {
            key: self._env_infos[key][idx]
            for key in self._env_info_keys
        }

    def batch_env_info_dict(self, indices):
        return {
            key: self._env_infos[key][indices]
            for key in self._env_info_keys
        }

    def num_steps_can_sample(self):
        return self._size

    def get_diagnostics(self):
        return OrderedDict([
            ('size', self._size),
        ])
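
# Illustrative sketch (not part of the original module): how the two flags
# change the shape of the multiplicative factors drawn in random_batch. The
# batch size of 3 and observation dimension of 4 are arbitrary example values.
def _example_random_scaling_shapes(lower=0.9, upper=1.1):
    obs = np.ones((3, 4))
    # single_flag == 0: one factor per transition, broadcast across dimensions.
    per_sample = np.random.uniform(lower, upper, obs.shape[0]).reshape(-1, 1)
    # single_flag != 0: an independent factor for every dimension.
    per_dim = np.random.uniform(lower, upper, obs.size).reshape(obs.shape[0], -1)
    return (obs * per_sample).shape, (obs * per_dim).shape  # both (3, 4)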

class GaussianReplayBuffer(ReplayBuffer):
    """SimpleReplayBuffer variant that adds zero-mean Gaussian noise (with
    standard deviation `std`) to the observations of a sampled transition
    with probability `prob`."""

    def __init__(
            self,
            max_replay_buffer_size,
            observation_dim,
            action_dim,
            env_info_sizes,
            prob,
            std,
    ):
        self._observation_dim = observation_dim
        self._action_dim = action_dim
        self._max_replay_buffer_size = max_replay_buffer_size
        self._observations = np.zeros((max_replay_buffer_size, observation_dim))
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = np.zeros((max_replay_buffer_size, observation_dim))
        self._actions = np.zeros((max_replay_buffer_size, action_dim))
        # Make everything a 2D np array so other code can easily reason
        # about the shape of the data.
        self._rewards = np.zeros((max_replay_buffer_size, 1))
        # self._terminals[i] = 1 if a terminal was received at time i
        self._terminals = np.zeros((max_replay_buffer_size, 1), dtype='uint8')
        # self._env_infos[key][i] is the value of env_info[key] at time i
        self._env_infos = {}
        for key, size in env_info_sizes.items():
            self._env_infos[key] = np.zeros((max_replay_buffer_size, size))
        self._env_info_keys = env_info_sizes.keys()
        self._top = 0
        self._size = 0
        # Randomization (additive Gaussian observation noise) parameters.
        self.prob = prob
        self.std = std
    def add_sample(self, observation, action, reward, next_observation,
                   terminal, env_info, **kwargs):
        self._observations[self._top] = observation
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        self._terminals[self._top] = terminal
        self._next_obs[self._top] = next_observation
        for key in self._env_info_keys:
            self._env_infos[key][self._top] = env_info[key]
        self._advance()

    def terminate_episode(self):
        pass

    def _advance(self):
        self._top = (self._top + 1) % self._max_replay_buffer_size
        if self._size < self._max_replay_buffer_size:
            self._size += 1
    def random_batch(self, batch_size):
        indices = np.random.randint(0, self._size, batch_size)
        obs = self._observations[indices]
        next_obs = self._next_obs[indices]
        num_batch, dim_input = obs.shape[0], obs.shape[1]
        noise = np.random.normal(0, self.std, num_batch * dim_input).reshape(num_batch, -1)
        # Per-transition Bernoulli mask: each sampled transition is noised on
        # all dimensions with probability self.prob, otherwise left untouched.
        mask = np.random.uniform(0, 1, num_batch).reshape(num_batch, -1) < self.prob
        noise = noise * mask
        # The same noise is added to both the observation and the next
        # observation of a transition.
        obs = obs + noise
        next_obs = next_obs + noise
        batch = dict(
            observations=obs,
            actions=self._actions[indices],
            rewards=self._rewards[indices],
            terminals=self._terminals[indices],
            next_observations=next_obs,
        )
        for key in self._env_info_keys:
            assert key not in batch.keys()
            batch[key] = self._env_infos[key][indices]
        return batch
    def rebuild_env_info_dict(self, idx):
        return {
            key: self._env_infos[key][idx]
            for key in self._env_info_keys
        }

    def batch_env_info_dict(self, indices):
        return {
            key: self._env_infos[key][indices]
            for key in self._env_info_keys
        }

    def num_steps_can_sample(self):
        return self._size

    def get_diagnostics(self):
        return OrderedDict([
            ('size', self._size),
        ])
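
# Illustrative sketch (not part of the original module): the Gaussian
# augmentation perturbs a randomly chosen subset of transitions. Each sampled
# transition is noised with probability `prob`, and the same noise vector is
# applied to both the observation and the next observation, mirroring the
# logic in GaussianReplayBuffer.random_batch. Batch size 3 and dimension 4
# are arbitrary example values.
def _example_gaussian_augmentation(prob=0.5, std=0.1):
    obs = np.ones((3, 4))
    next_obs = np.ones((3, 4))
    noise = np.random.normal(0, std, obs.size).reshape(obs.shape[0], -1)
    # (3, 1) boolean mask broadcast over all dimensions of a transition.
    mask = np.random.uniform(0, 1, obs.shape[0]).reshape(-1, 1) < prob
    noise = noise * mask
    return obs + noise, next_obs + noise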