import numpy as np
import torch

from src.rlkit.torch import pytorch_util as ptu


def multitask_rollout(
        env,
        agent,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
        observation_key=None,
        desired_goal_key=None,
        get_action_kwargs=None,
        return_dict_obs=False,
):
    if render_kwargs is None:
        render_kwargs = {}
    if get_action_kwargs is None:
        get_action_kwargs = {}
    dict_obs = []
    dict_next_obs = []
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    next_observations = []
    path_length = 0
    agent.reset()
    o = env.reset()
    if render:
        env.render(**render_kwargs)
    goal = o[desired_goal_key]
    while path_length < max_path_length:
        dict_obs.append(o)
        if observation_key:
            o = o[observation_key]
        new_obs = np.hstack((o, goal))
        a, agent_info = agent.get_action(new_obs, **get_action_kwargs)
        next_o, r, d, env_info = env.step(a)
        if render:
            env.render(**render_kwargs)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        next_observations.append(next_o)
        dict_next_obs.append(next_o)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    next_observations = np.array(next_observations)
    if return_dict_obs:
        observations = dict_obs
        next_observations = dict_next_obs
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        goals=np.repeat(goal[None], path_length, 0),
        full_observations=dict_obs,
    )
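# Illustrative sketch (not part of the original module): how multitask_rollout
# builds the goal-conditioned input passed to the agent. The dict layout and key
# names below are hypothetical stand-ins for demonstration only.
def _example_goal_concat():
    dummy_obs = {
        "observation": np.array([0.1, 0.2, 0.3]),
        "desired_goal": np.array([1.0, -1.0]),
    }
    goal = dummy_obs["desired_goal"]
    flat_obs = dummy_obs["observation"]
    # The agent sees the flat observation with the episode goal appended,
    # mirroring the np.hstack((o, goal)) call in multitask_rollout.
    agent_input = np.hstack((flat_obs, goal))
    assert agent_input.shape == (5,)
    return agent_input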
def rollout(
        env,
        agent,
        noise_flag=0,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list corresponding to the timestep:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            # Optionally corrupt the reward with unit Gaussian noise.
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )


def ensemble_rollout(
        env,
        agent,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list corresponding to the timestep:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # bootstrap masks: which ensemble members train on each transition
    o = env.reset()
    # Act with a single, randomly chosen ensemble member for the whole episode.
    en_index = np.random.randint(num_ensemble)
    agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        a, agent_info = agent[en_index].get_action(o)
        next_o, r, d, env_info = env.step(a)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        # Bernoulli bootstrap mask; force at least one member to see the transition.
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )
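# Illustrative sketch (not part of the original module): the per-transition
# bootstrap mask drawn in ensemble_rollout / ensemble_ucb_rollout. Each ensemble
# member trains on a transition only where its mask entry is 1; if every entry
# comes up 0, one member is picked at random so the transition is never dropped.
def _example_bootstrap_mask(num_ensemble=5, ber_mean=0.5):
    mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
    if mask.sum() == 0:
        mask[np.random.randint(num_ensemble)] = 1
    return mask.numpy()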
def get_ucb_std(obs, policy_action, inference_type, critic1, critic2,
                feedback_type, en_index, num_ensemble):
    obs = ptu.from_numpy(obs).float()
    policy_action = ptu.from_numpy(policy_action).float()
    obs = obs.reshape(1, -1)
    policy_action = policy_action.reshape(1, -1)
    if feedback_type == 0 or feedback_type == 2:
        # UCB from the twin critics of a single ensemble member.
        with torch.no_grad():
            target_Q1 = critic1[en_index](obs, policy_action)
            target_Q2 = critic2[en_index](obs, policy_action)
        mean_Q = 0.5 * (target_Q1.detach() + target_Q2.detach())
        var_Q = 0.5 * ((target_Q1.detach() - mean_Q) ** 2
                       + (target_Q2.detach() - mean_Q) ** 2)
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    elif feedback_type == 1 or feedback_type == 3:
        # UCB from the twin critics of every ensemble member.
        mean_Q, var_Q = None, None
        L_target_Q = []
        for idx in range(num_ensemble):
            with torch.no_grad():
                target_Q1 = critic1[idx](obs, policy_action)
                target_Q2 = critic2[idx](obs, policy_action)
            L_target_Q.append(target_Q1)
            L_target_Q.append(target_Q2)
            if idx == 0:
                mean_Q = 0.5 * (target_Q1 + target_Q2) / num_ensemble
            else:
                mean_Q += 0.5 * (target_Q1 + target_Q2) / num_ensemble
        temp_count = 0
        for target_Q in L_target_Q:
            if temp_count == 0:
                var_Q = (target_Q.detach() - mean_Q) ** 2
            else:
                var_Q += (target_Q.detach() - mean_Q) ** 2
            temp_count += 1
        var_Q = var_Q / temp_count
        ucb_score = mean_Q + inference_type * torch.sqrt(var_Q).detach()
    return ucb_score


def ensemble_ucb_rollout(
        env,
        agent,
        critic1,
        critic2,
        inference_type,
        feedback_type,
        num_ensemble,
        noise_flag=0,
        max_path_length=np.inf,
        ber_mean=0.5,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list corresponding to the timestep:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    masks = []  # bootstrap masks: which ensemble members train on each transition
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # Query every ensemble member and act with the candidate action that
        # achieves the highest UCB score.
        a_max, ucb_max, agent_info_max = None, None, None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            ucb_score = get_ucb_std(o, _a, inference_type, critic1, critic2,
                                    feedback_type, en_index, num_ensemble)
            if en_index == 0:
                a_max = _a
                ucb_max = ucb_score
                agent_info_max = agent_info
            elif ucb_score > ucb_max:
                ucb_max = ucb_score
                a_max = _a
                agent_info_max = agent_info
        next_o, r, d, env_info = env.step(a_max)
        if noise_flag == 1:
            r += np.random.normal(0, 1, 1)[0]
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a_max)
        agent_infos.append(agent_info_max)
        env_infos.append(env_info)
        mask = torch.bernoulli(torch.Tensor([ber_mean] * num_ensemble))
        if mask.sum() == 0:
            rand_index = np.random.randint(num_ensemble, size=1)
            mask[rand_index] = 1
        mask = mask.numpy()
        masks.append(mask)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    masks = np.array(masks)
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
        masks=masks,
    )
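# Illustrative sketch (not part of the original module): the UCB action score
# computed by get_ucb_std is mean(Q) + inference_type * std(Q), with the mean and
# population variance taken over the twin critics of one ensemble member
# (feedback_type 0/2) or of all members (feedback_type 1/3). The helper below
# reproduces that formula for a plain list of Q-estimates.
def _example_ucb_score(q_values, inference_type=1.0):
    q = torch.as_tensor(q_values, dtype=torch.float32)
    mean_q = q.mean()
    var_q = ((q - mean_q) ** 2).mean()  # population variance, as in get_ucb_std
    return mean_q + inference_type * torch.sqrt(var_q)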
def ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list corresponding to the timestep:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # Evaluate with the mean action over all ensemble members.
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        next_o, r, d, env_info = env.step(a)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )


def async_ensemble_eval_rollout(
        env,
        agent,
        num_ensemble,
        max_path_length=np.inf,
        render=False,
        render_kwargs=None,
):
    """
    The values for the following keys will be 2D arrays, with the first
    dimension corresponding to the time dimension:
     - observations
     - actions
     - rewards
     - next_observations
     - terminals

    The next two elements will be lists of dictionaries, with the index into
    the list corresponding to the timestep:
     - agent_infos
     - env_infos
    """
    if render_kwargs is None:
        render_kwargs = {}
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    for en_index in range(num_ensemble):
        agent[en_index].reset()
    next_o = None
    path_length = 0
    if render:
        env.render(**render_kwargs)
    while path_length < max_path_length:
        # Evaluate with the mean action over all ensemble members.
        a = None
        for en_index in range(num_ensemble):
            _a, agent_info = agent[en_index].get_action(o)
            if en_index == 0:
                a = _a
            else:
                a += _a
        a = a / num_ensemble
        # The async env's step() is assumed to return only (next_observation, done),
        # so rewards and env_infos are recorded as placeholders below.
        next_o, d = env.step(a)
        observations.append(o)
        rewards.append(0.0)  # placeholder: the async env does not report rewards
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append({})  # placeholder: the async env does not report infos
        path_length += 1
        if d:
            break
        o = next_o
        if render:
            env.render(**render_kwargs)
    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (
            observations[1:, :],
            np.expand_dims(next_o, 0)
        )
    )
    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=agent_infos,
        env_infos=env_infos,
    )
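if __name__ == "__main__":
    # Minimal smoke test (not part of the original module): exercises rollout()
    # with toy stand-ins. DummyEnv and RandomAgent are hypothetical placeholders
    # that follow the old gym-style 4-tuple step API assumed by the code above.
    class DummyEnv:
        def __init__(self, obs_dim=3, act_dim=2):
            self.obs_dim, self.act_dim = obs_dim, act_dim

        def reset(self):
            return np.zeros(self.obs_dim)

        def step(self, action):
            next_obs = np.random.randn(self.obs_dim)
            reward, done, info = 0.0, False, {}
            return next_obs, reward, done, info

    class RandomAgent:
        def __init__(self, act_dim=2):
            self.act_dim = act_dim

        def reset(self):
            pass

        def get_action(self, obs):
            return np.random.randn(self.act_dim), {}

    path = rollout(DummyEnv(), RandomAgent(), max_path_length=10)
    print(path["observations"].shape, path["actions"].shape, path["rewards"].shape)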