# Hugging Face Space: running on A10G
import d4rl  # noqa
import gym
import tqdm
from diffusers.experimental import ValueGuidedRLPipeline
# Sampling settings for the value-guided planner.
# NOTE(review): this dict is not read anywhere in the visible script; the
# pipeline call passes planning_horizon=32 directly. Confirm whether these
# values are meant to be forwarded to the pipeline or are documentation only.
config = {
    "n_samples": 64,
    "horizon": 32,  # same value as the planning_horizon passed to the pipeline
    "num_inference_steps": 20,
    "n_guide_steps": 2,  # can set to 0 for faster sampling, does not use value network
    "scale_grad_by_std": True,
    "scale": 0.1,
    "eta": 0.0,
    "t_grad_cutoff": 2,
    "device": "cpu",
}
if __name__ == "__main__":
    # Roll out the pretrained value-guided diffusion planner on the D4RL
    # Hopper task for a single episode of at most T steps, printing the
    # per-step reward, running return and normalized score.
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    # observations are collected for later rendering
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)

            # update the return first so the normalized score reported for
            # this step matches the "Total Reward" printed next to it
            total_reward += reward
            score = env.get_normalized_score(total_reward)
            # NOTE(review): total_score sums the running score of every step;
            # it is not the episode's normalized score. Kept as-is for output
            # compatibility.
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation

            # stepping an environment after it reports termination is
            # undefined and would add rewards from a finished episode
            if terminal:
                break
    except KeyboardInterrupt:
        # allow Ctrl-C to stop the rollout early and still report the return
        pass

    print(f"Total reward: {total_reward}")