adhisetiawan committed on
Commit
3cd94a5
1 Parent(s): c01ce98

pushing model

Browse files
Qbert-v5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:494b692b2d8ee39f4e5839ce035f20068fae68e1d5ce41624a1719e5336d7cdd
3
+ size 6752403
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - ALE/Qbert-v5
4
+ - deep-reinforcement-learning
5
+ - reinforcement-learning
6
+ - custom-implementation
7
+ library_name: cleanrl
8
+ model-index:
9
+ - name: DQN
10
+ results:
11
+ - task:
12
+ type: reinforcement-learning
13
+ name: reinforcement-learning
14
+ dataset:
15
+ name: ALE/Qbert-v5
16
+ type: ALE/Qbert-v5
17
+ metrics:
18
+ - type: mean_reward
19
+ value: 4122.50 +/- 314.73
20
+ name: mean_reward
21
+ verified: false
22
+ ---
23
+
24
+ # (CleanRL) **DQN** Agent Playing **ALE/Qbert-v5**
25
+
26
+ This is a trained model of a DQN agent playing ALE/Qbert-v5.
27
+ The model was trained by using [CleanRL](https://github.com/vwxyzjn/cleanrl) and the most up-to-date training code can be
28
+ found [here](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/dqn_atari.py).
29
+
30
+ ## Get Started
31
+
32
+ To use this model, please install the `cleanrl` package with the following command:
33
+
34
+ ```
35
+ pip install "cleanrl[Qbert-v5]"
36
+ python -m cleanrl_utils.enjoy --exp-name Qbert-v5 --env-id ALE/Qbert-v5
37
+ ```
38
+
39
+ Please refer to the [documentation](https://docs.cleanrl.dev/get-started/zoo/) for more detail.
40
+
41
+
42
+ ## Command to reproduce the training
43
+
44
+ ```bash
45
+ curl -OL https://huggingface.co/adhisetiawan/Qbert-v5/raw/main/dqn_atari.py
46
+ curl -OL https://huggingface.co/adhisetiawan/Qbert-v5/raw/main/pyproject.toml
47
+ curl -OL https://huggingface.co/adhisetiawan/Qbert-v5/raw/main/poetry.lock
48
+ poetry install --all-extras
49
+ python dqn_atari.py --exp-name Qbert-v5 --track --wandb-project-name ALE --capture-video --env-id ALE/Qbert-v5 --total-timesteps 1000000 --buffer-size 400000 --save-model --upload-model --hf-entity adhisetiawan
50
+ ```
51
+
52
+ ## Hyperparameters
53
+ ```python
54
+ {'batch_size': 32,
55
+ 'buffer_size': 400000,
56
+ 'capture_video': True,
57
+ 'cuda': True,
58
+ 'end_e': 0.01,
59
+ 'env_id': 'ALE/Qbert-v5',
60
+ 'exp_name': 'Qbert-v5',
61
+ 'exploration_fraction': 0.1,
62
+ 'gamma': 0.99,
63
+ 'hf_entity': 'adhisetiawan',
64
+ 'learning_rate': 0.0001,
65
+ 'learning_starts': 80000,
66
+ 'num_envs': 1,
67
+ 'save_model': True,
68
+ 'seed': 1,
69
+ 'start_e': 1,
70
+ 'target_network_frequency': 1000,
71
+ 'tau': 1.0,
72
+ 'torch_deterministic': True,
73
+ 'total_timesteps': 1000000,
74
+ 'track': True,
75
+ 'train_frequency': 4,
76
+ 'upload_model': True,
77
+ 'wandb_entity': None,
78
+ 'wandb_project_name': 'ALE'}
79
+ ```
80
+
dqn_atari.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import random
4
+ import time
5
+ from distutils.util import strtobool
6
+
7
+ import gymnasium as gym
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.optim as optim
13
+ from stable_baselines3.common.atari_wrappers import (
14
+ ClipRewardEnv,
15
+ EpisodicLifeEnv,
16
+ FireResetEnv,
17
+ MaxAndSkipEnv,
18
+ NoopResetEnv
19
+ )
20
+ from stable_baselines3.common.buffers import ReplayBuffer
21
+ from torch.utils.tensorboard import SummaryWriter
22
+
23
+
24
def parse_args():
    """Build the command-line interface of the DQN Atari script and parse ``sys.argv``.

    Returns:
        argparse.Namespace: experiment settings (tracking, saving, uploading)
        and algorithm hyperparameters.

    Raises:
        AssertionError: if ``--num-envs`` is anything other than 1.
    """

    def _parse_bool(value):
        # Accepts the same spellings as ``distutils.util.strtobool`` without
        # depending on ``distutils``, which was removed in Python 3.12.
        lowered = value.lower()
        if lowered in ("y", "yes", "t", "true", "on", "1"):
            return True
        if lowered in ("n", "no", "f", "false", "off", "0"):
            return False
        raise ValueError(f"invalid truth value {lowered!r}")

    # Default experiment name: the script file name without its ".py" suffix.
    # ``str.rstrip(".py")`` must not be used here: it strips any trailing run
    # of the characters '.', 'p' and 'y' (e.g. "happy.py" -> "ha").
    script_name = os.path.basename(__file__)
    if script_name.endswith(".py"):
        script_name = script_name[: -len(".py")]

    # A bare flag ("--track") means True; an explicit value ("--track false")
    # is parsed by ``_parse_bool``.
    toggle = {"type": _parse_bool, "nargs": "?", "const": True}

    # fmt: off
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp-name", type=str, default=script_name,
        help="the name of this experiment")
    parser.add_argument("--seed", type=int, default=1,
        help="seed of the experiment")
    parser.add_argument("--torch-deterministic", default=True, **toggle,
        help="if toggled, `torch.backends.cudnn.deterministic=False`")
    parser.add_argument("--cuda", default=True, **toggle,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--track", default=False, **toggle,
        help="if toggled, this experiment will be tracked with Weights and Biases")
    parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
        help="the wandb's project name")
    parser.add_argument("--wandb-entity", type=str, default=None,
        help="the entity (team) of wandb's project")
    parser.add_argument("--capture-video", default=False, **toggle,
        help="whether to capture videos of the agent performances (check out `videos` folder)")
    parser.add_argument("--save-model", default=False, **toggle,
        help="whether to save model into the `runs/{run_name}` folder")
    parser.add_argument("--upload-model", default=False, **toggle,
        help="whether to upload the saved model to huggingface")
    parser.add_argument("--hf-entity", type=str, default="",
        help="the user or org name of the model repository from the Hugging Face Hub")

    # Algorithm specific arguments
    parser.add_argument("--env-id", type=str, default="BreakoutNoFrameskip-v4",
        help="the id of the environment")
    parser.add_argument("--total-timesteps", type=int, default=10000000,
        help="total timesteps of the experiments")
    parser.add_argument("--learning-rate", type=float, default=1e-4,
        help="the learning rate of the optimizer")
    parser.add_argument("--num-envs", type=int, default=1,
        help="the number of parallel game environments")
    parser.add_argument("--buffer-size", type=int, default=1000000,
        help="the replay memory buffer size")
    parser.add_argument("--gamma", type=float, default=0.99,
        help="the discount factor gamma")
    parser.add_argument("--tau", type=float, default=1.,
        help="the target network update rate")
    parser.add_argument("--target-network-frequency", type=int, default=1000,
        help="the timesteps it takes to update the target network")
    parser.add_argument("--batch-size", type=int, default=32,
        help="the batch size of sample from the reply memory")
    parser.add_argument("--start-e", type=float, default=1,
        help="the starting epsilon for exploration")
    parser.add_argument("--end-e", type=float, default=0.01,
        help="the ending epsilon for exploration")
    parser.add_argument("--exploration-fraction", type=float, default=0.10,
        help="the fraction of `total-timesteps` it takes from start-e to go end-e")
    parser.add_argument("--learning-starts", type=int, default=80000,
        help="timestep to start learning")
    parser.add_argument("--train-frequency", type=int, default=4,
        help="the frequency of training")
    args = parser.parse_args()
    # fmt: on
    assert args.num_envs == 1, "vectorized envs are not supported at the moment"

    return args
84
+
85
+
86
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped Atari environment.

    Args:
        env_id: environment id passed to ``gym.make``.
        seed: seed applied to the environment's action space.
        idx: index of this environment; only index 0 records video.
        capture_video: whether video recording is requested.
        run_name: run identifier used for the ``videos/{run_name}`` folder.

    Returns:
        A callable that creates the environment when invoked.
    """
    record_video = capture_video and idx == 0

    def thunk():
        if not record_video:
            env = gym.make(env_id)
        else:
            # Frames must be rendered as arrays for RecordVideo to capture them.
            env = gym.wrappers.RecordVideo(
                gym.make(env_id, render_mode="rgb_array"),
                f"videos/{run_name}",
            )

        # Episode statistics are wrapped first, before the reward-clipping and
        # episodic-life wrappers are applied on top.
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env = NoopResetEnv(env, noop_max=30)
        env = MaxAndSkipEnv(env, skip=4)
        env = EpisodicLifeEnv(env)

        # Only games that expose a FIRE action get the FireResetEnv wrapper.
        action_meanings = env.unwrapped.get_action_meanings()
        if "FIRE" in action_meanings:
            env = FireResetEnv(env)

        env = ClipRewardEnv(env)

        # Observation pipeline: resize to 84x84, grayscale, stack 4 frames.
        env = gym.wrappers.ResizeObservation(env, (84, 84))
        env = gym.wrappers.GrayScaleObservation(env)
        env = gym.wrappers.FrameStack(env, 4)

        env.action_space.seed(seed)
        return env

    return thunk
111
+
112
+
113
class QNetwork(nn.Module):
    """Convolutional Q-network mapping stacked frames to one value per action.

    The input is expected to have 4 channels; the 3136-wide linear layer
    equals 64 * 7 * 7, which is what the three convolutions produce for
    84x84 inputs.
    """

    def __init__(self, env):
        """Create the layers; the output width is ``env.single_action_space.n``."""
        super().__init__()
        num_actions = env.single_action_space.n
        # Layers are created in forward order so parameter indices (and hence
        # state_dict keys) are 0, 2, 4, 7 and 9.
        layers = [
            nn.Conv2d(4, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Divide the input by 255.0 and return the network output."""
        scaled = x / 255.0
        return self.network(scaled)
131
+
132
+
133
def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    """Linearly anneal epsilon from ``start_e`` towards ``end_e``.

    Args:
        start_e: value returned at ``t == 0``.
        end_e: floor of the schedule; the result is never below it.
        duration: number of steps over which the value moves from
            ``start_e`` to ``end_e``.
        t: current step.

    Returns:
        ``start_e + t * (end_e - start_e) / duration``, clamped from below by
        ``end_e``. If ``duration`` is not positive there is no annealing
        phase, so ``end_e`` is returned.
    """
    # A zero duration (e.g. an exploration fraction of 0) would otherwise
    # raise ZeroDivisionError when computing the slope.
    if duration <= 0:
        return end_e
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)
136
+
137
# Script entry point: parse arguments, build the environment and networks,
# run the DQN training loop, then optionally save, evaluate and upload the model.
if __name__ == "__main__":
    import stable_baselines3 as sb3

    # Compare the major version numerically. A plain string comparison against
    # "2.0" is lexicographic and would wrongly reject e.g. "10.0.0".
    if int(sb3.__version__.split(".")[0]) < 2:
        raise ValueError(
            """On going migration: run the following command to install new dependencies
pip install "stable_baselines3==2.0.0a1" "gymnasium[atari,accept-rom-license]==0.28.1" "ale-py==0.8.1"
"""
        )

    args = parse_args()
    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        # Imported lazily so wandb is only required when tracking is enabled.
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )

    writer = SummaryWriter(f"runs/{run_name}")
    # Log every hyperparameter as a Markdown table.
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # Seed every random number generator used by the script.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Environment setup (parse_args asserts num_envs == 1).
    envs = gym.vector.SyncVectorEnv(
        [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
    )
    assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

    # Online network, its optimizer, and a target network starting from the same weights.
    q_network = QNetwork(envs).to(device)
    optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
    target_network = QNetwork(envs).to(device)
    target_network.load_state_dict(q_network.state_dict())

    rb = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        optimize_memory_usage=True,
        handle_timeout_termination=False,
    )
    start_time = time.time()

    obs, _ = envs.reset(seed=args.seed)
    for global_step in range(args.total_timesteps):
        # Epsilon-greedy action selection with a linearly decaying epsilon.
        epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
        if random.random() < epsilon:
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            q_values = q_network(torch.Tensor(obs).to(device))
            actions = torch.argmax(q_values, dim=1).cpu().numpy()

        next_obs, rewards, terminated, truncated, infos = envs.step(actions)

        # Log episode statistics whenever the step reports finished episodes.
        if "final_info" in infos:
            for info in infos["final_info"]:
                # Skip entries without episode statistics (including None
                # placeholders for environments that did not finish).
                if info is None or "episode" not in info:
                    continue
                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                writer.add_scalar("charts/episode_length", info["episode"]["l"], global_step)
                writer.add_scalar("charts/epsilon", epsilon, global_step)

        # For truncated episodes, store the reported final observation as the
        # next observation instead of the one returned by the step.
        real_next_obs = next_obs.copy()
        for idx, d in enumerate(truncated):
            if d:
                real_next_obs[idx] = infos["final_observation"][idx]
        rb.add(obs, real_next_obs, actions, rewards, terminated, infos)

        obs = next_obs

        # Training starts only after `learning_starts` steps have been collected.
        if global_step > args.learning_starts:
            if global_step % args.train_frequency == 0:
                data = rb.sample(args.batch_size)
                # One-step TD target from the target network; no gradient flows through it.
                with torch.no_grad():
                    target_max, _ = target_network(data.next_observations).max(dim=1)
                    td_target = data.rewards.flatten() + args.gamma * target_max * (1 - data.dones.flatten())
                old_val = q_network(data.observations).gather(1, data.actions).squeeze()
                loss = F.mse_loss(td_target, old_val)

                if global_step % 100 == 0:
                    writer.add_scalar("losses/td_loss", loss, global_step)
                    writer.add_scalar("losses/q_values", old_val.mean().item(), global_step)
                    print("SPS:", int(global_step / (time.time() - start_time)))
                    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Blend the online weights into the target network; with tau == 1.0
            # this is a plain copy.
            if global_step % args.target_network_frequency == 0:
                for target_network_param, q_network_param in zip(target_network.parameters(), q_network.parameters()):
                    target_network_param.data.copy_(
                        args.tau * q_network_param.data + (1.0 - args.tau) * target_network_param.data
                    )

    if args.save_model:
        model_path = f"runs/{run_name}/{args.exp_name}.pth"
        torch.save(q_network.state_dict(), model_path)
        print(f"model saved to {model_path}")

        # NOTE(review): `dqn_eval` and `huggingface` are imported as top-level
        # modules; confirm they are available next to this script.
        from dqn_eval import evaluate

        episodic_returns = evaluate(
            model_path,
            make_env,
            args.env_id,
            eval_episode=10,
            run_name=f"{run_name}-eval",
            Model=QNetwork,
            device=device,
            epsilon=0.05,
        )

        for idx, episodic_return in enumerate(episodic_returns):
            writer.add_scalar("eval/episodic_return", episodic_return, idx)

        # Uploading sits inside the save branch because it needs the saved
        # run folder and the evaluation returns computed above.
        if args.upload_model:
            from huggingface import push_to_hub

            repo_name = f"{args.exp_name}"
            repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
            push_to_hub(args, episodic_returns, repo_id, "DQN", f"runs/{run_name}", f"videos/{run_name}-eval")

    envs.close()
    writer.close()
278
+
events.out.tfevents.1693372331.LAPTOP-9SN8UL2M.15381.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9f60fb9213737d663b61beb4bc3d75a85a5ba89d0f261997d1327de14464207
3
+ size 2545882
replay.mp4 ADDED
Binary file (150 kB). View file
 
videos/ALE/Qbert-v5__Qbert-v5__1__1693372327-eval/rl-video-episode-0.mp4 ADDED
Binary file (128 kB). View file
 
videos/ALE/Qbert-v5__Qbert-v5__1__1693372327-eval/rl-video-episode-1.mp4 ADDED
Binary file (162 kB). View file
 
videos/ALE/Qbert-v5__Qbert-v5__1__1693372327-eval/rl-video-episode-8.mp4 ADDED
Binary file (150 kB). View file