cotran2 committed on
Commit
12505f0
·
verified ·
1 Parent(s): 9bd7968

pushing model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ videos/ALE/MontezumaRevenge-v5__Montezuma__1__1718240471-eval/rl-video-episode-0.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ videos/ALE/MontezumaRevenge-v5__Montezuma__1__1718240471-eval/rl-video-episode-1.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ videos/ALE/MontezumaRevenge-v5__Montezuma__1__1718240471-eval/rl-video-episode-8.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ replay.mp4 filter=lfs diff=lfs merge=lfs -text
Montezuma.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be604c4575fe02c77bbe2e1a7c08414458d9abcf87f30b4dda9284fcd30269d
3
+ size 6776327
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - ALE/MontezumaRevenge-v5
4
+ - deep-reinforcement-learning
5
+ - reinforcement-learning
6
+ - custom-implementation
7
+ library_name: cleanrl
8
+ model-index:
9
+ - name: DQN
10
+ results:
11
+ - task:
12
+ type: reinforcement-learning
13
+ name: reinforcement-learning
14
+ dataset:
15
+ name: ALE/MontezumaRevenge-v5
16
+ type: ALE/MontezumaRevenge-v5
17
+ metrics:
18
+ - type: mean_reward
19
+ value: 0.00 +/- 0.00
20
+ name: mean_reward
21
+ verified: false
22
+ ---
23
+
24
+ # (CleanRL) **DQN** Agent Playing **ALE/MontezumaRevenge-v5**
25
+
26
+ This is a trained model of a DQN agent playing ALE/MontezumaRevenge-v5.
27
+ The model was trained by using [CleanRL](https://github.com/vwxyzjn/cleanrl) and the most up-to-date training code can be
28
+ found [here](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/dqn_atari.py).
29
+
30
+ ## Get Started
31
+
32
+ To use this model, please install the `cleanrl` package with the following command:
33
+
34
+ ```
35
+ pip install "cleanrl[Montezuma]"
36
+ python -m cleanrl_utils.enjoy --exp-name Montezuma --env-id ALE/MontezumaRevenge-v5
37
+ ```
38
+
39
+ Please refer to the [documentation](https://docs.cleanrl.dev/get-started/zoo/) for more detail.
40
+
41
+
42
+ ## Command to reproduce the training
43
+
44
+ ```bash
45
+ curl -OL https://huggingface.co/cotran2/Montezuma/raw/main/dqn_atari.py
46
+ curl -OL https://huggingface.co/cotran2/Montezuma/raw/main/pyproject.toml
47
+ curl -OL https://huggingface.co/cotran2/Montezuma/raw/main/poetry.lock
48
+ poetry install --all-extras
49
+ python dqn_atari.py --exp-name Montezuma --track --wandb-project-name Montezuma --capture-video --env-id ALE/MontezumaRevenge-v5 --total-timesteps 100000 --buffer-size 400000 --save-model True --upload-model True --hf-entity cotran2
50
+ ```
51
+
52
+ # Hyperparameters
53
+ ```python
54
+ {'batch_size': 32,
55
+ 'buffer_size': 400000,
56
+ 'capture_video': True,
57
+ 'cuda': True,
58
+ 'end_e': 0.01,
59
+ 'env_id': 'ALE/MontezumaRevenge-v5',
60
+ 'exp_name': 'Montezuma',
61
+ 'exploration_fraction': 0.1,
62
+ 'gamma': 0.99,
63
+ 'hf_entity': 'cotran2',
64
+ 'learning_rate': 0.0001,
65
+ 'learning_starts': 80000,
66
+ 'num_envs': 1,
67
+ 'save_model': True,
68
+ 'seed': 1,
69
+ 'start_e': 1,
70
+ 'target_network_frequency': 1000,
71
+ 'tau': 1.0,
72
+ 'torch_deterministic': True,
73
+ 'total_timesteps': 100000,
74
+ 'track': True,
75
+ 'train_frequency': 4,
76
+ 'upload_model': True,
77
+ 'wandb_entity': None,
78
+ 'wandb_project_name': 'Montezuma'}
79
+ ```
80
+
dqn_atari.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import random
4
+ import time
5
+ from distutils.util import strtobool
6
+
7
+ import gymnasium as gym
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch.optim as optim
13
+ from stable_baselines3.common.atari_wrappers import (
14
+ ClipRewardEnv,
15
+ EpisodicLifeEnv,
16
+ FireResetEnv,
17
+ MaxAndSkipEnv,
18
+ NoopResetEnv
19
+ )
20
+ from stable_baselines3.common.buffers import ReplayBuffer
21
+ from torch.utils.tensorboard import SummaryWriter
22
+
23
+
24
+ def parse_args():
25
+ # fmt: off
26
+ parser = argparse.ArgumentParser()
27
+ parser.add_argument("--exp-name", type=str, default=os.path.basename(__file__).rstrip(".py"),
28
+ help="the name of this experiment")
29
+ parser.add_argument("--seed", type=int, default=1,
30
+ help="seed of the experiment")
31
+ parser.add_argument("--torch-deterministic", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
32
+ help="if toggled, `torch.backends.cudnn.deterministic=False`")
33
+ parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
34
+ help="if toggled, cuda will be enabled by default")
35
+ parser.add_argument("--track", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
36
+ help="if toggled, this experiment will be tracked with Weights and Biases")
37
+ parser.add_argument("--wandb-project-name", type=str, default="cleanRL",
38
+ help="the wandb's project name")
39
+ parser.add_argument("--wandb-entity", type=str, default=None,
40
+ help="the entity (team) of wandb's project")
41
+ parser.add_argument("--capture-video", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
42
+ help="whether to capture videos of the agent performances (check out `videos` folder)")
43
+ parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
44
+ help="whether to save model into the `runs/{run_name}` folder")
45
+ parser.add_argument("--upload-model", type=lambda x: bool(strtobool(x)), default=False, nargs="?", const=True,
46
+ help="whether to upload the saved model to huggingface")
47
+ parser.add_argument("--hf-entity", type=str, default="",
48
+ help="the user or org name of the model repository from the Hugging Face Hub")
49
+
50
+ # Algorithm specific arguments
51
+ parser.add_argument("--env-id", type=str, default="BreakoutNoFrameskip-v4",
52
+ help="the id of the environment")
53
+ parser.add_argument("--total-timesteps", type=int, default=100000,
54
+ help="total timesteps of the experiments")
55
+ parser.add_argument("--learning-rate", type=float, default=1e-4,
56
+ help="the learning rate of the optimizer")
57
+ parser.add_argument("--num-envs", type=int, default=1,
58
+ help="the number of parallel game environments")
59
+ parser.add_argument("--buffer-size", type=int, default=1000000,
60
+ help="the replay memory buffer size")
61
+ parser.add_argument("--gamma", type=float, default=0.99,
62
+ help="the discount factor gamma")
63
+ parser.add_argument("--tau", type=float, default=1.,
64
+ help="the target network update rate")
65
+ parser.add_argument("--target-network-frequency", type=int, default=1000,
66
+ help="the timesteps it takes to update the target network")
67
+ parser.add_argument("--batch-size", type=int, default=32,
68
+ help="the batch size of sample from the reply memory")
69
+ parser.add_argument("--start-e", type=float, default=1,
70
+ help="the starting epsilon for exploration")
71
+ parser.add_argument("--end-e", type=float, default=0.01,
72
+ help="the ending epsilon for exploration")
73
+ parser.add_argument("--exploration-fraction", type=float, default=0.10,
74
+ help="the fraction of `total-timesteps` it takes from start-e to go end-e")
75
+ parser.add_argument("--learning-starts", type=int, default=80000,
76
+ help="timestep to start learning")
77
+ parser.add_argument("--train-frequency", type=int, default=4,
78
+ help="the frequency of training")
79
+ args = parser.parse_args()
80
+ # fmt: on
81
+ assert args.num_envs == 1, "vectorized envs are not supported at the moment"
82
+
83
+ return args
84
+
85
+
86
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: id passed to ``gym.make``.
        seed: seed applied to the environment's action space.
        idx: index of this environment; only index 0 records video.
        capture_video: whether video recording is requested.
        run_name: run identifier used for the ``videos/{run_name}`` folder.

    Returns:
        A callable that creates and returns the wrapped environment.
    """

    def build_env():
        record_video = capture_video and idx == 0
        if record_video:
            # Rendering to RGB arrays is needed for RecordVideo to capture frames.
            base_env = gym.make(env_id, render_mode="rgb_array")
            base_env = gym.wrappers.RecordVideo(base_env, f"videos/{run_name}")
        else:
            base_env = gym.make(env_id)

        wrapped = gym.wrappers.RecordEpisodeStatistics(base_env)
        wrapped = NoopResetEnv(wrapped, noop_max=30)
        wrapped = MaxAndSkipEnv(wrapped, skip=4)
        wrapped = EpisodicLifeEnv(wrapped)

        # Only wrap with FireResetEnv when the game exposes a FIRE action.
        action_meanings = wrapped.unwrapped.get_action_meanings()
        if "FIRE" in action_meanings:
            wrapped = FireResetEnv(wrapped)

        wrapped = ClipRewardEnv(wrapped)
        wrapped = gym.wrappers.ResizeObservation(wrapped, (84, 84))
        wrapped = gym.wrappers.GrayScaleObservation(wrapped)
        wrapped = gym.wrappers.FrameStack(wrapped, 4)
        wrapped.action_space.seed(seed)

        return wrapped

    return build_env
111
+
112
+
113
class QNetwork(nn.Module):
    """Convolutional Q-value network with one output per discrete action.

    The layers live in ``self.network`` in a fixed order, so parameter names
    (``network.0.weight`` ...) stay stable for saved checkpoints.
    """

    def __init__(self, env):
        """Build the network; ``env.single_action_space.n`` sets the output width."""
        super().__init__()
        num_actions = env.single_action_space.n
        feature_layers = [
            nn.Conv2d(4, 32, 8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        # 3136 == 64 * 7 * 7, the flattened conv output for 4x84x84 inputs.
        head_layers = [
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions),
        ]
        self.network = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Return Q-values for ``x`` after dividing the input by 255."""
        scaled = x / 255.0
        return self.network(scaled)
131
+
132
+
133
def linear_schedule(start_e: float, end_e: float, duration: float, t: int) -> float:
    """Linearly anneal epsilon from ``start_e`` towards ``end_e``.

    Args:
        start_e: epsilon at step 0.
        end_e: floor that the schedule never goes below.
        duration: number of steps over which the anneal happens.
        t: current step.

    Returns:
        The epsilon for step ``t``, clamped from below at ``end_e``.
    """
    # A zero-length schedule (e.g. exploration fraction 0) would divide by zero;
    # there is nothing to anneal, so go straight to the final value.
    if duration <= 0:
        return end_e
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)
136
+
137
if __name__ == "__main__":
    # Train a DQN agent, optionally save/evaluate it, and optionally push it to the Hub.
    import stable_baselines3 as sb3
    from dotenv import find_dotenv, load_dotenv

    # Pull variables such as HF_TOKEN from a .env file, if one is found.
    load_dotenv(find_dotenv())
    HF_TOKEN = os.environ.get("HF_TOKEN")

    # Compare the major version numerically: a plain string comparison is
    # lexicographic and would treat e.g. "10.0" as older than "2.0".
    sb3_major_version = int(sb3.__version__.split(".")[0])
    if sb3_major_version < 2:
        raise ValueError(
            """On going migration: run the following command to install new dependencies
pip install "stable_baselines3==2.0.0a1" "gymnasium[atari,accept-rom-license]==0.28.1" "ale-py==0.8.1"
"""
        )

    args = parse_args()

    # Authenticate up front (so a bad token fails before training starts), but
    # only when the run will actually upload to the Hub.
    if args.upload_model:
        from huggingface_hub import login

        login(HF_TOKEN)

    run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
    if args.track:
        import wandb

        wandb.init(
            project=args.wandb_project_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )

    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # Seed every source of randomness used below.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = args.torch_deterministic

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    envs = gym.vector.SyncVectorEnv(
        [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
    )
    assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

    q_network = QNetwork(envs).to(device)
    optimizer = optim.Adam(q_network.parameters(), lr=args.learning_rate)
    target_network = QNetwork(envs).to(device)
    target_network.load_state_dict(q_network.state_dict())

    rb = ReplayBuffer(
        args.buffer_size,
        envs.single_observation_space,
        envs.single_action_space,
        device,
        optimize_memory_usage=True,
        handle_timeout_termination=False,
    )
    start_time = time.time()

    obs, _ = envs.reset(seed=args.seed)
    for global_step in range(args.total_timesteps):
        # Epsilon-greedy action selection with a linearly decaying epsilon.
        epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step)
        if random.random() < epsilon:
            actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)])
        else:
            q_values = q_network(torch.Tensor(obs).to(device))
            actions = torch.argmax(q_values, dim=1).cpu().numpy()

        next_obs, rewards, terminated, truncated, infos = envs.step(actions)

        if "final_info" in infos:
            for info in infos["final_info"]:
                # Entries may be None for sub-environments that did not finish.
                if info is None or "episode" not in info:
                    continue
                print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
                writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step)
                writer.add_scalar("charts/episode_length", info["episode"]["l"], global_step)
                writer.add_scalar("charts/epsilon", epsilon, global_step)

        # On truncation, store the final observation from `infos` rather than
        # the observation returned by the step call.
        real_next_obs = next_obs.copy()
        for idx, d in enumerate(truncated):
            if d:
                real_next_obs[idx] = infos["final_observation"][idx]
        rb.add(obs, real_next_obs, actions, rewards, terminated, infos)

        obs = next_obs

        if global_step > args.learning_starts:
            if global_step % args.train_frequency == 0:
                data = rb.sample(args.batch_size)
                with torch.no_grad():
                    target_max, _ = target_network(data.next_observations).max(dim=1)
                    td_target = data.rewards.flatten() + args.gamma * target_max * (1 - data.dones.flatten())
                old_val = q_network(data.observations).gather(1, data.actions).squeeze()
                loss = F.mse_loss(td_target, old_val)

                if global_step % 100 == 0:
                    writer.add_scalar("losses/td_loss", loss, global_step)
                    writer.add_scalar("losses/q_values", old_val.mean().item(), global_step)
                    print("SPS:", int(global_step / (time.time() - start_time)))
                    writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Blend the online weights into the target network (tau=1.0 is a hard copy).
            if global_step % args.target_network_frequency == 0:
                for target_network_param, q_network_param in zip(target_network.parameters(), q_network.parameters()):
                    target_network_param.data.copy_(
                        args.tau * q_network_param.data + (1.0 - args.tau) * target_network_param.data
                    )

    if args.save_model:
        model_path = f"runs/{run_name}/{args.exp_name}.pth"
        torch.save(q_network.state_dict(), model_path)
        print(f"model saved to {model_path}")

        from dqn_eval import evaluate

        episodic_returns = evaluate(
            model_path,
            make_env,
            args.env_id,
            eval_episode=10,
            run_name=f"{run_name}-eval",
            Model=QNetwork,
            device=device,
            epsilon=0.05,
        )

        for idx, episodic_return in enumerate(episodic_returns):
            writer.add_scalar("eval/episodic_return", episodic_return, idx)

        if args.upload_model:
            from huggingface import push_to_hub

            repo_name = f"{args.exp_name}"
            repo_id = f"{args.hf_entity}/{repo_name}" if args.hf_entity else repo_name
            push_to_hub(args, episodic_returns, repo_id, "DQN", f"runs/{run_name}", f"videos/{run_name}-eval")

    envs.close()
    writer.close()
285
+
events.out.tfevents.1718240475.Acer.11548.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac2628bb5538773ac11c99615d8e752cef92f228caee55516d33033981f9fa18
3
+ size 385732
replay.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0527f30166c9bbbd735342d0edc2bbc120d279facb484d90274f2bab395716ae
3
+ size 3181955
videos//ALE//MontezumaRevenge-v5__Montezuma__1__1718240471-eval//rl-video-episode-0.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ee63c910640ef273371f3370879316cfb10801154791d472ff226e02f7523b9
3
+ size 3283903
videos//ALE//MontezumaRevenge-v5__Montezuma__1__1718240471-eval//rl-video-episode-1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f4d1848b2a538dc22118053bb97c7887474f826def11351326e17ee363860e1
3
+ size 3292630
videos//ALE//MontezumaRevenge-v5__Montezuma__1__1718240471-eval//rl-video-episode-8.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0527f30166c9bbbd735342d0edc2bbc120d279facb484d90274f2bab395716ae
3
+ size 3181955