jw2yang committed on
Commit
34f3c9b
·
1 Parent(s): 4f00e93
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  os.system('pip install flash-attn --no-build-isolation')
3
  os.system("pip install gradio==4.44.1")
4
 
@@ -11,9 +12,6 @@ from PIL import Image
11
  from transformers import AutoModelForCausalLM, AutoProcessor
12
  import re
13
  import random
14
- from vlms.magma import MagmaAgent
15
- from vlms.llavaov import LLaVAOVAgent
16
- from vlms.qwen2vl import Qwen2VLAgent
17
 
18
  pygame.mixer.quit() # Disable sound
19
 
@@ -36,9 +34,12 @@ STATIC = (0, 0)
36
 
37
  ACTIONS = ["up", "down", "left", "right", "static"]
38
 
 
39
  dtype = torch.bfloat16
40
- agent_1 = MagmaAgent("cuda:0", dtype)
41
- agent_2 = Qwen2VLAgent("cuda:0", dtype)
 
 
42
 
43
  magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
44
  magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))
@@ -76,14 +77,14 @@ class MagmaFindGPU:
76
  elif action == "static":
77
  self.direction = STATIC
78
 
79
- # if self.game_over:
80
- # self.reset()
81
- # return self.render(), self.score
82
 
83
  new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
84
 
85
  if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
86
- # self.game_over = True
87
  return self.render(), self.score
88
 
89
  self.snake = [new_head] # Keep only the head (single block snake)
@@ -132,15 +133,32 @@ class MagmaFindGPU:
132
  def get_state(self):
133
  return self.render()
134
 
135
- game_1 = MagmaFindGPU()
136
- game_2 = MagmaFindGPU()
137
 
138
- def play_game(game, agent):
139
  state, state_som = game.get_state()
140
  pil_img = Image.fromarray(state_som)
141
- action = agent.generate_response(pil_img, "Which mark is closer to green block? Answer with a single number.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  # extract the mark id from the action using a regex
143
- # print(agent.__class__.__name__, action)
144
  match = re.search(r'\d+', action)
145
  if match:
146
  action = match.group(0)
@@ -153,46 +171,42 @@ def play_game(game, agent):
153
  action = random.choice(ACTIONS[:-1])
154
 
155
  img, score = game.step(action)
156
- return img[0], f"Score: {score}"
157
-
158
- def play_game_1():
159
- return play_game(game_1, agent_1)
160
 
161
- def play_game_2():
162
- return play_game(game_2, agent_2)
 
163
 
164
- def reset_games():
165
- game_1.reset()
166
- game_2.reset()
167
- return game_1.render()[0], "Score: 0", game_2.render()[0], "Score: 0"
168
  MARKDOWN = """
169
  <div align="center">
170
  <h2>Magma: A Foundation Model for Multimodal AI Agents</h2>
171
 
172
  \[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] &nbsp; \[[Project Page](https://microsoft.github.io/Magma/)\] &nbsp; \[[Github Repo](https://github.com/microsoft/Magma)\] &nbsp; \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\] &nbsp;
173
 
174
- <h3>Magma Arena: A battle between two agents to collect the green blocks by automatically moving up, down, left and right.</h3>
175
-
176
  This demo is powered by [Gradio](https://gradio.app/).
177
 
 
 
178
  </div>
179
  """
180
 
181
  with gr.Blocks() as interface:
182
  gr.Markdown(MARKDOWN)
183
  with gr.Row():
 
184
  with gr.Column():
185
- img_output_1 = gr.Image(label="{}".format(agent_1.__class__.__name__))
186
- score_output_1 = gr.Text(label="Score 1")
187
- with gr.Column():
188
- img_output_2 = gr.Image(label="{}".format(agent_2.__class__.__name__))
189
- score_output_2 = gr.Text(label="Score 2")
190
-
191
- start_btn = gr.Button("Start/Reset Game")
192
-
193
- interface.load(fn=play_game_1, every=1, inputs=[], outputs=[img_output_1, score_output_1])
194
- interface.load(fn=play_game_2, every=1, inputs=[], outputs=[img_output_2, score_output_2])
195
-
196
- start_btn.click(fn=reset_games, inputs=[], outputs=[img_output_1, score_output_1, img_output_2, score_output_2])
197
-
198
- interface.launch(server_port=7861)
 
1
  import os
2
+ # add a command for installing flash-attn
3
  os.system('pip install flash-attn --no-build-isolation')
4
  os.system("pip install gradio==4.44.1")
5
 
 
12
  from transformers import AutoModelForCausalLM, AutoProcessor
13
  import re
14
  import random
 
 
 
15
 
16
  pygame.mixer.quit() # Disable sound
17
 
 
34
 
35
  ACTIONS = ["up", "down", "left", "right", "static"]
36
 
37
+ # Load AI Model
38
  dtype = torch.bfloat16
39
+ magma_model_id = "microsoft/Magma-8B"
40
+ magam_model = AutoModelForCausalLM.from_pretrained(magma_model_id, trust_remote_code=True, torch_dtype=dtype)
41
+ magma_processor = AutoProcessor.from_pretrained(magma_model_id, trust_remote_code=True)
42
+ magam_model.to("cuda")
43
 
44
  magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
45
  magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))
 
77
  elif action == "static":
78
  self.direction = STATIC
79
 
80
+ if self.game_over:
81
+ self.reset()
82
+ return self.render(), self.score
83
 
84
  new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
85
 
86
  if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
87
+ self.game_over = True
88
  return self.render(), self.score
89
 
90
  self.snake = [new_head] # Keep only the head (single block snake)
 
133
  def get_state(self):
134
  return self.render()
135
 
136
+ game = MagmaFindGPU()
 
137
 
138
+ def play_game():
139
  state, state_som = game.get_state()
140
  pil_img = Image.fromarray(state_som)
141
+ convs = [
142
+ {"role": "system", "content": "You are an agent that can see, talk, and act. Avoid hitting the wall."},
143
+ {"role": "user", "content": "<image_start><image><image_end>\nWhich mark is closer to green block? Answer with a single number."},
144
+ ]
145
+ prompt = magma_processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
146
+ inputs = magma_processor(images=[pil_img], texts=prompt, return_tensors="pt")
147
+ inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
148
+ inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
149
+ inputs = inputs.to("cuda").to(dtype)
150
+ generation_args = {
151
+ "max_new_tokens": 10,
152
+ "temperature": 0.3,
153
+ "do_sample": True,
154
+ "use_cache": True,
155
+ "num_beams": 1,
156
+ }
157
+ with torch.inference_mode():
158
+ generate_ids = magam_model.generate(**inputs, **generation_args)
159
+ generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
160
+ action = magma_processor.decode(generate_ids[0], skip_special_tokens=True).strip()
161
  # extract the mark id from the action using a regex
 
162
  match = re.search(r'\d+', action)
163
  if match:
164
  action = match.group(0)
 
171
  action = random.choice(ACTIONS[:-1])
172
 
173
  img, score = game.step(action)
174
+ img = img[0]
175
+ return img, f"Score: {score}"
 
 
176
 
177
+ def reset_game():
178
+ game.reset()
179
+ return game.render()[0], "Score: 0"
180
 
 
 
 
 
181
  MARKDOWN = """
182
  <div align="center">
183
  <h2>Magma: A Foundation Model for Multimodal AI Agents</h2>
184
 
185
  \[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] &nbsp; \[[Project Page](https://microsoft.github.io/Magma/)\] &nbsp; \[[Github Repo](https://github.com/microsoft/Magma)\] &nbsp; \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\] &nbsp;
186
 
 
 
187
  This demo is powered by [Gradio](https://gradio.app/).
188
 
189
+ <b>Goal: Collects the green blocks by automatically moving up, down, left and right.</b>
190
+
191
  </div>
192
  """
193
 
194
  with gr.Blocks() as interface:
195
  gr.Markdown(MARKDOWN)
196
  with gr.Row():
197
+ image_output = gr.Image(label="Game Screen")
198
  with gr.Column():
199
+ score_output = gr.Text(label="Score", elem_classes="large-text")
200
+ gr.HTML("""
201
+ <style>
202
+ .large-text textarea {
203
+ font-size: 24px !important;
204
+ }
205
+ </style>
206
+ """)
207
+ start_btn = gr.Button("Start/Reset Game")
208
+
209
+ interface.load(fn=play_game, every=1, inputs=[], outputs=[image_output, score_output])
210
+ start_btn.click(fn=reset_game, inputs=[], outputs=[image_output, score_output])
211
+
212
+ interface.launch()
vlms/__pycache__/llavanext.cpython-310.pyc DELETED
Binary file (1.79 kB)
 
vlms/__pycache__/llavaov.cpython-310.pyc DELETED
Binary file (1.81 kB)
 
vlms/__pycache__/magma.cpython-310.pyc DELETED
Binary file (1.93 kB)
 
vlms/__pycache__/qwen25vl.cpython-310.pyc DELETED
Binary file (2.05 kB)
 
vlms/__pycache__/qwen2vl.cpython-310.pyc DELETED
Binary file (2.01 kB)