Spaces:

microsoft
/

Magma-Gaming

Configuration error

App Files Files Community

jw2yang committed 21 days ago

Commit

34f3c9b

1 Parent(s): 4f00e93

update

Browse files

Files changed (6) hide show

app.py +54 -40
vlms/__pycache__/llavanext.cpython-310.pyc +0 -0
vlms/__pycache__/llavaov.cpython-310.pyc +0 -0
vlms/__pycache__/magma.cpython-310.pyc +0 -0
vlms/__pycache__/qwen25vl.cpython-310.pyc +0 -0
vlms/__pycache__/qwen2vl.cpython-310.pyc +0 -0

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 os.system('pip install flash-attn --no-build-isolation')
 os.system("pip install gradio==4.44.1")
@@ -11,9 +12,6 @@ from PIL import Image
 from transformers import AutoModelForCausalLM, AutoProcessor
 import re
 import random
-from vlms.magma import MagmaAgent
-from vlms.llavaov import LLaVAOVAgent
-from vlms.qwen2vl import Qwen2VLAgent
 pygame.mixer.quit()  # Disable sound
@@ -36,9 +34,12 @@ STATIC = (0, 0)
 ACTIONS = ["up", "down", "left", "right", "static"]
 dtype = torch.bfloat16
-agent_1 = MagmaAgent("cuda:0", dtype)
-agent_2 = Qwen2VLAgent("cuda:0", dtype)
 magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
 magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))
@@ -76,14 +77,14 @@ class MagmaFindGPU:
         elif action == "static":
             self.direction = STATIC
-        # if self.game_over:
-        #     self.reset()
-        #     return self.render(), self.score
         new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
         if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
-            # self.game_over = True
             return self.render(), self.score
         self.snake = [new_head]  # Keep only the head (single block snake)
@@ -132,15 +133,32 @@ class MagmaFindGPU:
     def get_state(self):
         return self.render()
-game_1 = MagmaFindGPU()
-game_2 = MagmaFindGPU()
-def play_game(game, agent):
     state, state_som = game.get_state()
     pil_img = Image.fromarray(state_som)
-    action = agent.generate_response(pil_img, "Which mark is closer to green block? Answer with a single number.")
     # extract the mark id from the action using re
-    # print(agent.__class__.__name__, action)
     match = re.search(r'\d+', action)
     if match:
         action = match.group(0)
@@ -153,46 +171,42 @@ def play_game(game, agent):
         action = random.choice(ACTIONS[:-1])
     img, score = game.step(action)
-    return img[0], f"Score: {score}"
-def play_game_1():
-    return play_game(game_1, agent_1)
-def play_game_2():
-    return play_game(game_2, agent_2)
-def reset_games():
-    game_1.reset()
-    game_2.reset()
-    return game_1.render()[0], "Score: 0", game_2.render()[0], "Score: 0"
 MARKDOWN = """
 <div align="center">
 <h2>Magma: A Foundation Model for Multimodal AI Agents</h2>
 \[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] &nbsp; \[[Project Page](https://microsoft.github.io/Magma/)\] &nbsp; \[[Github Repo](https://github.com/microsoft/Magma)\] &nbsp; \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\] &nbsp;
-<h3>Magma Arena: A battle between two agents to collect the green blocks by automatically moving up, down, left and right.</h3>
 This demo is powered by [Gradio](https://gradio.app/).
 </div>
 """
 with gr.Blocks() as interface:
     gr.Markdown(MARKDOWN)
     with gr.Row():
         with gr.Column():
-            img_output_1 = gr.Image(label="{}".format(agent_1.__class__.__name__))
-            score_output_1 = gr.Text(label="Score 1")
-        with gr.Column():
-            img_output_2 = gr.Image(label="{}".format(agent_2.__class__.__name__))
-            score_output_2 = gr.Text(label="Score 2")
-    start_btn = gr.Button("Start/Reset Game")
-    interface.load(fn=play_game_1, every=1, inputs=[], outputs=[img_output_1, score_output_1])
-    interface.load(fn=play_game_2, every=1, inputs=[], outputs=[img_output_2, score_output_2])
-    start_btn.click(fn=reset_games, inputs=[], outputs=[img_output_1, score_output_1, img_output_2, score_output_2])
-interface.launch(server_port=7861)

 import os
+# add a command for installing flash-attn
 os.system('pip install flash-attn --no-build-isolation')
 os.system("pip install gradio==4.44.1")
 from transformers import AutoModelForCausalLM, AutoProcessor
 import re
 import random
 pygame.mixer.quit()  # Disable sound
 ACTIONS = ["up", "down", "left", "right", "static"]
+# Load AI Model
 dtype = torch.bfloat16
+magma_model_id = "microsoft/Magma-8B"
+magam_model = AutoModelForCausalLM.from_pretrained(magma_model_id, trust_remote_code=True, torch_dtype=dtype)
+magma_processor = AutoProcessor.from_pretrained(magma_model_id, trust_remote_code=True)
+magam_model.to("cuda")
 magma_img = pygame.image.load("./assets/images/magma_game_thin.png")
 magma_img = pygame.transform.scale(magma_img, (GRID_SIZE, GRID_SIZE))
         elif action == "static":
             self.direction = STATIC
+        if self.game_over:
+            self.reset()
+            return self.render(), self.score
         new_head = (self.snake[0][0] + self.direction[0], self.snake[0][1] + self.direction[1])
         if new_head[0] < 0 or new_head[1] < 0 or new_head[0] >= WIDTH // GRID_SIZE or new_head[1] >= HEIGHT // GRID_SIZE:
+            self.game_over = True
             return self.render(), self.score
         self.snake = [new_head]  # Keep only the head (single block snake)
     def get_state(self):
         return self.render()
+game = MagmaFindGPU()
+def play_game():
     state, state_som = game.get_state()
     pil_img = Image.fromarray(state_som)
+    convs = [
+        {"role": "system", "content": "You are an agent that can see, talk, and act. Avoid hitting the wall."},
+        {"role": "user", "content": "<image_start><image><image_end>\nWhich mark is closer to green block? Answer with a single number."},
+    ]
+    prompt = magma_processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
+    inputs = magma_processor(images=[pil_img], texts=prompt, return_tensors="pt")
+    inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
+    inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
+    inputs = inputs.to("cuda").to(dtype)
+    generation_args = {
+        "max_new_tokens": 10,
+        "temperature": 0.3,
+        "do_sample": True,
+        "use_cache": True,
+        "num_beams": 1,
+    }
+    with torch.inference_mode():
+        generate_ids = magam_model.generate(**inputs, **generation_args)
+    generate_ids = generate_ids[:, inputs["input_ids"].shape[-1] :]
+    action = magma_processor.decode(generate_ids[0], skip_special_tokens=True).strip()
     # extract the mark id from the action using re
     match = re.search(r'\d+', action)
     if match:
         action = match.group(0)
         action = random.choice(ACTIONS[:-1])
     img, score = game.step(action)
+    img = img[0]
+    return img, f"Score: {score}"
+def reset_game():
+    game.reset()
+    return game.render()[0], "Score: 0"
 MARKDOWN = """
 <div align="center">
 <h2>Magma: A Foundation Model for Multimodal AI Agents</h2>
 \[[arXiv Paper](https://www.arxiv.org/pdf/2502.13130)\] &nbsp; \[[Project Page](https://microsoft.github.io/Magma/)\] &nbsp; \[[Github Repo](https://github.com/microsoft/Magma)\] &nbsp; \[[Hugging Face Model](https://huggingface.co/microsoft/Magma-8B)\] &nbsp;
 This demo is powered by [Gradio](https://gradio.app/).
+<b>Goal: Collects the green blocks by automatically moving up, down, left and right.</b>
 </div>
 """
 with gr.Blocks() as interface:
     gr.Markdown(MARKDOWN)
     with gr.Row():
+        image_output = gr.Image(label="Game Screen")
         with gr.Column():
+            score_output = gr.Text(label="Score", elem_classes="large-text")
+            gr.HTML("""
+                <style>
+                .large-text textarea {
+                    font-size: 24px !important;
+                }
+                </style>
+            """)
+            start_btn = gr.Button("Start/Reset Game")
+    interface.load(fn=play_game, every=1, inputs=[], outputs=[image_output, score_output])
+    start_btn.click(fn=reset_game, inputs=[], outputs=[image_output, score_output])
+interface.launch()

vlms/__pycache__/llavanext.cpython-310.pyc DELETED Viewed

Binary file (1.79 kB)

vlms/__pycache__/llavaov.cpython-310.pyc DELETED Viewed

Binary file (1.81 kB)

vlms/__pycache__/magma.cpython-310.pyc DELETED Viewed

Binary file (1.93 kB)

vlms/__pycache__/qwen25vl.cpython-310.pyc DELETED Viewed

Binary file (2.05 kB)

vlms/__pycache__/qwen2vl.cpython-310.pyc DELETED Viewed

Binary file (2.01 kB)