add reference code from vllm
app.py CHANGED
@@ -14,7 +14,7 @@ import spaces
 import math
 from typing import List, Optional, Tuple

-title = "#
+title = "# 🙋🏻♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
 This demo showcases two capabilities of the Pixtral model:
 1. Image-to-Text Generation
@@ -27,6 +27,7 @@ This demo showcases two capabilities of the Pixtral model:
 model_path = snapshot_download(repo_id="mistralai/Pixtral-12B-2409")
 with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
+
 with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)

@@ -177,14 +178,14 @@ class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionTransformer(params['vision_encoder'])
-        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['
+        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['dim'])
         self.language_model = nn.TransformerDecoder(
-            nn.TransformerDecoderLayer(d_model=params['
-            nhead=params['
-            dim_feedforward=params['
-            num_layers=params['
+            nn.TransformerDecoderLayer(d_model=params['dim'],
+                                       nhead=params['n_heads'],
+                                       dim_feedforward=params['hidden_dim']),
+            num_layers=params['n_layers']
         )
-        self.lm_head = nn.Linear(params['
+        self.lm_head = nn.Linear(params['dim'], params['vocab_size'], bias=False)

     def forward(self, image, input_ids=None):
         vision_output = self.vision_encoder(image)
@@ -274,16 +275,18 @@ def calculate_similarity(image1, image2):
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown(title)
     gr.Markdown("## Model Details")
+    gr.Markdown(f"- Model Dimension: {params['dim']}")
+    gr.Markdown(f"- Number of Layers: {params['n_layers']}")
+    gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")
     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
     gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+    gr.Markdown(f"- Number of Vision Encoder Attention Heads: {params['vision_encoder']['num_attention_heads']}")
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
     gr.Markdown("## How it works")
     gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
     gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
     gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
-
     gr.Markdown(description)

     with gr.Tabs():
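The configuration keys used in the diff (dim, n_layers, n_heads, hidden_dim, vocab_size, and the vision_encoder sub-dictionary) all come from the checkpoint's params.json. Below is a minimal sketch for inspecting those values locally; it assumes the same params.json schema that app.py reads and mirrors its snapshot_download call.

import json
from huggingface_hub import snapshot_download

# Download the checkpoint the same way app.py does, then read params.json.
model_path = snapshot_download(repo_id="mistralai/Pixtral-12B-2409")
with open(f"{model_path}/params.json", "r") as f:
    params = json.load(f)

# Top-level keys referenced by the diff above (assumed present in params.json).
for key in ("dim", "n_layers", "n_heads", "hidden_dim", "vocab_size"):
    print(key, params.get(key))

# Vision encoder keys shown in the Gradio "Model Details" section.
vision = params.get("vision_encoder", {})
for key in ("hidden_size", "num_hidden_layers", "num_attention_heads",
            "image_size", "patch_size"):
    print("vision_encoder." + key, vision.get(key))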
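The commit title mentions reference code from vLLM. For comparison against the hand-rolled PixtralModel above, a minimal sketch of running the same Pixtral-12B-2409 checkpoint through vLLM follows; exact arguments such as tokenizer_mode and the llm.chat API may vary across vLLM versions, and the image URL is a placeholder.

from vllm import LLM
from vllm.sampling_params import SamplingParams

# Assumes a vLLM version with Pixtral support and the "mistral" tokenizer mode.
llm = LLM(model="mistralai/Pixtral-12B-2409", tokenizer_mode="mistral")

messages = [{
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image."},
        # Placeholder URL; replace with a real image.
        {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
    ],
}]

outputs = llm.chat(messages, sampling_params=SamplingParams(max_tokens=256))
print(outputs[0].outputs[0].text)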