Tonic committed
Commit c6378e6 · unverified · 1 parent: 6408837

add reference code from vllm

Files changed (1): app.py +12 -9
app.py CHANGED
```diff
@@ -14,7 +14,7 @@ import spaces
 import math
 from typing import List, Optional, Tuple
 
-title = "# 🙋🏻‍♂️Welcome to Tonic's Pixtral Model Demo"
+title = "# 🙋🏻‍♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
 This demo showcases two capabilities of the Pixtral model:
 1. Image-to-Text Generation
@@ -27,6 +27,7 @@ This demo showcases two capabilities of the Pixtral model:
 model_path = snapshot_download(repo_id="mistralai/Pixtral-12B-2409")
 with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
+
 with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
```
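The remaining hunks read model hyperparameters from this `params.json` using Mistral-style top-level keys rather than Hugging Face's nested `text_config`. Below is a minimal sketch for inspecting those keys after download; the key names come from the diff itself, while the local file path is an assumption:

```python
# Minimal sketch (not part of the commit): load the Mistral-format
# params.json and print the top-level keys the updated code reads.
# Key names are taken from the diff; the local path is an assumption
# (app.py itself reads f'{model_path}/params.json').
import json

with open('params.json', 'r') as f:
    params = json.load(f)

for key in ('dim', 'n_layers', 'n_heads', 'hidden_dim', 'vocab_size'):
    print(f"{key}: {params[key]}")
print(f"vision_encoder keys: {sorted(params['vision_encoder'])}")
```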
 
```diff
@@ -177,14 +178,14 @@ class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionTransformer(params['vision_encoder'])
-        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['text_config']['hidden_size'])
+        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['dim'])
         self.language_model = nn.TransformerDecoder(
-            nn.TransformerDecoderLayer(d_model=params['text_config']['hidden_size'],
-                                       nhead=params['text_config']['num_attention_heads'],
-                                       dim_feedforward=params['text_config']['intermediate_size']),
-            num_layers=params['text_config']['num_hidden_layers']
+            nn.TransformerDecoderLayer(d_model=params['dim'],
+                                       nhead=params['n_heads'],
+                                       dim_feedforward=params['hidden_dim']),
+            num_layers=params['n_layers']
         )
-        self.lm_head = nn.Linear(params['text_config']['hidden_size'], params['text_config']['vocab_size'], bias=False)
+        self.lm_head = nn.Linear(params['dim'], params['vocab_size'], bias=False)
 
     def forward(self, image, input_ids=None):
         vision_output = self.vision_encoder(image)
```
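Functionally, this hunk is a key rename driven by Mistral's `params.json` schema: `text_config['hidden_size']` becomes `dim`, `num_attention_heads` becomes `n_heads`, `intermediate_size` becomes `hidden_dim`, `num_hidden_layers` becomes `n_layers`, and `vocab_size` moves to the top level. A self-contained sketch of the same decoder construction, with deliberately tiny made-up values in place of Pixtral-12B's real hyperparameters:

```python
# Standalone sketch of the decoder stack built in PixtralModel.__init__,
# using the Mistral-style keys from the diff. The config values below are
# tiny and made up for illustration; Pixtral-12B's real values differ.
import torch
import torch.nn as nn

params = {'dim': 64, 'n_heads': 4, 'hidden_dim': 128,
          'n_layers': 2, 'vocab_size': 1000}

language_model = nn.TransformerDecoder(
    nn.TransformerDecoderLayer(d_model=params['dim'],
                               nhead=params['n_heads'],
                               dim_feedforward=params['hidden_dim'],
                               batch_first=True),
    num_layers=params['n_layers'],
)
lm_head = nn.Linear(params['dim'], params['vocab_size'], bias=False)

text = torch.randn(1, 8, params['dim'])     # stand-in token embeddings
vision = torch.randn(1, 16, params['dim'])  # stand-in adapted image features
logits = lm_head(language_model(text, vision))
print(logits.shape)  # torch.Size([1, 8, 1000])
```

As in the app itself, `nn.TransformerDecoder` serves here as a reference stand-in for the language model, not a weight-compatible reimplementation of Pixtral.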
```diff
@@ -274,16 +275,18 @@ def calculate_similarity(image1, image2):
 with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown(title)
     gr.Markdown("## Model Details")
+    gr.Markdown(f"- Model Dimension: {params['dim']}")
+    gr.Markdown(f"- Number of Layers: {params['n_layers']}")
+    gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")
     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
     gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+    gr.Markdown(f"- Number of Vision Encoder Attention Heads: {params['vision_encoder']['num_attention_heads']}")
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
     gr.Markdown("## How it works")
     gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
     gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
     gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
-
     gr.Markdown(description)
 
     with gr.Tabs():
```
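In the UI, this hunk surfaces the new top-level hyperparameters under "Model Details" and relabels the head count to distinguish the vision encoder's from the language model's. A minimal runnable sketch of that section, with placeholder values standing in for the downloaded `params.json`:

```python
# Minimal sketch of the updated "Model Details" section. The params values
# here are placeholders; the app fills them from the downloaded params.json.
import gradio as gr

params = {'dim': 5120, 'n_layers': 40, 'n_heads': 32}  # placeholder values

with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown("## Model Details")
    gr.Markdown(f"- Model Dimension: {params['dim']}")
    gr.Markdown(f"- Number of Layers: {params['n_layers']}")
    gr.Markdown(f"- Number of Attention Heads: {params['n_heads']}")

if __name__ == '__main__':
    demo.launch()
```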
 