Tonic committed
Commit 55d5adb · unverified · 1 Parent(s): e9ec3b8

add image similarity demo

Files changed (2):
  1. app.py +164 -56
  2. et --hard HEAD@{1} +20 -0
app.py CHANGED
@@ -12,8 +12,21 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
 
-title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Image-to-Text Model Demo"
-description = """Upload an image to encode it. This is a **work in progress** , just showing off some demo features here until it's ready.
+title = "# 🖼️ Pixtral Image Similarity Demo"
+description = """
+Upload two images to compare their similarity based on the embeddings produced by the Pixtral model.
+This demo uses the vision encoder part of the Pixtral model to generate embeddings and then calculates
+the cosine similarity between them.
+
+### How it works:
+1. Upload two images
+2. The Pixtral vision encoder processes both images
+3. The cosine similarity between the embeddings is calculated
+4. The similarity score is displayed (1.0 means identical, 0.0 means completely different)
+
+### Note:
+This is a demonstration of the vision encoder capabilities and does not use the full Pixtral model for text generation.
+
 ### Join us :
 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
@@ -86,16 +99,14 @@ class VisionEncoder(nn.Module):
         x = self.gelu(x)
         return x
 
+
 class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
-        # Add text generation components here
 
     def forward(self, image):
-        vision_output = self.vision_encoder(image)
-        # Add text generation logic here
-        return vision_output
+        return self.vision_encoder(image)
 
 def load_model(params, model_path):
     model = PixtralModel(params)
@@ -110,74 +121,171 @@ def load_model(params, model_path):
 
 # Initialize the model
 model = load_model(params, model_path)
-tokenizer = MistralTokenizer.from_model("pixtral")
 
-@spaces.GPU
-def process_image_and_text(image, prompt):
-    # Prepare the image
+def preprocess_image(image):
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
-    image_tensor = image_tensor.cuda()
-
-    # Tokenize the input
-    tokenized = tokenizer.encode_chat_completion(
-        ChatCompletionRequest(
-            messages=[
-                UserMessage(
-                    content=[
-                        TextChunk(text=prompt),
-                        ImageChunk(image=image),
-                    ]
-                )
-            ],
-            model="pixtral",
-        )
-    )
-    tokens, text, images = tokenized.tokens, tokenized.text, tokenized.images
+    return image_tensor
 
-    # Process the image and generate text
+@spaces.GPU
+def calculate_similarity(image1, image2):
+    # Preprocess images
+    tensor1 = preprocess_image(image1).cuda()
+    tensor2 = preprocess_image(image2).cuda()
+
+    # Generate embeddings
     with torch.no_grad():
-        model.cuda()
-        vision_output = model(image_tensor)
+        model.cuda()
+        embedding1 = model(tensor1).mean(dim=1)  # Average over spatial dimensions
+        embedding2 = model(tensor2).mean(dim=1)
         model.cpu()
-    generated_text = f"Generated text based on the image and prompt: {prompt}"
 
-    return generated_text, len(tokens), len(images)
+    # Calculate cosine similarity
+    similarity = F.cosine_similarity(embedding1, embedding2).item()
+
+    return similarity
 
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown(title)
+    gr.Markdown("## Model Details")
+    gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
+    gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
+    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+    gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
+    gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
+    gr.Markdown("## How it works")
+    gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
+    gr.Markdown("2. The encoder uses GELU activation in its layers.")
+    gr.Markdown("3. The encoded image and the prompt are used to generate descriptive text.")
+
     gr.Markdown(description)
 
     with gr.Row():
-        with gr.Column(scale=1):
-            input_image = gr.Image(type="pil")
-            input_prompt = gr.Textbox(label="Prompt")
-            submit_btn = gr.Button("Generate Text")
-
-        with gr.Column(scale=1):
-            output_text = gr.Textbox(label="Generated Text")
-            token_count = gr.Number(label="Number of Tokens")
-            image_count = gr.Number(label="Number of Images")
+        image1_input = gr.Image(type="pil", label="Image 1")
+        image2_input = gr.Image(type="pil", label="Image 2")
+
+    submit_btn = gr.Button("📸🌬️Calculate Similarity")
+    similarity_output = gr.Number(label="Similarity Score (0.0 to 1.0)")
 
     submit_btn.click(
-        fn=process_image_and_text,
-        inputs=[input_image, input_prompt],
-        outputs=[output_text, token_count, image_count]
+        fn=calculate_similarity,
+        inputs=[image1_input, image2_input],
+        outputs=[similarity_output]
    )
 
-    gr.Markdown("## How it works")
-    gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
-    gr.Markdown("2. The encoder uses GELU activation in its layers.")
-    gr.Markdown("3. The encoded image and the prompt are used to generate descriptive text.")
+if __name__ == "__main__":
+    demo.launch()
+
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+# from safetensors import safe_open
+# import json
+# import gradio as gr
+# from PIL import Image
+# import numpy as np
+# from huggingface_hub import snapshot_download
+# from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
+# from mistral_common.protocol.instruct.request import ChatCompletionRequest
+# from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+# import spaces
+
+# title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Image-to-Text Model Demo"
+
+
+# # Download model files
+# model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
+
+# # Load model parameters and tokenizer configuration
+# with open(f'{model_path}/params.json', 'r') as f:
+#     params = json.load(f)
+
+# with open(f'{model_path}/tekken.json', 'r') as f:
+#     tokenizer_config = json.load(f)
+
+# class PixtralModel(nn.Module):
+#     def __init__(self, params):
+#         super().__init__()
+#         self.vision_encoder = VisionEncoder(params['vision_encoder'])
+#         # Add text generation components here
+
+#     def forward(self, image):
+#         vision_output = self.vision_encoder(image)
+#         # Add text generation logic here
+#         return vision_output
+
+# def load_model(params, model_path):
+#     model = PixtralModel(params)
 
-    gr.Markdown("## Model Details")
-    gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
-    gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
-    gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
-    gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
+#     with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
+#         for name, param in model.named_parameters():
+#             if name in f.keys():
+#                 param.data = f.get_tensor(name)
+
+#     model.eval()
+#     return model
 
-if __name__ == "__main__":
-    demo.launch()
+# # Initialize the model
+# model = load_model(params, model_path)
+# tokenizer = MistralTokenizer.from_model("pixtral")
+
+# @spaces.GPU
+# def process_image_and_text(image, prompt):
+#     # Prepare the image
+#     image = image.convert('RGB')
+#     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
+#     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+#     image_tensor = image_tensor.cuda()
+
+#     # Tokenize the input
+#     tokenized = tokenizer.encode_chat_completion(
+#         ChatCompletionRequest(
+#             messages=[
+#                 UserMessage(
+#                     content=[
+#                         TextChunk(text=prompt),
+#                         ImageChunk(image=image),
+#                     ]
+#                 )
+#             ],
+#             model="pixtral",
+#         )
+#     )
+#     tokens, text, images = tokenized.tokens, tokenized.text, tokenized.images
+
+#     # Process the image and generate text
+#     with torch.no_grad():
+#         model.cuda()
+#         vision_output = model(image_tensor)
+#         model.cpu()
+#     generated_text = f"Generated text based on the image and prompt: {prompt}"
+
+#     return generated_text, len(tokens), len(images)
+
+# # Gradio interface
+# with gr.Blocks() as demo:
+#     gr.Markdown(title)
+
+
+#     gr.Markdown(description)
+#     with gr.Row():
+#         with gr.Column(scale=1):
+#             input_image = gr.Image(type="pil")
+#             input_prompt = gr.Textbox(label="Prompt")
+#             submit_btn = gr.Button("Generate Text")
+
+#         with gr.Column(scale=1):
+#             output_text = gr.Textbox(label="Generated Text")
+#             token_count = gr.Number(label="Number of Tokens")
+#             image_count = gr.Number(label="Number of Images")
+
+#     submit_btn.click(
+#         fn=process_image_and_text,
+#         inputs=[input_image, input_prompt],
+#         outputs=[output_text, token_count, image_count]
+#     )
+
+# if __name__ == "__main__":
+#     demo.launch()
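The pipeline this commit adds — encode each image, mean-pool the patch embeddings, then compare with `F.cosine_similarity` — can be sketched standalone. Below is a minimal, runnable sketch: `ToyEncoder` is a hypothetical stand-in for the Pixtral `VisionEncoder` (which needs the downloaded weights), and only the pooling and similarity steps mirror `app.py`. Note that cosine similarity in general lies in [-1, 1]; the UI label's 0.0-to-1.0 range reflects what the demo expects in practice rather than a mathematical bound.

```python
# Minimal sketch of the commit's embedding-similarity approach.
# ToyEncoder is a hypothetical stand-in for the Pixtral vision encoder;
# the mean-pooling and cosine-similarity steps mirror calculate_similarity().
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyEncoder(nn.Module):
    """Maps (B, 3, H, W) images to (B, num_patches, hidden) patch features."""
    def __init__(self, patch_size=16, hidden=64):
        super().__init__()
        self.proj = nn.Conv2d(3, hidden, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        feats = self.proj(x)                     # (B, hidden, H/p, W/p)
        return feats.flatten(2).transpose(1, 2)  # (B, num_patches, hidden)

encoder = ToyEncoder().eval()

def embed(image_tensor):
    with torch.no_grad():
        patches = encoder(image_tensor)  # (1, num_patches, hidden)
    return patches.mean(dim=1)           # pool over patches -> (1, hidden)

torch.manual_seed(0)
img1 = torch.rand(1, 3, 224, 224)  # stand-ins for preprocess_image() output
img2 = torch.rand(1, 3, 224, 224)

print(F.cosine_similarity(embed(img1), embed(img1)).item())  # 1.0 (identical image)
print(F.cosine_similarity(embed(img1), embed(img2)).item())  # < 1.0
```

The `@spaces.GPU` decorator together with the `model.cuda()` / `model.cpu()` round-trip in `calculate_similarity` appears to follow the Hugging Face Spaces ZeroGPU pattern, where a GPU is attached only while the decorated function runs.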
et --hard HEAD@{1} ADDED
@@ -0,0 +1,20 @@
+e9ec3b8 (HEAD -> main) HEAD@{0}: reset: moving to e9ec3b854fe2a9afd80971f66b304230ff10ae01
+2eab2f8 (origin/main, origin/HEAD) HEAD@{1}: commit: spaces cuda fix
+190c58f HEAD@{2}: commit: spaces cuda fix
+1336464 HEAD@{3}: commit: spaces cuda fix
+1a1c0e2 HEAD@{4}: commit: spaces cuda fix
+24151b3 HEAD@{5}: commit: spaces cuda fix
+0e70c79 HEAD@{6}: commit: spaces cuda fix
+04d2804 HEAD@{7}: commit: hidden layers fix
+8bf2b2c HEAD@{8}: commit: hidden layers fix
+8ee6779 HEAD@{9}: commit: add text decoding
+e9ec3b8 (HEAD -> main) HEAD@{10}: commit: add description
+e562e7a HEAD@{11}: commit: add snapshot download
+f570b2f HEAD@{12}: commit: add snapshot download
+9a64677 HEAD@{13}: commit: add snapshot download
+337337b HEAD@{14}: rebase (finish): returning to refs/heads/main
+337337b HEAD@{15}: rebase (continue): add demo
+2b9d6b2 HEAD@{16}: rebase (start): checkout HEAD~2
+1ccbc31 HEAD@{17}: commit: add demo
+cc42170 HEAD@{18}: commit: add demo
+2b9d6b2 HEAD@{19}: clone: from https://huggingface.co/spaces/Tonic/Pixtral