Spaces:

Tonic
/

Pixtral

Paused

App Files Files Community

Tonic committed on Sep 11, 2024

Commit

55d5adb

unverified ·

1 Parent(s): e9ec3b8

add image similarity demo

Browse files

Files changed (2) hide show

app.py +164 -56
et --hard HEAD@{1} +20 -0

app.py CHANGED Viewed

@@ -12,8 +12,21 @@ from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
-title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Image-to-Text Model Demo"
-description = """Upload an image to encode it. This is a **work in progress** , just showing off some demo features here until it's ready.
 ### Join us :
 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
@@ -86,16 +99,14 @@ class VisionEncoder(nn.Module):
         x = self.gelu(x)
         return x
 class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
-        # Add text generation components here
     def forward(self, image):
-        vision_output = self.vision_encoder(image)
-        # Add text generation logic here
-        return vision_output
 def load_model(params, model_path):
     model = PixtralModel(params)
@@ -110,74 +121,171 @@ def load_model(params, model_path):
 # Initialize the model
 model = load_model(params, model_path)
-tokenizer = MistralTokenizer.from_model("pixtral")
-@spaces.GPU
-def process_image_and_text(image, prompt):
-    # Prepare the image
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
-    image_tensor = image_tensor.cuda()
-    # Tokenize the input
-    tokenized = tokenizer.encode_chat_completion(
-        ChatCompletionRequest(
-            messages=[
-                UserMessage(
-                    content=[
-                        TextChunk(text=prompt),
-                        ImageChunk(image=image),
-                    ]
-                )
-            ],
-            model="pixtral",
-        )
-    )
-    tokens, text, images = tokenized.tokens, tokenized.text, tokenized.images
-    # Process the image and generate text
     with torch.no_grad():
-        model.cuda()
-        vision_output = model(image_tensor)
         model.cpu()
-        generated_text = f"Generated text based on the image and prompt: {prompt}"
-    return generated_text, len(tokens), len(images)
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown(title)
     gr.Markdown(description)
     with gr.Row():
-        with gr.Column(scale=1):
-            input_image = gr.Image(type="pil")
-            input_prompt = gr.Textbox(label="Prompt")
-            submit_btn = gr.Button("Generate Text")
-        with gr.Column(scale=1):
-            output_text = gr.Textbox(label="Generated Text")
-            token_count = gr.Number(label="Number of Tokens")
-            image_count = gr.Number(label="Number of Images")
     submit_btn.click(
-        fn=process_image_and_text,
-        inputs=[input_image, input_prompt],
-        outputs=[output_text, token_count, image_count]
     )
-    gr.Markdown("## How it works")
-    gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
-    gr.Markdown("2. The encoder uses GELU activation in its layers.")
-    gr.Markdown("3. The encoded image and the prompt are used to generate descriptive text.")
-    gr.Markdown("## Model Details")
-    gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
-    gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
-    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
-    gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
-    gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
-if __name__ == "__main__":
-    demo.launch()

 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
+title = "# 🖼️ Pixtral Image Similarity Demo"
+description = """
+Upload two images to compare their similarity based on the embeddings produced by the Pixtral model.
+This demo uses the vision encoder part of the Pixtral model to generate embeddings and then calculates
+the cosine similarity between them.
+### How it works:
+1. Upload two images
+2. The Pixtral vision encoder processes both images
+3. The cosine similarity between the embeddings is calculated
+4. The similarity score is displayed (1.0 means identical, 0.0 means completely different)
+### Note:
+This is a demonstration of the vision encoder capabilities and does not use the full Pixtral model for text generation.
 ### Join us :
 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
         x = self.gelu(x)
         return x
 class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
     def forward(self, image):
+        return self.vision_encoder(image)
 def load_model(params, model_path):
     model = PixtralModel(params)
 # Initialize the model
 model = load_model(params, model_path)
+def preprocess_image(image):
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+    return image_tensor
+@spaces.GPU
+def calculate_similarity(image1, image2):
+    # Preprocess images
+    tensor1 = preprocess_image(image1).cuda()
+    tensor2 = preprocess_image(image2).cuda()
+    # Generate embeddings
     with torch.no_grad():
+        model.cuda()
+        embedding1 = model(tensor1).mean(dim=1)  # Average over spatial dimensions
+        embedding2 = model(tensor2).mean(dim=1)
         model.cpu()
+    # Calculate cosine similarity
+    similarity = F.cosine_similarity(embedding1, embedding2).item()
+    return similarity
 # Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown(title)
+    gr.Markdown("## Model Details")
+    gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
+    gr.Markdown(f"- Number of Vision Encoder Layers: {params['vision_encoder']['num_hidden_layers']}")
+    gr.Markdown(f"- Number of Attention Heads: {params['vision_encoder']['num_attention_heads']}")
+    gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
+    gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
+    gr.Markdown("## How it works")
+    gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
+    gr.Markdown("2. The encoder uses GELU activation in its layers.")
+    gr.Markdown("3. The encoded image and the prompt are used to generate descriptive text.")
     gr.Markdown(description)
     with gr.Row():
+        image1_input = gr.Image(type="pil", label="Image 1")
+        image2_input = gr.Image(type="pil", label="Image 2")
+    submit_btn = gr.Button("📸🌬️Calculate Similarity")
+    similarity_output = gr.Number(label="Similarity Score (0.0 to 1.0)")
     submit_btn.click(
+        fn=calculate_similarity,
+        inputs=[image1_input, image2_input],
+        outputs=[similarity_output]
     )
+if __name__ == "__main__":
+    demo.launch()
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+# from safetensors import safe_open
+# import json
+# import gradio as gr
+# from PIL import Image
+# import numpy as np
+# from huggingface_hub import snapshot_download
+# from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
+# from mistral_common.protocol.instruct.request import ChatCompletionRequest
+# from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+# import spaces
+# title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Image-to-Text Model Demo"
+# # Download model files
+# model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
+# # Load model parameters and tokenizer configuration
+# with open(f'{model_path}/params.json', 'r') as f:
+#     params = json.load(f)
+# with open(f'{model_path}/tekken.json', 'r') as f:
+#     tokenizer_config = json.load(f)
+# class PixtralModel(nn.Module):
+#     def __init__(self, params):
+#         super().__init__()
+#         self.vision_encoder = VisionEncoder(params['vision_encoder'])
+#         # Add text generation components here
+#     def forward(self, image):
+#         vision_output = self.vision_encoder(image)
+#         # Add text generation logic here
+#         return vision_output
+# def load_model(params, model_path):
+#     model = PixtralModel(params)
+#     with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
+#         for name, param in model.named_parameters():
+#             if name in f.keys():
+#                 param.data = f.get_tensor(name)
+#     model.eval()
+#     return model
+# # Initialize the model
+# model = load_model(params, model_path)
+# tokenizer = MistralTokenizer.from_model("pixtral")
+# @spaces.GPU
+# def process_image_and_text(image, prompt):
+#     # Prepare the image
+#     image = image.convert('RGB')
+#     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
+#     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+#     image_tensor = image_tensor.cuda()
+#     # Tokenize the input
+#     tokenized = tokenizer.encode_chat_completion(
+#         ChatCompletionRequest(
+#             messages=[
+#                 UserMessage(
+#                     content=[
+#                         TextChunk(text=prompt),
+#                         ImageChunk(image=image),
+#                     ]
+#                 )
+#             ],
+#             model="pixtral",
+#         )
+#     )
+#     tokens, text, images = tokenized.tokens, tokenized.text, tokenized.images
+#     # Process the image and generate text
+#     with torch.no_grad():
+#         model.cuda()
+#         vision_output = model(image_tensor)
+#         model.cpu()
+#         generated_text = f"Generated text based on the image and prompt: {prompt}"
+#     return generated_text, len(tokens), len(images)
+# # Gradio interface
+# with gr.Blocks() as demo:
+#     gr.Markdown(title)
+#     gr.Markdown(description)
+#     with gr.Row():
+#         with gr.Column(scale=1):
+#             input_image = gr.Image(type="pil")
+#             input_prompt = gr.Textbox(label="Prompt")
+#             submit_btn = gr.Button("Generate Text")
+#         with gr.Column(scale=1):
+#             output_text = gr.Textbox(label="Generated Text")
+#             token_count = gr.Number(label="Number of Tokens")
+#             image_count = gr.Number(label="Number of Images")
+#     submit_btn.click(
+#         fn=process_image_and_text,
+#         inputs=[input_image, input_prompt],
+#         outputs=[output_text, token_count, image_count]
+#     )
+# if __name__ == "__main__":
+#     demo.launch()

et --hard HEAD@{1} ADDED Viewed

	@@ -0,0 +1,20 @@

+e9ec3b8 (HEAD -> main) HEAD@{0}: reset: moving to e9ec3b854fe2a9afd80971f66b304230ff10ae01
+2eab2f8 (origin/main, origin/HEAD) HEAD@{1}: commit: spaces cuda fix
+190c58f HEAD@{2}: commit: spaces cuda fix
+1336464 HEAD@{3}: commit: spaces cuda fix
+1a1c0e2 HEAD@{4}: commit: spaces cuda fix
+24151b3 HEAD@{5}: commit: spaces cuda fix
+0e70c79 HEAD@{6}: commit: spaces cuda fix
+04d2804 HEAD@{7}: commit: hidden layers fix
+8bf2b2c HEAD@{8}: commit: hidden layers fix
+8ee6779 HEAD@{9}: commit: add text decoding
+e9ec3b8 (HEAD -> main) HEAD@{10}: commit: add description
+e562e7a HEAD@{11}: commit: add snapshot download
+f570b2f HEAD@{12}: commit: add snapshot download
+9a64677 HEAD@{13}: commit: add snapshot download
+337337b HEAD@{14}: rebase (finish): returning to refs/heads/main
+337337b HEAD@{15}: rebase (continue): add demo
+2b9d6b2 HEAD@{16}: rebase (start): checkout HEAD~2
+1ccbc31 HEAD@{17}: commit: add demo
+cc42170 HEAD@{18}: commit: add demo
+2b9d6b2 HEAD@{19}: clone: from https://huggingface.co/spaces/Tonic/Pixtral