Spaces:

Tonic
/

Pixtral

Paused

App Files Files Community

Tonic committed on Sep 12, 2024

Commit

6408837

unverified ·

1 Parent(s): cbd9440

add reference code from vllm

Browse files

Files changed (1) hide show

app.py +229 -195

app.py CHANGED Viewed

@@ -11,100 +11,192 @@ from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, Im
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
-title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Image-Similarity Model Demo"
 description = """
-Upload two images to compare their similarity based on the embeddings produced by the Pixtral model.
-This demo uses the vision encoder part of the Pixtral model to generate embeddings and then calculates
-the cosine similarity between them.
-### How it works:
-1. Upload two images
-2. The Pixtral vision encoder processes both images
-3. The cosine similarity between the embeddings is calculated
-4. The similarity score is displayed (1.0 means identical, 0.0 means completely different)
-### Note:
-This is a demonstration of the vision encoder capabilities and does not use the full Pixtral model for text generation.
 ### Join us :
 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
-# Download model files
-model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
-# Load model parameters and tokenizer configuration
 with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
 with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
-class GELU(nn.Module):
-    def __init__(self, dim_in, dim_out, approximate='none', bias=True):
         super().__init__()
-        self.linear = nn.Linear(dim_in, dim_out, bias=bias)
-        self.approximate = approximate
-    def forward(self, x):
-        if self.approximate == 'tanh':
-            return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        else:
-            return F.gelu(self.linear(x))
-def precompute_freqs_cis_2d(dim: int, height: int, width: int, theta: float) -> torch.Tensor:
-    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
     h = torch.arange(height, device=freqs.device)
     w = torch.arange(width, device=freqs.device)
     freqs_h = torch.outer(h, freqs[::2]).float()
     freqs_w = torch.outer(w, freqs[1::2]).float()
-    freqs_2d = torch.cat([
-        freqs_h[:, None, :].repeat(1, width, 1),
-        freqs_w[None, :, :].repeat(height, 1, 1),
-    ], dim=-1)
     return torch.polar(torch.ones_like(freqs_2d), freqs_2d)
-class Rope2D(nn.Module):
-    def __init__(self, dim, max_position_embeddings=1024, base=10000):
         super().__init__()
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-    def forward(self, x, height, width):
-        freqs_cis = precompute_freqs_cis_2d(self.dim, height, width, self.base)
-        return freqs_cis.to(x.device)
-class VisionEncoder(nn.Module):
-    def __init__(self, config):
         super().__init__()
-        self.config = config
-        self.embed = nn.Conv2d(config['num_channels'], config['hidden_size'], kernel_size=config['patch_size'], stride=config['patch_size'])
-        self.rope = Rope2D(config['hidden_size'] // config['num_attention_heads'], base=config['rope_theta'])
-        self.layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=config['hidden_size'], nhead=config['num_attention_heads'], dim_feedforward=config['intermediate_size']) for _ in range(config['num_hidden_layers'])])
-        self.norm = nn.LayerNorm(config['hidden_size'])
-        self.gelu = GELU(config['hidden_size'], config['hidden_size'])
-    def forward(self, pixel_values):
-        x = self.embed(pixel_values)
-        b, c, h, w = x.shape
         x = x.flatten(2).transpose(1, 2)
-        freqs_cis = self.rope(x, h, w)
-        for layer in self.layers:
-            x = layer(x)
-        x = self.norm(x)
-        x = self.gelu(x)
         return x
 class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
-        self.vision_encoder = VisionEncoder(params['vision_encoder'])
-    def forward(self, image):
-        return self.vision_encoder(image)
 def load_model(params, model_path):
     model = PixtralModel(params)
@@ -117,8 +209,8 @@ def load_model(params, model_path):
     model.eval()
     return model
-# Initialize the model
 model = load_model(params, model_path)
 def preprocess_image(image):
     image = image.convert('RGB')
@@ -126,6 +218,41 @@ def preprocess_image(image):
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
     return image_tensor
 @spaces.GPU
 def calculate_similarity(image1, image2):
     # Preprocess images
@@ -144,8 +271,7 @@ def calculate_similarity(image1, image2):
     return similarity
-# Gradio interface
-with gr.Blocks() as demo:
     gr.Markdown(title)
     gr.Markdown("## Model Details")
     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
@@ -155,135 +281,43 @@ with gr.Blocks() as demo:
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
     gr.Markdown("## How it works")
     gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
-    gr.Markdown("2. The encoder uses GELU activation in its layers.")
-    gr.Markdown("3. The encoded image and the prompt are used to generate descriptive text.")
     gr.Markdown(description)
-    with gr.Row():
-        image1_input = gr.Image(type="pil", label="Image 1")
-        image2_input = gr.Image(type="pil", label="Image 2")
-    submit_btn = gr.Button("📸🌬️Calculate Similarity")
-    similarity_output = gr.Number(label="Similarity Score (0.0 to 1.0)")
-    submit_btn.click(
-        fn=calculate_similarity,
-        inputs=[image1_input, image2_input],
-        outputs=[similarity_output]
-    )
-if __name__ == "__main__":
-    demo.launch()
-# import torch
-# import torch.nn as nn
-# import torch.nn.functional as F
-# from safetensors import safe_open
-# import json
-# import gradio as gr
-# from PIL import Image
-# import numpy as np
-# from huggingface_hub import snapshot_download
-# from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
-# from mistral_common.protocol.instruct.request import ChatCompletionRequest
-# from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-# import spaces
-# title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Image-to-Text Model Demo"
-# # Download model files
-# model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
-# # Load model parameters and tokenizer configuration
-# with open(f'{model_path}/params.json', 'r') as f:
-#     params = json.load(f)
-# with open(f'{model_path}/tekken.json', 'r') as f:
-#     tokenizer_config = json.load(f)
-# class PixtralModel(nn.Module):
-#     def __init__(self, params):
-#         super().__init__()
-#         self.vision_encoder = VisionEncoder(params['vision_encoder'])
-#         # Add text generation components here
-#     def forward(self, image):
-#         vision_output = self.vision_encoder(image)
-#         # Add text generation logic here
-#         return vision_output
-# def load_model(params, model_path):
-#     model = PixtralModel(params)
-#     with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
-#         for name, param in model.named_parameters():
-#             if name in f.keys():
-#                 param.data = f.get_tensor(name)
-#     model.eval()
-#     return model
-# # Initialize the model
-# model = load_model(params, model_path)
-# tokenizer = MistralTokenizer.from_model("pixtral")
-# @spaces.GPU
-# def process_image_and_text(image, prompt):
-#     # Prepare the image
-#     image = image.convert('RGB')
-#     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
-#     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
-#     image_tensor = image_tensor.cuda()
-#     # Tokenize the input
-#     tokenized = tokenizer.encode_chat_completion(
-#         ChatCompletionRequest(
-#             messages=[
-#                 UserMessage(
-#                     content=[
-#                         TextChunk(text=prompt),
-#                         ImageChunk(image=image),
-#                     ]
-#                 )
-#             ],
-#             model="pixtral",
-#         )
-#     )
-#     tokens, text, images = tokenized.tokens, tokenized.text, tokenized.images
-#     # Process the image and generate text
-#     with torch.no_grad():
-#         model.cuda()
-#         vision_output = model(image_tensor)
-#         model.cpu()
-#         generated_text = f"Generated text based on the image and prompt: {prompt}"
-#     return generated_text, len(tokens), len(images)
-# # Gradio interface
-# with gr.Blocks() as demo:
-#     gr.Markdown(title)
-#     gr.Markdown(description)
-#     with gr.Row():
-#         with gr.Column(scale=1):
-#             input_image = gr.Image(type="pil")
-#             input_prompt = gr.Textbox(label="Prompt")
-#             submit_btn = gr.Button("Generate Text")
-#         with gr.Column(scale=1):
-#             output_text = gr.Textbox(label="Generated Text")
-#             token_count = gr.Number(label="Number of Tokens")
-#             image_count = gr.Number(label="Number of Images")
-#     submit_btn.click(
-#         fn=process_image_and_text,
-#         inputs=[input_image, input_prompt],
-#         outputs=[output_text, token_count, image_count]
-#     )
-# if __name__ == "__main__":
-#     demo.launch()

 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
+import math
+from typing import List, Optional, Tuple
+title = "#  🙋🏻‍♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
+This demo showcases two capabilities of the Pixtral model:
+1. Image-to-Text Generation
+2. Image Similarity Comparison
 ### Join us :
 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
# Download the Pixtral checkpoint snapshot and read the two JSON files the
# rest of the module relies on (`params` and `tokenizer_config`).
model_path = snapshot_download(repo_id="mistralai/Pixtral-12B-2409")
# JSON is UTF-8 by specification; state the encoding explicitly so parsing
# does not depend on the host's locale default.
with open(f'{model_path}/params.json', 'r', encoding='utf-8') as f:
    params = json.load(f)
with open(f'{model_path}/tekken.json', 'r', encoding='utf-8') as f:
    tokenizer_config = json.load(f)
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain.

    Args:
        dim: Size of the last dimension; one gain value is learned per entry.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` divided by its RMS over the last dim, times ``weight``."""
        # Squaring fp16/bf16 activations can overflow or lose precision, so the
        # statistic is computed in float32 for those dtypes and cast back.
        # Full-precision inputs take exactly the original code path.
        reduced_precision = x.dtype in (torch.float16, torch.bfloat16)
        work = x.float() if reduced_precision else x
        inv_rms = torch.rsqrt(work.pow(2).mean(-1, keepdim=True) + self.eps)
        return (work * inv_rms).type_as(x) * self.weight
def precompute_freqs_cis_2d(
    dim: int,
    height: int,
    width: int,
    theta: float,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    """Build the complex 2D rotary-embedding table for a ``height x width`` grid.

    The ``dim // 2`` base frequencies are split by parity: even-indexed ones
    are driven by the row index, odd-indexed ones by the column index.

    Args:
        dim: Per-head channel count; ``torch.arange(0, dim, 2)`` frequencies
            are generated.
        height: Number of grid rows.
        width: Number of grid columns.
        theta: Base of the geometric frequency progression.
        device: Optional device on which to build the table (default: the
            PyTorch default device, as before).

    Returns:
        Complex tensor of shape ``(height, width, dim // 2)`` (for even
        ``dim``) whose entries all have magnitude 1.
    """
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, device=device).float() / dim))

    # Build the indices directly in the frequency dtype so the outer product
    # does not depend on integer/float type promotion.
    rows = torch.arange(height, device=freqs.device, dtype=freqs.dtype)
    cols = torch.arange(width, device=freqs.device, dtype=freqs.dtype)

    freqs_h = torch.outer(rows, freqs[::2])
    freqs_w = torch.outer(cols, freqs[1::2])
    freqs_2d = torch.cat(
        [
            freqs_h[:, None, :].repeat(1, width, 1),
            freqs_w[None, :, :].repeat(height, 1, 1),
        ],
        dim=-1,
    )
    # polar(1, angle) == exp(i * angle)
    return torch.polar(torch.ones_like(freqs_2d), freqs_2d)
def apply_rotary_emb_vit(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate query/key channel pairs by per-position complex phases.

    Args:
        xq: Queries laid out as ``(batch, patches, heads, head_dim)``.
        xk: Keys with the same layout as ``xq``.
        freqs_cis: Complex phases, one row of ``head_dim // 2`` values per
            patch. Accepted layouts are ``(patches, head_dim // 2)``, a grid
            ``(H, W, head_dim // 2)`` with ``H * W == patches`` (flattened
            row-major), or ``(1 | batch, patches, head_dim // 2)``.

    Returns:
        The rotated ``(xq, xk)`` with their original shapes and dtypes.

    Raises:
        ValueError: If ``freqs_cis`` does not provide exactly one phase row
            per patch.
    """
    # Adjacent channel pairs become one complex number each.
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))

    batch, n_patches, half_dim = xq_.shape[0], xq_.shape[1], xq_.shape[-1]
    if (
        freqs_cis.dim() == 3
        and freqs_cis.shape[1] == n_patches
        and freqs_cis.shape[0] in (1, batch)
    ):
        # Already aligned with (batch, patches); only the head axis is missing.
        phases = freqs_cis.unsqueeze(2)
    else:
        # A raw (H, W, D/2) grid must not broadcast its rows against the batch
        # axis; collapse it to a single position axis first.
        phases = freqs_cis.reshape(-1, freqs_cis.shape[-1])
        if tuple(phases.shape) != (n_patches, half_dim):
            raise ValueError(
                f"freqs_cis provides {tuple(phases.shape)} phases but the input "
                f"needs ({n_patches}, {half_dim})"
            )
        phases = phases.view(1, n_patches, 1, half_dim)

    xq_out = torch.view_as_real(xq_ * phases).flatten(3)
    xk_out = torch.view_as_real(xk_ * phases).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)
class Attention(nn.Module):
    """Multi-head self-attention over image patches with rotary embeddings.

    Args:
        args: Mapping providing ``'hidden_size'`` and ``'num_attention_heads'``.
    """

    def __init__(self, args):
        super().__init__()
        self.n_heads = args['num_attention_heads']
        self.head_dim = args['hidden_size'] // args['num_attention_heads']
        self.wq = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
        self.wk = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
        self.wv = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)
        self.wo = nn.Linear(args['hidden_size'], args['hidden_size'], bias=False)

    @staticmethod
    def _attend(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        """Softmax attention for ``(batch, patches, heads, head_dim)`` inputs.

        Returns a ``(batch, patches, heads * head_dim)`` tensor.
        """
        batch, patches, n_heads, head_dim = q.shape
        # matmul contracts the last two axes, so heads must come before
        # patches; otherwise the softmax mixes heads within a single patch
        # instead of patches within a head.
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))
        scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_dim)
        weights = F.softmax(scores, dim=-1)
        mixed = torch.matmul(weights, v)
        return mixed.transpose(1, 2).reshape(batch, patches, n_heads * head_dim)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        """Attend across the patch axis of ``x`` (``(batch, patches, hidden)``).

        Args:
            x: Patch embeddings.
            freqs_cis: Rotary phases forwarded to ``apply_rotary_emb_vit``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch, patches, _ = x.shape
        split = (batch, patches, self.n_heads, self.head_dim)
        q = self.wq(x).reshape(split)
        k = self.wk(x).reshape(split)
        v = self.wv(x).reshape(split)
        q, k = apply_rotary_emb_vit(q, k, freqs_cis=freqs_cis)
        return self.wo(self._attend(q, k, v))
class FeedForward(nn.Module):
    """Gated feed-forward block: ``w2(silu(w1(x)) * w3(x))``.

    Args:
        args: Mapping providing ``'hidden_size'`` and ``'intermediate_size'``.
    """

    def __init__(self, args):
        super().__init__()
        hidden, inner = args['hidden_size'], args['intermediate_size']
        self.w1 = nn.Linear(hidden, inner, bias=False)  # gate projection
        self.w2 = nn.Linear(inner, hidden, bias=False)  # output projection
        self.w3 = nn.Linear(hidden, inner, bias=False)  # value projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated projection; the last dimension stays ``hidden_size``."""
        gate = F.silu(self.w1(x))
        return self.w2(gate * self.w3(x))
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: attention then feed-forward, each residual.

    Args:
        args: Mapping forwarded to ``Attention`` and ``FeedForward``; its
            ``'hidden_size'`` also sizes both ``RMSNorm`` layers.
    """

    def __init__(self, args):
        super().__init__()
        self.attention = Attention(args)
        self.feed_forward = FeedForward(args)
        self.attention_norm = RMSNorm(args['hidden_size'], eps=1e-5)
        self.ffn_norm = RMSNorm(args['hidden_size'], eps=1e-5)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Call the sub-modules themselves rather than `.forward(...)` so that
        # nn.Module.__call__ runs and any registered hooks fire.
        h = x + self.attention(self.attention_norm(x), freqs_cis=freqs_cis)
        return h + self.feed_forward(self.ffn_norm(h))
class VisionTransformer(nn.Module):
    """Patch-embedding vision encoder with 2D rotary position embeddings.

    Args:
        args: Mapping providing ``'num_channels'``, ``'hidden_size'``,
            ``'patch_size'``, ``'num_hidden_layers'``, ``'image_size'``,
            ``'num_attention_heads'`` and ``'rope_theta'``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.patch_conv = nn.Conv2d(
            in_channels=args['num_channels'],
            out_channels=args['hidden_size'],
            kernel_size=args['patch_size'],
            stride=args['patch_size'],
            bias=False,
        )
        self.ln_pre = RMSNorm(args['hidden_size'], eps=1e-5)
        self.transformer = nn.ModuleList([TransformerBlock(args) for _ in range(args['num_hidden_layers'])])
        self.max_patches_per_side = args['image_size'] // args['patch_size']
        self._freqs_cis = None  # built lazily on first access

    @property
    def freqs_cis(self) -> torch.Tensor:
        """Rotary table for the largest supported grid, on the module's device."""
        device = self.patch_conv.weight.device
        if self._freqs_cis is None:
            self._freqs_cis = precompute_freqs_cis_2d(
                dim=self.args['hidden_size'] // self.args['num_attention_heads'],
                height=self.max_patches_per_side,
                width=self.max_patches_per_side,
                theta=self.args['rope_theta'],
            )
        if self._freqs_cis.device != device:
            # Keep the moved copy so the transfer is not repeated on every call.
            self._freqs_cis = self._freqs_cis.to(device)
        return self._freqs_cis

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode images ``(batch, channels, H, W)`` into patch embeddings.

        Returns:
            Tensor of shape ``(batch, patches, hidden_size)``.

        Raises:
            ValueError: If the image yields more patches per side than the
                rotary table covers.
        """
        x = self.patch_conv(x)
        grid_h, grid_w = x.shape[-2], x.shape[-1]
        if grid_h > self.max_patches_per_side or grid_w > self.max_patches_per_side:
            raise ValueError(
                f"image yields a {grid_h}x{grid_w} patch grid; at most "
                f"{self.max_patches_per_side} patches per side are supported"
            )
        x = x.flatten(2).transpose(1, 2)
        x = self.ln_pre(x)
        # Crop the table to this image's grid and flatten it row-major, the
        # same order `flatten(2)` gave the patches, so each patch gets its own
        # phase row. The leading 1 broadcasts over the batch.
        freqs_cis = self.freqs_cis[:grid_h, :grid_w].reshape(1, grid_h * grid_w, -1)
        for layer in self.transformer:
            x = layer(x, freqs_cis=freqs_cis)
        return x
class VisionLanguageAdapter(nn.Module):
    """Two-layer GELU MLP mapping vision features to width ``dim``.

    Args:
        args: Mapping providing the vision ``'hidden_size'`` (input width).
        dim: Output width, also used for the hidden layer.
    """

    def __init__(self, args, dim: int):
        super().__init__()
        self.w_in = nn.Linear(args['hidden_size'], dim, bias=True)
        self.gelu = nn.GELU()
        self.w_out = nn.Linear(dim, dim, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` through ``w_in``, GELU, then ``w_out``."""
        hidden = self.gelu(self.w_in(x))
        return self.w_out(hidden)
 class PixtralModel(nn.Module):
     def __init__(self, params):
         super().__init__()
+        self.vision_encoder = VisionTransformer(params['vision_encoder'])
+        self.vision_language_adapter = VisionLanguageAdapter(params['vision_encoder'], params['text_config']['hidden_size'])
+        self.language_model = nn.TransformerDecoder(
+            nn.TransformerDecoderLayer(d_model=params['text_config']['hidden_size'],
+                                       nhead=params['text_config']['num_attention_heads'],
+                                       dim_feedforward=params['text_config']['intermediate_size']),
+            num_layers=params['text_config']['num_hidden_layers']
+        )
+        self.lm_head = nn.Linear(params['text_config']['hidden_size'], params['text_config']['vocab_size'], bias=False)
+    def forward(self, image, input_ids=None):
+        vision_output = self.vision_encoder(image)
+        vision_output = self.vision_language_adapter(vision_output)
+        if input_ids is not None:
+            tgt = self.lm_head.weight[input_ids].transpose(0, 1)
+            output = self.language_model(tgt, vision_output)
+            logits = self.lm_head(output)
+            return logits
+        else:
+            return vision_output
 def load_model(params, model_path):
     model = PixtralModel(params)
     model.eval()
     return model
 model = load_model(params, model_path)
+tokenizer = MistralTokenizer.from_model("pixtral")
 def preprocess_image(image):
     image = image.convert('RGB')
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
     return image_tensor
@spaces.GPU
def generate_text(image, prompt, max_new_tokens=100):
    """Greedily decode text for ``image`` and ``prompt``.

    Args:
        image: PIL image; passed both to ``preprocess_image`` and to the
            tokenizer as an ``ImageChunk``.
        prompt: User text prompt.
        max_new_tokens: Upper bound on generated tokens (default 100, the
            previously hard-coded limit).

    Returns:
        Tuple ``(decoded_text, total_token_count, 1)`` where the text is the
        decoded prompt-plus-generated token sequence and ``1`` is the number
        of images processed.
    """
    image_tensor = preprocess_image(image).cuda()
    request = ChatCompletionRequest(
        messages=[
            UserMessage(
                content=[
                    TextChunk(text=prompt),
                    ImageChunk(image=image),
                ]
            )
        ],
        model="pixtral",
    )
    tokenized = tokenizer.encode_chat_completion(request)
    input_ids = torch.tensor(tokenized.tokens).unsqueeze(0).cuda()

    # MistralTokenizer does not expose `eos_token_id`; the end-of-sequence id
    # lives on the wrapped base tokenizer.
    eos_id = tokenizer.instruct_tokenizer.tokenizer.eos_id

    model.cuda()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits = model(image_tensor, input_ids)
                next_token = torch.argmax(logits[0, -1, :], dim=-1)
                input_ids = torch.cat([input_ids, next_token.view(1, 1)], dim=-1)
                if next_token.item() == eos_id:
                    break
    finally:
        # Always hand the weights back to the CPU, even when generation fails.
        model.cpu()

    generated_text = tokenizer.decode(input_ids[0].tolist())
    return generated_text, len(input_ids[0]), 1  # 1 image processed
 @spaces.GPU
 def calculate_similarity(image1, image2):
     # Preprocess images
     return similarity
+with gr.Blocks(theme=gr.themes.Base()) as demo:
     gr.Markdown(title)
     gr.Markdown("## Model Details")
     gr.Markdown(f"- Vision Encoder Hidden Size: {params['vision_encoder']['hidden_size']}")
     gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
     gr.Markdown("## How it works")
     gr.Markdown("1. The image is processed by a Vision Encoder using 2D ROPE (Rotary Position Embedding).")
+    gr.Markdown("2. The encoder uses SiLU activation in its feed-forward layers.")
+    gr.Markdown("3. The encoded image is used for text generation or similarity comparison.")
     gr.Markdown(description)
+    with gr.Tabs():
+        with gr.TabItem("Image-to-Text Generation"):
+            with gr.Row():
+                with gr.Column():
+                    input_image = gr.Image(type="pil", label="Input Image")
+                    input_prompt = gr.Textbox(label="Prompt")
+                    submit_btn = gr.Button("Generate Text")
+                with gr.Column():
+                    output_text = gr.Textbox(label="Generated Text")
+                    token_count = gr.Number(label="Number of Tokens")
+                    image_count = gr.Number(label="Number of Images Processed")
+            submit_btn.click(
+                fn=generate_text,
+                inputs=[input_image, input_prompt],
+                outputs=[output_text, token_count, image_count]
+            )
+        with gr.TabItem("Image Similarity Comparison"):
+            with gr.Row():
+                image1_input = gr.Image(type="pil", label="Image 1")
+                image2_input = gr.Image(type="pil", label="Image 2")
+            similarity_btn = gr.Button("📸🌬️Calculate Similarity")
+            similarity_output = gr.Number(label="Similarity Score (0.0 to 1.0)")
+            similarity_btn.click(
+                fn=calculate_similarity,
+                inputs=[image1_input, image2_input],
+                outputs=[similarity_output]
+            )
+if __name__ == "__main__":
+    demo.launch()