add snapshot download
app.py
CHANGED
@@ -1,3 +1,4 @@
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -6,20 +7,20 @@ import json
 import gradio as gr
 from PIL import Image
 import numpy as np
+from huggingface_hub import snapshot_download
 from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageChunk
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
-
-import Spaces
-
+import spaces
 
 # Download model files
 model_path = snapshot_download(repo_id="mistral-community/pixtral-12b-240910")
 
-
+# Load model parameters and tokenizer configuration
+with open(f'{model_path}/params.json', 'r') as f:
     params = json.load(f)
 
-with open('
+with open(f'{model_path}/tekken.json', 'r') as f:
     tokenizer_config = json.load(f)
 
 class GELU(nn.Module):
@@ -28,6 +29,7 @@ class GELU(nn.Module):
         self.linear = nn.Linear(dim_in, dim_out, bias=bias)
         self.approximate = approximate
 
+    @spaces.GPU
     def forward(self, x):
         if self.approximate == 'tanh':
             return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
@@ -46,6 +48,7 @@ class Rope2D(nn.Module):
         self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
         self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
+    @spaces.GPU
     def forward(self, x, seq_len=None):
         if seq_len > self.max_seq_len_cached:
             self.max_seq_len_cached = seq_len
@@ -69,6 +72,7 @@ class VisionEncoder(nn.Module):
         self.norm = nn.LayerNorm(config['hidden_size'])
         self.gelu = GELU(config['hidden_size'], config['hidden_size'])
 
+    @spaces.GPU
     def forward(self, pixel_values):
         x = self.embed(pixel_values)
         b, c, h, w = x.shape
@@ -86,30 +90,34 @@ class PixtralModel(nn.Module):
         self.vision_encoder = VisionEncoder(params['vision_encoder'])
         # Add text generation components here
 
+    @spaces.GPU
     def forward(self, image):
         vision_output = self.vision_encoder(image)
         # Add text generation logic here
         return vision_output
 
-
-
-
-
-with safe_open('consolidated.safetensors', framework="pt", device="
-
-
-
-
-model.eval()
+@spaces.GPU
+def load_model(params, model_path):
+    model = PixtralModel(params)
+
+    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cuda") as f:
+        for name, param in model.named_parameters():
+            if name in f.keys():
+                param.data = f.get_tensor(name)
+
+    model.eval()
+    return model.cuda()
 
-
+model = load_model(params, model_path)
 tokenizer = MistralTokenizer.from_model("pixtral")
 
+@spaces.GPU
 def process_image_and_text(image, prompt):
     # Prepare the image
     image = image.convert('RGB')
     image = image.resize((params['vision_encoder']['image_size'], params['vision_encoder']['image_size']))
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
+    image_tensor = image_tensor.cuda()
 
     # Tokenize the input
     tokenized = tokenizer.encode_chat_completion(
@@ -169,4 +177,5 @@ with gr.Blocks() as demo:
     gr.Markdown(f"- Image Size: {params['vision_encoder']['image_size']}x{params['vision_encoder']['image_size']}")
    gr.Markdown(f"- Patch Size: {params['vision_encoder']['patch_size']}x{params['vision_encoder']['patch_size']}")
 
-
+if __name__ == "__main__":
+    demo.launch()