add reference code from vllm
app.py CHANGED
@@ -13,6 +13,8 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
 import math
 from typing import List, Optional, Tuple
+import gc
+from contextlib import contextmanager

 title = "# **WIP / DEMO** 🙋🏻♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
@@ -189,48 +191,71 @@ def preprocess_image(image):
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
     return image_tensor

+@contextmanager
+def gpu_memory_manager():
+    try:
+        torch.cuda.empty_cache()
+        yield
+    finally:
+        torch.cuda.empty_cache()
+        gc.collect()
+
+
 @spaces.GPU(duration=120)
 def generate_text(image, prompt, max_tokens):
     try:
-
-
-
-
-
-
-
-
+        with gpu_memory_manager():
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            image_tensor = preprocess_image(image).to(device)
+            model.to(device)
+
+            tokenized = tokenizer.encode_chat_completion(
+                ChatCompletionRequest(
+                    messages=[UserMessage(content=[TextChunk(text=prompt), ImageChunk(image=image)])],
+                    model="pixtral",
+                )
             )
-
-
-
-
-
-
-
-
-
-
-
-
+            input_ids = torch.tensor(tokenized.tokens).unsqueeze(0).to(device)
+
+            generated_ids = input_ids.clone()
+            for _ in range(max_tokens):
+                with torch.no_grad():
+                    logits = model(image_tensor, generated_ids)
+                next_token_logits = logits[0, -1, :]
+                next_token = torch.argmax(next_token_logits, dim=-1)
+                generated_ids = torch.cat([generated_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1)
+                if next_token.item() == tokenizer.eos_token_id:
+                    break
+
+            generated_text = tokenizer.decode(generated_ids[0].tolist())
+
+            # # Move model back to CPU and clear CUDA memory
             # model.to("cpu")
-
+            torch.cuda.empty_cache()
+
+            return generated_text, len(generated_ids[0]), 1
     except Exception as e:
         return f"Error: {str(e)}", 0, 0

 @spaces.GPU(duration=60)
 def calculate_similarity(image1, image2):
     try:
-
-
-
-
+        with gpu_memory_manager():
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            tensor1 = preprocess_image(image1).to(device)
+            tensor2 = preprocess_image(image2).to(device)
+            model.to(device)

-
-
+            with torch.no_grad():
+                embedding1 = model(tensor1).mean(dim=1)
+                embedding2 = model(tensor2).mean(dim=1)

-
+            similarity = F.cosine_similarity(embedding1, embedding2).item()
+
+            # # Move model back to CPU and clear CUDA memory
             # model.to("cpu")
+            torch.cuda.empty_cache()
+
         return similarity
     except Exception as e:
         return f"Error: {str(e)}"
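For reference, here is the cleanup pattern this commit introduces, shown in isolation with a usage example. This is a minimal sketch assuming only PyTorch; note that running gc.collect() before the final torch.cuda.empty_cache() (the reverse of the committed order) lets blocks still held by unreachable tensors be returned to the caching allocator before the cache is emptied.

import gc
from contextlib import contextmanager

import torch

@contextmanager
def gpu_memory_manager():
    # Release cached, unoccupied allocator blocks before the wrapped work.
    # empty_cache() is a no-op when CUDA was never initialized, so this is
    # safe on CPU-only machines as well.
    try:
        torch.cuda.empty_cache()
        yield
    finally:
        gc.collect()              # drop unreachable tensors first ...
        torch.cuda.empty_cache()  # ... so their blocks can actually be freed

# Usage: tensors allocated inside the block are reclaimed on exit.
device = "cuda" if torch.cuda.is_available() else "cpu"
with gpu_memory_manager():
    x = torch.randn(512, 512, device=device)
    _ = x @ x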
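The filled-in generate_text is plain greedy decoding: run the model, take the argmax over the last position's logits, append the token, and stop at EOS or the token budget. Below is a self-contained sketch of the same loop against a toy stand-in model (ToyLM and eos_id are illustrative, not the demo's Pixtral model, and they sidestep app-specific details such as where MistralTokenizer exposes its EOS id); note that each step re-runs the full forward pass, since this loop keeps no KV cache.

import torch
import torch.nn as nn

class ToyLM(nn.Module):
    # Stand-in language model: embedding -> linear head over a tiny vocab.
    def __init__(self, vocab_size=32, dim=16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.head = nn.Linear(dim, vocab_size)

    def forward(self, ids):
        return self.head(self.embed(ids))  # (batch, seq, vocab)

def greedy_decode(model, input_ids, max_tokens, eos_id):
    generated = input_ids.clone()
    for _ in range(max_tokens):
        with torch.no_grad():
            logits = model(generated)        # full recompute each step
        next_token = logits[0, -1, :].argmax(dim=-1)
        generated = torch.cat([generated, next_token.view(1, 1)], dim=-1)
        if next_token.item() == eos_id:      # stop on end-of-sequence
            break
    return generated

print(greedy_decode(ToyLM(), torch.tensor([[1, 2, 3]]), max_tokens=8, eos_id=0).tolist())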
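Similarly, calculate_similarity mean-pools the model's output over the token dimension and compares the two pooled vectors with cosine similarity (the diff assumes torch.nn.functional is imported as F elsewhere in app.py). The same computation on stand-in feature tensors:

import torch
import torch.nn.functional as F

# Stand-ins for model(tensor1) / model(tensor2): (batch, tokens, features).
features1 = torch.randn(1, 196, 768)
features2 = torch.randn(1, 196, 768)

embedding1 = features1.mean(dim=1)  # (1, 768) pooled embedding
embedding2 = features2.mean(dim=1)
similarity = F.cosine_similarity(embedding1, embedding2).item()  # in [-1, 1]
print(f"cosine similarity: {similarity:+.4f}")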