Tonic committed · Commit 56352f5 (unverified) · 1 parent: a595550

add reference code from vllm

Files changed (1)
  1. app.py +53 -28
app.py CHANGED
@@ -13,6 +13,8 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 import spaces
 import math
 from typing import List, Optional, Tuple
+import gc
+from contextlib import contextmanager
 
 title = "# **WIP / DEMO** 🙋🏻‍♂️Welcome to Tonic's Pixtral Model Demo"
 description = """
@@ -189,48 +191,71 @@ def preprocess_image(image):
     image_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).unsqueeze(0).float() / 255.0
     return image_tensor
 
+@contextmanager
+def gpu_memory_manager():
+    try:
+        torch.cuda.empty_cache()
+        yield
+    finally:
+        torch.cuda.empty_cache()
+        gc.collect()
+
+
 @spaces.GPU(duration=120)
 def generate_text(image, prompt, max_tokens):
     try:
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        image_tensor = preprocess_image(image).to(device)
-        model.to(device)
-
-        tokenized = tokenizer.encode_chat_completion(
-            ChatCompletionRequest(
-                messages=[UserMessage(content=[TextChunk(text=prompt), ImageChunk(image=image)])],
-                model="pixtral",
+        with gpu_memory_manager():
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            image_tensor = preprocess_image(image).to(device)
+            model.to(device)
+
+            tokenized = tokenizer.encode_chat_completion(
+                ChatCompletionRequest(
+                    messages=[UserMessage(content=[TextChunk(text=prompt), ImageChunk(image=image)])],
+                    model="pixtral",
+                )
             )
-        )
-        input_ids = torch.tensor(tokenized.tokens).unsqueeze(0).to(device)
-
-        for _ in range(max_tokens):
-            logits = model(image_tensor, input_ids)
-            next_token_logits = logits[0, -1, :]
-            next_token = torch.argmax(next_token_logits, dim=-1)
-            input_ids = torch.cat([input_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1)
-            if next_token.item() == tokenizer.eos_token_id:
-                break
-
-        generated_text = tokenizer.decode(input_ids[0].tolist())
+            input_ids = torch.tensor(tokenized.tokens).unsqueeze(0).to(device)
+
+            generated_ids = input_ids.clone()
+            for _ in range(max_tokens):
+                with torch.no_grad():
+                    logits = model(image_tensor, generated_ids)
+                next_token_logits = logits[0, -1, :]
+                next_token = torch.argmax(next_token_logits, dim=-1)
+                generated_ids = torch.cat([generated_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1)
+                if next_token.item() == tokenizer.eos_token_id:
+                    break
+
+            generated_text = tokenizer.decode(generated_ids[0].tolist())
+
+        # # Move model back to CPU and clear CUDA memory
         # model.to("cpu")
-        return generated_text, len(input_ids[0]), 1
+        torch.cuda.empty_cache()
+
+        return generated_text, len(generated_ids[0]), 1
     except Exception as e:
         return f"Error: {str(e)}", 0, 0
 
 @spaces.GPU(duration=60)
 def calculate_similarity(image1, image2):
     try:
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        tensor1 = preprocess_image(image1).to(device)
-        tensor2 = preprocess_image(image2).to(device)
-        model.to(device)
+        with gpu_memory_manager():
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            tensor1 = preprocess_image(image1).to(device)
+            tensor2 = preprocess_image(image2).to(device)
+            model.to(device)
 
-        embedding1 = model(tensor1).mean(dim=1)
-        embedding2 = model(tensor2).mean(dim=1)
+            with torch.no_grad():
+                embedding1 = model(tensor1).mean(dim=1)
+                embedding2 = model(tensor2).mean(dim=1)
 
-        similarity = F.cosine_similarity(embedding1, embedding2).item()
+            similarity = F.cosine_similarity(embedding1, embedding2).item()
+
+        # # Move model back to CPU and clear CUDA memory
         # model.to("cpu")
+        torch.cuda.empty_cache()
+
         return similarity
     except Exception as e:
         return f"Error: {str(e)}"
 
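Note: the loop the commit wraps in torch.no_grad() is plain greedy decoding: take the argmax of the final position's logits, append it, and stop at EOS. Cloning the prompt into generated_ids also stops the function from mutating its own input_ids. A self-contained sketch, with a dummy step function and a 32-token vocabulary standing in for Pixtral's forward pass:

import torch

def greedy_decode(step_fn, input_ids, max_tokens, eos_token_id):
    # Token-by-token greedy decoding, as in the revised generate_text loop.
    generated_ids = input_ids.clone()  # leave the caller's prompt untouched
    for _ in range(max_tokens):
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            logits = step_fn(generated_ids)        # (1, seq_len, vocab_size)
        next_token = logits[0, -1, :].argmax(dim=-1)
        generated_ids = torch.cat([generated_ids, next_token.view(1, 1)], dim=-1)
        if next_token.item() == eos_token_id:
            break
    return generated_ids

# Dummy "model": random logits over a 32-token vocabulary.
dummy = lambda ids: torch.randn(1, ids.shape[1], 32)
out = greedy_decode(dummy, torch.zeros(1, 1, dtype=torch.long), max_tokens=8, eos_token_id=0)
print(out.shape)  # (1, n) with n <= 9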
 