Staticaliza committed
Commit 944743c · verified · 1 Parent(s): 3a7347e

Update app.py

Files changed (1)
  1. app.py +8 -3
app.py CHANGED
@@ -1,6 +1,9 @@
 import torch
 from transformers import TextIteratorStreamer
 import threading
+import spaces
+
+print("Is CUDA available?", torch.cuda.is_available())
 
 class ModelWrapper:
     def __init__(self):
@@ -9,16 +12,18 @@ class ModelWrapper:
     @spaces.GPU
     def generate(self, prompt):
         if self.model is None:
-            # Load the model when GPU is allocated
+            # Explicitly set device_map to 'cuda'
             self.model = AutoGPTQForCausalLM.from_quantized(
                 model_id,
-                device_map='auto',
+                device_map={'': 'cuda:0'},
                 trust_remote_code=True,
             )
-            self.model.eval()
+
+            print("Model is on device:", next(self.model.parameters()).device)
 
         # Tokenize the input prompt
         inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
+        print("Inputs are on device:", inputs['input_ids'].device)
 
         # Set up the streamer
         streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
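
For reference, a minimal sketch of the changed load call, runnable outside the Space. It assumes the auto-gptq package and a hypothetical GPTQ-quantized model_id (the app's real model_id is not shown in this hunk). The empty-string key in device_map pins the whole module tree to cuda:0, whereas 'auto' lets accelerate's placement policy choose devices, which can land layers on CPU if no GPU is visible at load time.

# A minimal sketch; `model_id` below is a hypothetical placeholder,
# standing in for whatever app.py actually loads.
from auto_gptq import AutoGPTQForCausalLM

model_id = "TheBloke/some-model-GPTQ"  # hypothetical placeholder

# The '' key applies to the entire module tree, so every layer is pinned
# to cuda:0 instead of being distributed by the 'auto' policy.
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    device_map={'': 'cuda:0'},
    trust_remote_code=True,
)
print("Model is on device:", next(model.parameters()).device)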
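The hunk ends at the streamer setup, before the generate call itself. The usual pattern with TextIteratorStreamer (and the likely reason app.py imports threading) is to run model.generate on a worker thread while the caller iterates the streamer; a sketch under that assumption, with `model`, `tokenizer`, and `prompt` taken to be defined as in app.py:

# A minimal sketch of the standard TextIteratorStreamer consumption
# pattern; `model`, `tokenizer`, and `prompt` are assumed from app.py.
import threading
from transformers import TextIteratorStreamer

inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# generate() blocks until decoding finishes, so it runs on a worker
# thread while the main thread consumes tokens as they arrive.
thread = threading.Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256),
)
thread.start()

output = ""
for new_text in streamer:
    output += new_text  # each decoded chunk, streamed as it is produced
thread.join()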