Staticaliza committed (verified)
Commit 3a7347e · 1 Parent(s): b3bce7b

Update app.py

Files changed (1):
  1. app.py +20 -32
app.py CHANGED
@@ -1,20 +1,12 @@
-import spaces
-import gradio as gr
-from transformers import AutoTokenizer
-from auto_gptq import AutoGPTQForCausalLM
+import torch
+from transformers import TextIteratorStreamer
+import threading
 
-# Model identifier
-model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"
-
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)
-
-# Removed the invalid decorator
 class ModelWrapper:
     def __init__(self):
         self.model = None  # Model will be loaded when GPU is allocated
 
-    @spaces.GPU # Use the correct decorator
+    @spaces.GPU
     def generate(self, prompt):
         if self.model is None:
             # Load the model when GPU is allocated
@@ -23,32 +15,28 @@ class ModelWrapper:
                 device_map='auto',
                 trust_remote_code=True,
             )
+            self.model.eval()
 
         # Tokenize the input prompt
         inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
 
-        # Generate text
-        outputs = self.model.generate(
+        # Set up the streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+        # Prepare generation arguments
+        generation_kwargs = dict(
             **inputs,
+            streamer=streamer,
             do_sample=True,
-            max_new_tokens=512
+            max_new_tokens=512,
         )
 
-        # Decode the generated text
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return generated_text
-
-# Instantiate the model wrapper
-model_wrapper = ModelWrapper()
-
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=model_wrapper.generate,
-    inputs=gr.Textbox(lines=5, label="Input Prompt"),
-    outputs=gr.Textbox(label="Generated Text"),
-    title="Mistral-Large-Instruct-2407 Text Completion",
-    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model."
-)
+        # Start generation in a separate thread to enable streaming
+        thread = threading.Thread(target=self.model.generate, kwargs=generation_kwargs)
+        thread.start()
 
-if __name__ == "__main__":
-    interface.launch()
+        # Yield generated text in real-time
+        generated_text = ""
+        for new_text in streamer:
+            generated_text += new_text
+            yield generated_text
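
After this change, generate is a generator: it yields the accumulated text after every decoded chunk, so a caller can render partial output while the model is still generating. The commit removes the original gr.Interface block and its replacement is not visible in the diff, so the wiring below is only a sketch under that assumption; Gradio does stream the successive values yielded by a generator function into the output component.

# Hypothetical wiring, assuming the Gradio interface is recreated
# elsewhere in app.py; this commit only shows its removal.
import gradio as gr

model_wrapper = ModelWrapper()

interface = gr.Interface(
    fn=model_wrapper.generate,  # generator fn: each yield updates the output box
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
)

if __name__ == "__main__":
    # Streaming from a generator requires the request queue in Gradio 3.x.
    interface.queue().launch()

Note that the generation thread is never joined: TextIteratorStreamer signals end-of-stream on its internal queue when model.generate returns, so the for loop over the streamer exits on its own once generation finishes.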