# Voice-CPU / app.py
import spaces
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Model identifier
model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)


@spaces.DynamicGpuSupport()
class ModelWrapper:
    def __init__(self):
        self.model = None  # Model will be loaded when GPU is allocated
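        # Note: the model is deliberately not loaded here. On ZeroGPU Spaces,
        # a GPU is only attached while a @spaces.GPU-decorated function runs,
        # so the weights are loaded on the first generate() call instead.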

    @spaces.GPU
    def generate(self, prompt):
        if self.model is None:
            # Load the model when GPU is allocated
            self.model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                device_map='auto',
                trust_remote_code=True,
            )
        # Tokenize the input prompt and move it to the GPU
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
        # Generate text
        outputs = self.model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=512
        )
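        # Sampling knobs are left at their defaults above; a hedged sketch of
        # how temperature/top_p could be exposed (assumption: standard
        # transformers generate() keyword arguments):
        #   outputs = self.model.generate(**inputs, do_sample=True,
        #                                 temperature=0.7, top_p=0.9,
        #                                 max_new_tokens=512)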
        # Decode the generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text


# Instantiate the model wrapper
model_wrapper = ModelWrapper()

# Create the Gradio interface
interface = gr.Interface(
    fn=model_wrapper.generate,
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Mistral-Large-Instruct-2407 Text Completion",
    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model."
)
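
# On a shared GPU Space, one could serialize requests through Gradio's queue
# (assumption: the standard Interface.queue() method) so concurrent users do
# not contend for the same GPU allocation:
#   interface.queue()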

if __name__ == "__main__":
    interface.launch()
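
# Minimal local smoke test that bypasses the UI (assumption: a CUDA GPU is
# available, since generate() moves inputs to 'cuda'):
#   print(model_wrapper.generate("Write a haiku about autumn."))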