Spaces: Running on Zero
```python
import spaces
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Model identifier
model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"

# Load the tokenizer (runs on CPU; no GPU needed yet)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)


class ModelWrapper:
    def __init__(self):
        self.model = None  # Deferred: loaded on first call, once a GPU is allocated

    @spaces.GPU  # On ZeroGPU, a GPU is attached only for the duration of this call
    def generate(self, prompt):
        if self.model is None:
            # Load the quantized model the first time a GPU is available
            self.model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                device_map='auto',
                trust_remote_code=True,
            )
        # Tokenize the input prompt and move it to the GPU
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
        # Generate text
        outputs = self.model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=512,
        )
        # Decode the generated tokens back into a string
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text


# Instantiate the model wrapper
model_wrapper = ModelWrapper()

# Create the Gradio interface
interface = gr.Interface(
    fn=model_wrapper.generate,
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Mistral-Large-Instruct-2407 Text Completion",
    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model.",
)

if __name__ == "__main__":
    interface.launch()
```
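A Space also needs its dependencies declared. A minimal `requirements.txt` sketch, assuming the standard PyPI package names (version pins omitted; note that auto-gptq requires a CUDA-enabled torch build):

```text
spaces
gradio
transformers
auto-gptq
torch
```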