import spaces
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Model identifier
model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"

# Load the tokenizer once at startup (no GPU required for this step)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)


class ModelWrapper:
    def __init__(self):
        self.model = None  # Model is loaded lazily, once a GPU is allocated

    @spaces.GPU
    def generate(self, prompt):
        if self.model is None:
            # Load the quantized model the first time a GPU is allocated
            self.model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                device_map='auto',
                trust_remote_code=True,
            )

        # Tokenize the input prompt and move it to the GPU
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

        # Generate text
        outputs = self.model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=512,
        )

        # Decode the generated tokens back into a string
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text


# Instantiate the model wrapper
model_wrapper = ModelWrapper()

# Create the Gradio interface
interface = gr.Interface(
    fn=model_wrapper.generate,
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Mistral-Large-Instruct-2407 Text Completion",
    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model.",
)

if __name__ == "__main__":
    interface.launch()
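
# --- Optional: chat-template prompt formatting -------------------------------
# The interface above passes the raw prompt straight to the model. For an
# instruct-tuned checkpoint, wrapping the input with the tokenizer's chat
# template usually yields better completions. A minimal sketch, assuming the
# checkpoint ships a chat template in its tokenizer config (the message text
# below is illustrative only):
#
#   messages = [{"role": "user", "content": "Explain INT4 quantization."}]
#   prompt = tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True
#   )
#   completion = model_wrapper.generate(prompt)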
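
# --- Optional: querying the running app programmatically ---------------------
# A minimal sketch using gradio_client against a locally launched instance;
# the URL assumes Gradio's default host and port, and "/predict" is the
# default api_name for a single gr.Interface:
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict("Once upon a time", api_name="/predict")
#   print(result)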