import spaces
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Model identifier
model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"

# Load the tokenizer once at startup (no GPU required for this step)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)


class ModelWrapper:
    def __init__(self):
        self.model = None  # Model is loaded lazily, once a GPU is allocated

    @spaces.GPU
    def generate(self, prompt):
        if self.model is None:
            # Load the quantized model the first time a GPU is allocated
            self.model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                device_map='auto',
                trust_remote_code=True,
            )

        # Tokenize the input prompt and move it to the GPU
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

        # Generate text
        outputs = self.model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=512,
        )

        # Decode the generated tokens back into a string
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text


# Instantiate the model wrapper
model_wrapper = ModelWrapper()

# Create the Gradio interface
interface = gr.Interface(
    fn=model_wrapper.generate,
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Mistral-Large-Instruct-2407 Text Completion",
    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model.",
)

if __name__ == "__main__":
    interface.launch()
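
# --- Optional: chat-template prompt formatting -------------------------------
# The interface above passes the raw prompt straight to the model. For an
# instruct-tuned checkpoint, wrapping the input with the tokenizer's chat
# template usually yields better completions. A minimal sketch, assuming the
# checkpoint ships a chat template in its tokenizer config (the message text
# below is illustrative only):
#
#   messages = [{"role": "user", "content": "Explain INT4 quantization."}]
#   prompt = tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True
#   )
#   completion = model_wrapper.generate(prompt)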
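
# --- Optional: querying the running app programmatically ---------------------
# A minimal sketch using gradio_client against a locally launched instance;
# the URL assumes Gradio's default host and port, and "/predict" is the
# default api_name for a single gr.Interface:
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict("Once upon a time", api_name="/predict")
#   print(result)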