Spaces: Running on Zero
```python
import spaces
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Model identifier
model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"

# Load the tokenizer (runs on CPU; no GPU needed yet)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)


class ModelWrapper:
    def __init__(self):
        self.model = None  # Deferred: loaded on first call, once a GPU is allocated

    @spaces.GPU  # On ZeroGPU, a GPU is attached only for the duration of this call
    def generate(self, prompt):
        if self.model is None:
            # Load the quantized model the first time a GPU is available
            self.model = AutoGPTQForCausalLM.from_quantized(
                model_id,
                device_map='auto',
                trust_remote_code=True,
            )
        # Tokenize the input prompt and move it to the GPU
        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
        # Generate text
        outputs = self.model.generate(
            **inputs,
            do_sample=True,
            max_new_tokens=512,
        )
        # Decode the generated tokens back into a string
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text


# Instantiate the model wrapper
model_wrapper = ModelWrapper()

# Create the Gradio interface
interface = gr.Interface(
    fn=model_wrapper.generate,
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Mistral-Large-Instruct-2407 Text Completion",
    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model.",
)

if __name__ == "__main__":
    interface.launch()
```
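A Space also needs its dependencies declared. A minimal `requirements.txt` sketch, assuming the standard PyPI package names (version pins omitted; note that auto-gptq requires a CUDA-enabled torch build):

```text
spaces
gradio
transformers
auto-gptq
torch
```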