import spaces
import gradio as gr
import mlx.core as mx
from airllm import AutoModel

MAX_LENGTH = 128

# AirLLM loads the checkpoint layer by layer, so the full 8B model never has
# to sit in memory at once. AutoModel.from_pretrained is AirLLM's documented
# entry point; the mx.array path below uses its MLX (Apple Silicon) backend,
# while @spaces.GPU targets Hugging Face ZeroGPU hardware.
model = AutoModel.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")


@spaces.GPU
def generate_text(input_text):
    # Tokenize to NumPy arrays so the ids can be wrapped in an mx.array below.
    input_tokens = model.tokenizer(
        input_text,
        return_tensors="np",
        return_attention_mask=False,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
    )
    generation_output = model.generate(
        mx.array(input_tokens["input_ids"]),
        max_new_tokens=20,
        use_cache=True,
        return_dict_in_generate=True,
    )
    # Decode the generated token ids back into text for the Gradio textbox,
    # following AirLLM's documented decode pattern.
    return model.tokenizer.decode(generation_output.sequences[0])


iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(placeholder="Enter prompt..."),
    outputs="text",
    title="LLaMA 3 8B Text Generation",
)

iface.launch(server_name="0.0.0.0", server_port=7860)
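
# Usage sketch (illustrative, not executed by this app): once the server is
# running, the endpoint can be queried from another process with the
# gradio_client package. This assumes the default single-function Interface
# route "/predict" and that the app is reachable at 127.0.0.1:7860.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict("What is the capital of France?", api_name="/predict")
#   print(result)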