import spaces

import gradio as gr

from airllm import AutoModel
import mlx.core as mx

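# AirLLM streams transformer layers from disk on demand, so a large
# checkpoint can run without holding the whole model in memory; mlx.core
# supplies the array type consumed by its Apple-silicon (MLX) backend.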
model = AutoModel.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

MAX_LENGTH = 128  # maximum number of prompt tokens kept after truncation

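# On Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU for the
# duration of each call to the decorated function.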
@spaces.GPU
def generate_text(input_text):
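    # Tokenize to NumPy ("np") tensors so the token ids can be wrapped
    # in an mx.array for the MLX backend below.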
    input_tokens = model.tokenizer(input_text,
                                   return_tensors="np",
                                   return_attention_mask=False,
                                   truncation=True,
                                   max_length=MAX_LENGTH,
                                   padding=False)
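
    # use_cache=True reuses attention key/value state between decoding steps.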
    output = model.generate(mx.array(input_tokens['input_ids']),
                            max_new_tokens=20,
                            use_cache=True,
                            return_dict_in_generate=True)

    # Gradio's "text" output expects a string, so stringify in case the
    # backend does not return decoded text directly.
    return str(output)

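# Minimal Gradio UI: one prompt textbox in, generated text out.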
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(placeholder="Enter prompt..."),
    outputs="text",
    title="Llama 3 8B Instruct Text Generation"
)
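
# Bind to all interfaces on port 7860, the port Hugging Face Spaces serves.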
iface.launch(server_name="0.0.0.0", server_port=7860)