from llama_cpp import Llama
import torch
import gradio as gr

# torch is only used to report the environment at startup; llama.cpp decides
# its own GPU offloading at build/load time.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device:", torch_device)
print("CPU threads:", torch.get_num_threads())

# Load the quantized GGUF model with a 1024-token context window.
llm = Llama(model_path="Llama-2-ko-7B-chat-gguf-q4_0.bin", n_ctx=1024)


def gen(x, max_new_tokens):
    # Wrap the user input in a simple Q/A prompt and stop at the next
    # question or newline. max_new_tokens comes from the UI slider
    # (it was previously ignored in favor of a hardcoded 1024).
    output = llm(
        f"Q: {x} A: ",
        max_tokens=max_new_tokens,
        stop=["Q:", "\n"],
        echo=True,
    )
    # The tokenizer marks word boundaries with '▁'; map them back to spaces.
    return output["choices"][0]["text"].replace("▁", " ")


def reset_textbox():
    return gr.update(value="")


with gr.Blocks() as demo:
    duplicate_link = "https://huggingface.co/spaces/beomi/KoRWKV-1.5B?duplicate=true"
    gr.Markdown(
        "Duplicated from beomi/KoRWKV-1.5B; base model: EleutherAI/polyglot-ko-1.3b"
    )

    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                placeholder="우리 여행 갈래?",  # "Shall we go on a trip?"
                label="User input",
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")

        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1,
                maximum=200,
                value=20,
                step=1,
                interactive=True,
                label="Max New Tokens",
            )

    button_submit.click(gen, [user_text, max_new_tokens], model_output)

# .queue() replaces the deprecated launch(enable_queue=True) form.
demo.queue(max_size=32).launch()
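# ---------------------------------------------------------------------------
# Optional streaming variant (a sketch, not part of the original demo).
# llama-cpp-python can stream tokens by passing stream=True, which makes the
# call return an iterator of partial chunks, and Gradio renders a generator
# function's yields incrementally when the queue is enabled (it is above).
# To try it, define gen_stream before the gr.Blocks() context and pass it to
# button_submit.click in place of gen. It is left as a comment here because
# launch() above blocks, so code placed after it would not run while the
# server is up.
#
# def gen_stream(x, max_new_tokens):
#     text = ""
#     for chunk in llm(f"Q: {x} A: ", max_tokens=max_new_tokens,
#                      stop=["Q:", "\n"], stream=True):
#         # Each chunk carries only the newly generated piece of text.
#         text += chunk["choices"][0]["text"]
#         yield text.replace("▁", " ")
# ---------------------------------------------------------------------------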