import copy

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized Llama 2 chat model from the Hugging Face Hub.
CONST_REPO_ID = "TheBloke/Llama-2-7B-Chat-GGML"
CONST_FILENAME = "llama-2-7b-chat.ggmlv3.q6_K.bin"

N_CTX = 4096  # context window size in tokens

llm = Llama(
    model_path=hf_hub_download(repo_id=CONST_REPO_ID, filename=CONST_FILENAME),
    n_ctx=N_CTX,
)

pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n"


def generate_text(input_text, history):
    # Gradio passes `history` as a list of (user, assistant) message pairs.
    temp = ""
    if not history:
        # First turn: prepend the system prompt.
        input_text_with_history = (
            f"SYSTEM:{pre_prompt}\nUSER: {input_text} \n ASSISTANT:"
        )
    else:
        # Later turns: continue from the last assistant reply.
        input_text_with_history = f"{history[-1][1]}\n"
        input_text_with_history += f"USER: {input_text}\n ASSISTANT:"

    output = llm(
        input_text_with_history,
        max_tokens=4096,
        stop=[
            "<|prompter|>",
            "<|endoftext|>",
            "<|endoftext|> \n",
            "ASSISTANT:",
            "USER:",
            "SYSTEM:",
        ],
        stream=True,
    )
    # Yield the partial text so the chat UI updates as tokens are generated.
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp


demo = gr.ChatInterface(
    generate_text,
    title=f"Llama 2 on CPU: {CONST_FILENAME}",
    description=f"Running Llama 2 with llama_cpp:\n{CONST_REPO_ID} {CONST_FILENAME}",
    examples=[
        "Hi!",
        "Is it hard to be a machine?",
        "When do I need a doctor?",
        "Ты говоришь по русски? Я злой.",
    ],
    cache_examples=True,
    undo_btn="Undo",
    clear_btn="Clear",
)

demo.queue(concurrency_count=10, max_size=50)
demo.launch()
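
# --- Usage note (a sketch, not part of the original script; the file name app.py is an assumption) ---
# The three runtime dependencies are exactly the packages imported above; a typical setup is:
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
# Gradio then serves the chat UI locally (by default at http://127.0.0.1:7860).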