llama-cpp-agent

Paused

File size: 3,542 Bytes

import spaces
import subprocess
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from llama_index.core.memory import ChatMemoryBuffer

# Best-effort runtime installs of the inference stack.
# Each command is passed as an argument list (no shell involved), and a
# failure is reported but never aborts start-up, so the script keeps going
# exactly as it did when the exit status was ignored.
# NOTE(review): the llama_cpp / llama_cpp_agent imports at the top of this file
# execute before these installs, so both packages must already be importable;
# confirm whether these calls are still required.
for _pip_args in (
    [
        "pip",
        "install",
        "llama-cpp-python==0.2.75",
        "--extra-index-url",
        "https://abetlen.github.io/llama-cpp-python/whl/cu124",
    ],
    ["pip", "install", "llama-cpp-agent"],
):
    _pip_cmd = " ".join(_pip_args)
    try:
        _pip_status = subprocess.run(_pip_args).returncode
    except OSError as exc:
        # Raised when the `pip` executable cannot be started (e.g. not on PATH).
        print(f"warning: could not run '{_pip_cmd}': {exc}")
        continue
    if _pip_status != 0:
        print(f"warning: '{_pip_cmd}' exited with status {_pip_status}")

# Download the Mistral-7B-Instruct GGUF weights into ./models at import time;
# `respond` loads the model from this same relative path.
hf_hub_download(
    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    filename="mistral-7b-instruct-v0.2.Q6_K.gguf",
    local_dir="./models",
)

@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream the assistant's reply to ``message`` for the chat interface.

    The GGUF model under ``models/`` is loaded on every call, wrapped in a
    ``LlamaCppAgent`` that uses the Mistral message format, and asked for a
    streaming response with the given sampling settings.

    Args:
        message: The latest user message.
        history: Earlier ``(user, assistant)`` turns of the conversation.
            NOTE(review): this is currently not forwarded to the agent, so
            each reply is generated without the previous turns -- confirm
            whether that is intended.
        system_message: System prompt handed to the agent.
        max_tokens: Value assigned to the provider's ``max_tokens`` setting.
        temperature: Value assigned to the provider's ``temperature`` setting.
        top_p: Value assigned to the provider's ``top_p`` setting.

    Yields:
        The response text accumulated so far, once per streamed chunk, so the
        UI can display the reply as it grows.
    """
    llama_model = Llama(
        r"models/mistral-7b-instruct-v0.2.Q6_K.gguf",
        n_batch=1024,
        n_threads=0,
        n_gpu_layers=33,
        n_ctx=8192,
        verbose=False,
    )

    provider = LlamaCppPythonProvider(llama_model)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.stream = True
    settings.max_tokens = max_tokens
    settings.temperature = temperature
    settings.top_p = top_p

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
    )

    # Yielding `stream` itself would hand the UI a generator object instead of
    # text. Consume it here and yield the growing reply after every chunk.
    # Assumes each streamed chunk is a text fragment (str) -- TODO confirm
    # against the installed llama-cpp-agent version.
    partial_reply = ""
    for chunk in stream:
        partial_reply += chunk
        yield partial_reply

# Chat UI backed by `respond`. The extra widgets are listed in the same order
# as respond's trailing parameters (system_message, max_tokens, temperature,
# top_p).
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(
            label="System message",
            value="You are a helpful assistant.",
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=2048,
            step=1,
            value=512,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.7,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.1,
            maximum=1.0,
            step=0.05,
            value=0.95,
        ),
    ],
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()