How to run this code locally?

#5
by ali0une - opened

Hi there!

i find this space amazing!

I'd like to run this code, but using my local GGUF models.

What should be modified in the app.py code to achieve this?

I've put models in GGUF format in the models directory.

I tried it like this:

import spaces
import logging
import gradio as gr
from huggingface_hub import hf_hub_download

from llama_cpp import Llama
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.llm_output_settings import (
    LlmStructuredOutputSettings,
    LlmStructuredOutputType,
)
from llama_cpp_agent.tools import WebSearchTool
from llama_cpp_agent.prompt_templates import web_search_system_prompt, research_system_prompt
from ui import css, PLACEHOLDER
from utils import CitingSources
from settings import get_context_by_model, get_messages_formatter_type

#hf_hub_download(
#    repo_id="bartowski/Mistral-7B-Instruct-v0.3-GGUF",
#    filename="Mistral-7B-Instruct-v0.3-Q4_K_M.gguf",
#    local_dir="./models"
#)
#hf_hub_download(
#    repo_id="bartowski/Meta-Llama-3-8B-Instruct-GGUF",
#    filename="Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
#    local_dir="./models"
#)
#hf_hub_download(
#    repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF",
#    filename="mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
#    local_dir="./models"
#)

# Sample prompts handed to the chat interface as clickable examples.
# Each entry is a single-element list holding the prompt text.
examples = [
    [prompt]
    for prompt in (
        "latest news about Yann LeCun",
        "Latest news site:github.blog",
        "Where I can find best hotel in Galapagos, Ecuador intitle:hotel",
        "filetype:pdf intitle:python",
    )
]

# NOTE(review): not referenced anywhere in the visible code; the first docstring
# sentence is kept verbatim in case it is consumed as a tool description.
def write_message_to_user():
    """
    Let you write a message to the user.

    Returns:
        The fixed instruction string asking for a message to the user.
    """
    instruction = "Please write the message to the user."
    return instruction


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    model = 'Mistral-7B-Instruct-v0.3-Q4_K_M.gguf',
    system_message = 'Helpful assistant',
    max_tokens = 2048,
    temperature = 0.45,
    top_p = 0.95,
    top_k = 40,
    repeat_penalty = 1.1,
):
    """
    Answer a chat message with a web-search-backed research document.

    This is a generator: it first yields the resolved model file name, then
    yields the growing answer text as it streams in, and finally yields the
    answer with a "Sources:" section appended.

    Args:
        message: The user's request.
        history: Previous (user, assistant) exchanges, replayed into the
            chat history given to the answering agent.
        model: Model label or file name. Anything containing "Mistral" or
            "Mixtral" selects that model; everything else selects Llama 3.
        system_message: Accepted so the signature matches the UI inputs; it
            is not used here (both agents use the imported prompt templates).
        max_tokens: Sampling setting forwarded to the provider settings.
        temperature: Sampling setting forwarded to the provider settings.
        top_p: Sampling setting forwarded to the provider settings.
        top_k: Sampling setting forwarded to the provider settings.
        repeat_penalty: Sampling setting forwarded to the provider settings.

    Yields:
        str: The model file name, then the cumulative response text.
    """
    # Map the UI label (or any string) onto a GGUF file inside ./models.
    if "Mistral" in model:
        model = 'Mistral-7B-Instruct-v0.3-Q4_K_M.gguf'
    elif "Mixtral" in model:
        model = 'mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf'
    else:
        model = 'Meta-Llama-3-8B-Instruct-Q4_K_M.gguf'
    # Interim output so the chat shows which model file is being loaded.
    yield model

    chat_template = get_messages_formatter_type(model)

    # Llama() evaluates min(n_ctx, n_batch); a None context size makes that
    # raise "TypeError: '<' not supported between instances of 'int' and
    # 'NoneType'". Presumably get_context_by_model() has no entry for this
    # file name (verify against settings.py), so fall back to a default.
    n_ctx = get_context_by_model(model)
    if n_ctx is None:
        fallback_n_ctx = 8192  # NOTE(review): conservative guess; tune per model
        logging.warning(
            "No context size configured for model %r; falling back to n_ctx=%d",
            model,
            fallback_n_ctx,
        )
        n_ctx = fallback_n_ctx

    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=True,
        n_gpu_layers=10,
        n_batch=1024,
        n_ctx=n_ctx,
    )
    provider = LlamaCppPythonProvider(llm)
    logging.info("Using chat template: %s", chat_template)
    search_tool = WebSearchTool(
        llm_provider=provider,
        message_formatter_type=chat_template,
        max_tokens_search_results=12000,
        max_tokens_per_summary=2048,
    )

    # One agent decides on / performs the web search, the other writes the answer.
    web_search_agent = LlamaCppAgent(
        provider,
        system_prompt=web_search_system_prompt,
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    answer_agent = LlamaCppAgent(
        provider,
        system_prompt=research_system_prompt,
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.stream = False
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p

    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty

    # Constrain the first call to the web-search tool.
    output_settings = LlmStructuredOutputSettings.from_functions(
        [search_tool.get_tool()]
    )

    messages = BasicChatHistory()

    # Replay earlier turns so the answering agent sees the conversation.
    for turn in history:
        user = {"role": Roles.user, "content": turn[0]}
        assistant = {"role": Roles.assistant, "content": turn[1]}
        messages.add_message(user)
        messages.add_message(assistant)

    result = web_search_agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        structured_output_settings=output_settings,
        add_message_to_chat_history=False,
        add_response_to_chat_history=False,
        print_output=False,
    )

    outputs = ""

    # Switch to streaming for the long-form answer.
    settings.stream = True
    response_text = answer_agent.get_chat_response(
        f"Write a detailed and complete research document that fulfills the following user request: '{message}', based on the information from the web below.\n\n" +
        result[0]["return_value"],
        role=Roles.tool,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Yield the cumulative text so the UI re-renders the whole answer each step.
    for text in response_text:
        outputs += text
        yield outputs

    output_settings = LlmStructuredOutputSettings.from_pydantic_models(
        [CitingSources], LlmStructuredOutputType.object_instance
    )

    citing_sources = answer_agent.get_chat_response(
        "Cite the sources you used in your response.",
        role=Roles.tool,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=False,
        structured_output_settings=output_settings,
        print_output=False,
    )
    outputs += "\n\nSources:\n"
    outputs += "\n".join(citing_sources.sources)
    yield outputs


# Extra controls shown beside the chat box. Their order must line up with the
# parameters of respond() that follow (message, history).
_model_dropdown = gr.Dropdown(
    [
        'Mistral 7B Instruct v0.3',
        'Mixtral 8x7b Instruct v0.1',
        'Llama 3 8B Instruct',
    ],
    value="Mistral 7B Instruct v0.3",
    label="Model",
)
_additional_inputs = [
    _model_dropdown,
    gr.Textbox(value=web_search_system_prompt, label="System message"),
    gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.45, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k"),
    gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty"),
]

# Soft theme with green/lime accents and custom dark-mode colours.
_base_theme = gr.themes.Soft(
    primary_hue="green",
    secondary_hue="lime",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
)
_theme = _base_theme.set(
    body_background_fill_dark="#0c0505",
    block_background_fill_dark="#0c0505",
    block_border_width="1px",
    block_title_background_fill_dark="#1b0f0f",
    input_background_fill_dark="#140b0b",
    button_secondary_background_fill_dark="#140b0b",
    border_color_accent_dark="#1b0f0f",
    border_color_primary_dark="#1b0f0f",
    background_fill_secondary_dark="#0c0505",
    color_accent_soft_dark="transparent",
    code_background_fill_dark="#140b0b",
)

_chatbot = gr.Chatbot(
    scale=1,
    placeholder=PLACEHOLDER,
    show_copy_button=True,
)

# Chat UI wired to the respond() generator.
demo = gr.ChatInterface(
    respond,
    additional_inputs=_additional_inputs,
    theme=_theme,
    css=css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    examples=examples,
    description="Llama-cpp-agent: Chat Web Search DDG Agent",
    analytics_enabled=False,
    chatbot=_chatbot,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()

But I get an error:

  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/queueing.py", line 580, in process_events
    response = await route_utils.call_process_api(
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/route_utils.py", line 276, in call_process_api
    output = await app.get_blocks().process_api(
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/blocks.py", line 1928, in process_api
    result = await self.call_function(
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/blocks.py", line 1526, in call_function
    prediction = await utils.async_iteration(iterator)
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 657, in async_iteration
    return await iterator.__anext__()
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 783, in asyncgen_wrapper
    response = await iterator.__anext__()
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/chat_interface.py", line 608, in _stream_fn
    async for response in generator:
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 650, in __anext__
    return await anyio.to_thread.run_sync(
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
    return await future
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
    result = context.run(func, *args)
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/gradio/utils.py", line 633, in run_sync_iterator_async
    return next(iterator)
  File "/whatever/ddg-web-search-chat/app.py", line 73, in respond
    llm = Llama(
  File "/whatever/ddg-web-search-chat/venv/lib/python3.10/site-packages/llama_cpp/llama.py", line 279, in __init__
    self.n_batch = min(n_ctx, n_batch)  # ???
TypeError: '<' not supported between instances of 'int' and 'NoneType'
ali0une changed discussion title from How to run this code localy? to How to run this code locally?

Hello!

Glad you like it!

remove:

import spaces

@spaces.GPU(duration=120)

You need a GPU. If you don't have one, then you need to adjust:

n_batch=1024,  # use 512
n_gpu_layers=10,

Your error looks like one of the arguments in `self.n_batch = min(n_ctx, n_batch)  # ???` is not an int.

Thank you!

I made the modifications you proposed but still get the same error.
TypeError: '<' not supported between instances of 'int' and 'NoneType'

Your error looks like one of the arguments in `self.n_batch = min(n_ctx, n_batch)  # ???` is not an int.

Yes, I'm currently searching the web, and it looks like this is the problem.
But I don't know why!

What part of the code could send NoneType instead of an int?

I thought that if the code can run on a Space, it should also run on my computer in a Python venv.

Poscye org

Yeah, it would run. I would create a local repo on GitHub, but currently I don't have a GPU, which is why I use the humble HF Spaces, hehe.

Sign up or log in to comment