"""Gradio chat app for Mistral-NeMo served through llama.cpp.

On start-up this script bootstraps its own runtime: it downloads and installs
the CUDA 12.2 toolkit, builds ``llama-cpp-python`` from source against a
pinned ``llama.cpp`` commit, installs ``llama-cpp-agent``, downloads the GGUF
model and finally exposes a streaming chat UI.
"""

# NOTE: `spaces` is imported first, as in the original file.
import spaces

import importlib
import json
import os
import site
import subprocess
import sys

# Single source of truth for the toolkit location so the CMake flags cannot
# drift apart (the include dir previously pointed at /usr/local/cuda-12).
CUDA_HOME = "/usr/local/cuda-12.2"
CUDA_INSTALLER = "cuda_12.2.0_535.54.03_linux.run"
CUDA_INSTALLER_URL = (
    "https://developer.download.nvidia.com/compute/cuda/12.2.0/"
    f"local_installers/{CUDA_INSTALLER}"
)

CMAKE_ARGS = (
    "-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS "
    f"-DCUDA_PATH={CUDA_HOME} "
    f"-DCUDAToolkit_ROOT={CUDA_HOME} "
    f"-DCUDAToolkit_INCLUDE_DIR={CUDA_HOME}/include "
    f"-DCUDAToolkit_LIBRARY_DIR={CUDA_HOME}/lib64"
)
# Editable install of llama-cpp-python, forcing a CMake rebuild.
BUILD_INSTALL_COMMAND = (
    f'CMAKE_ARGS="{CMAKE_ARGS}" FORCE_CMAKE=1 pip install -e .'
)


def run_command(command):
    """Run *command* through the shell and return its decoded stdout.

    The commands used in this file rely on shell features (environment
    prefixes, ``&&``), hence ``shell=True``; only trusted, hard-coded command
    strings must be passed in.

    On a non-zero exit status the command and its stderr are printed and the
    interpreter exits with status 1 (``SystemExit``), so this function never
    returns ``None``.
    """
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    output, error = process.communicate()
    if process.returncode != 0:
        print(f"Error executing command: {command}", file=sys.stderr)
        # Build tools may emit non-UTF-8 bytes; never crash while reporting.
        print(error.decode("utf-8", errors="replace"), file=sys.stderr)
        sys.exit(1)
    return output.decode("utf-8", errors="replace")


def install_cuda():
    """Download the CUDA 12.2 runfile installer and run it silently.

    ``run_command`` terminates the process on failure, so reaching the final
    print means both steps exited with status 0.
    """
    # Download CUDA installer
    run_command(f"wget {CUDA_INSTALLER_URL}")
    # Run CUDA installer in silent mode
    run_command(f"sh {CUDA_INSTALLER} --silent --toolkit --samples --override")
    print("CUDA installation process completed.")


install_cuda()


def install_packages():
    """Build llama-cpp-python from source and install llama-cpp-agent.

    Side effects: clones the repository into the current directory, changes
    the working directory to the clone (and leaves it there), and makes the
    clone importable from the running interpreter.
    """
    # Clone the repository with submodules
    run_command(
        "git clone --recurse-submodules "
        "https://github.com/abetlen/llama-cpp-python.git"
    )
    # Later relative paths (e.g. ./models) resolve inside the clone.
    os.chdir("llama-cpp-python")

    # Checkout the specific commit in the llama.cpp submodule
    run_command("git -C vendor/llama.cpp checkout 50e0535")

    # Upgrade pip
    run_command("pip install --upgrade pip")

    # NOTE(review): CMAKE_ARGS only enables OpenBLAS and points CMake at the
    # CUDA toolkit; no -DGGML_CUDA=ON is passed. Confirm whether a CUDA
    # backend build is intended before relying on GPU offload.
    run_command(BUILD_INSTALL_COMMAND)
    run_command("make clean && GGML_OPENBLAS=1 make -j")
    # Reinstall the package after the make step
    run_command(BUILD_INSTALL_COMMAND)

    # Install llama-cpp-agent
    run_command("pip install llama-cpp-agent")

    # `export PYTHONPATH=...` in a subshell cannot affect this process, so
    # update the environment (for children) and sys.path (for us) directly.
    repo_dir = os.getcwd()
    current = os.environ.get("PYTHONPATH")
    os.environ["PYTHONPATH"] = (
        f"{current}{os.pathsep}{repo_dir}" if current else repo_dir
    )
    if repo_dir not in sys.path:
        sys.path.append(repo_dir)

    print("Installation complete!")


try:
    install_packages()

    # pip has already finished (run_command blocks), so no delay is needed.
    # Re-run site processing and drop stale finder caches so packages
    # installed after interpreter start-up become importable.
    importlib.reload(site)
    importlib.invalidate_caches()

    # Now try to import the libraries
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles

    print("Libraries imported successfully!")
except Exception as e:  # top-level boundary: report and abort start-up
    print(f"Installation failed or libraries couldn't be imported: {str(e)}")
    sys.exit(1)

# Imported after the bootstrap on purpose, matching the original ordering.
import gradio as gr
from huggingface_hub import hf_hub_download

hf_hub_download(
    repo_id="MaziyarPanahi/Mistral-Nemo-Instruct-2407-GGUF",
    filename="Mistral-Nemo-Instruct-2407.Q5_K_M.gguf",
    local_dir="./models",
)

# Initialize LLM outside the respond function so it is loaded only once.
llm = Llama(
    model_path="models/Mistral-Nemo-Instruct-2407.Q5_K_M.gguf",
    flash_attn=True,
    n_gpu_layers=81,
    n_batch=1024,
    n_ctx=32768,
)
provider = LlamaCppPythonProvider(llm)


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream the assistant's reply to *message*.

    Args:
        message: The new user message.
        history: Previous ``(user, assistant)`` turns supplied by the UI.
        system_message: System prompt for the agent.
        max_tokens, temperature, top_p, top_k, repeat_penalty: Sampling
            settings copied onto the provider's default settings.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the UI can
        re-render the growing message.
    """
    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Replay earlier turns into a fresh chat history for this request.
    messages = BasicChatHistory()
    for turn in history:
        messages.add_message({"role": Roles.user, "content": turn[0]})
        messages.add_message({"role": Roles.assistant, "content": turn[1]})

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    outputs = ""
    for output in stream:
        outputs += output
        yield outputs


description = """

[Instruct Model] [Base Model] [GGUF Version]

"""

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
        ),
        gr.Slider(
            minimum=0,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
        ),
        gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition penalty",
        ),
    ],
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    title="Chat with Mistral-NeMo using llama.cpp",
    description=description,
    chatbot=gr.Chatbot(
        scale=1,
        likeable=False,
        show_copy_button=True,
    ),
)

if __name__ == "__main__":
    demo.launch(debug=True)