import gradio as gr
from llama_cpp import Llama

# Load the model
llm = Llama.from_pretrained(
    repo_id="bartowski/Marco-o1-GGUF",
    filename="Marco-o1-Q4_K_M.gguf",
)

# Access the tokenizer from the Llama model
# (llama-cpp-python exposes it via Llama.tokenizer(), not get_tokenizer())
tokenizer = llm.tokenizer()


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Initialize an empty list to hold tokenized messages
    tokenized_messages = []

    # Tokenize the system message (keep the default BOS token here only)
    tokenized_messages.append(tokenizer.encode(system_message))

    # Tokenize the history messages; add_bos=False so a BOS token is not
    # re-inserted before every message in the concatenated prompt
    for user_msg, assistant_msg in history:
        if user_msg:
            tokenized_messages.append(tokenizer.encode(user_msg, add_bos=False))
        if assistant_msg:
            tokenized_messages.append(tokenizer.encode(assistant_msg, add_bos=False))

    # Tokenize the current user message
    tokenized_messages.append(tokenizer.encode(message, add_bos=False))

    response = ""

    # Use llm.create_completion with the tokenized messages
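    # What follows is a sketch of the missing completion step, assuming the
    # standard llama-cpp-python API: create_completion() accepts a flat list
    # of token ids as its prompt, so the per-message token lists are
    # concatenated first. Note that this plain concatenation applies no chat
    # template; separator tokens could be added if the model expects them.
    prompt_tokens = [tok for msg in tokenized_messages for tok in msg]

    # Stream the completion and yield partial text so Gradio updates live
    for chunk in llm.create_completion(
        prompt_tokens,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    ):
        response += chunk["choices"][0]["text"]
        yield response


# Wire respond() into a chat UI. The extra inputs mirror the respond()
# signature above (system message, max tokens, temperature, top-p); the
# default values below are illustrative assumptions, not from the original.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()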