import gradio as gr
#from llama_cpp import Llama
import random
import subprocess
import time

# Initialize model
#llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))

"""
def generate_response(user_message):
    encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
    tokens = llm.tokenize(encodeduserm)
    output = b""
    count = 0
    for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1):
        text = llm.detokenize([token])
        output += text
        count += 1
        if count >= 500 or (token == llm.token_eos()):
            break
    return output.decode()
"""

"""
def generate_response(user_message):
    print("Before request")
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    print("After response")
    return result.stdout
"""

def generate_response(user_message):
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]

    # Start the subprocess
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    start_time = time.time()
    alllines = ""

    # Yield each line of output as it becomes available
    for line in process.stdout:
        alllines += " " + line
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        yield f"{alllines} [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")

iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="LLaMA Chat Interface",
    description="Enter your message and get a response from the LLaMA model.",
    flagging_dir="/usr/src/app/flagged",
)

iface.launch(server_name="0.0.0.0")  # share=True