"""Gradio front end that streams llama.cpp output for a StableLM 2 Zephyr model."""

import subprocess
import tempfile
import time

import gradio as gr
import psutil

# How often (in seconds) the subprocess resource usage is logged.
_MONITOR_INTERVAL_SECONDS = 60
# How long (in seconds) to wait for the subprocess to exit before killing it.
_PROCESS_EXIT_TIMEOUT_SECONDS = 60


def _log_resource_usage(process_monitor):
    """Print CPU and memory usage of the monitored subprocess (best effort).

    Monitoring is purely diagnostic, so psutil failures (for example the
    process having already exited) are reported instead of propagated.
    """
    try:
        cpu_usage = process_monitor.cpu_percent()
        memory_usage = process_monitor.memory_info().rss  # in bytes
    except psutil.Error as exc:
        print(f"Could not read subprocess resource usage: {exc}")
        return
    print(f"Subprocess CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage / 1024 ** 2} MB")


def generate_response(user_message):
    """Run llama.cpp on *user_message* and stream the output token by token.

    This is a generator: after every whitespace-delimited token it yields the
    full text produced so far followed by a status line with the elapsed time
    and token count. If the output does not end with a delimiter, a final
    value containing the trailing fragment and the average tokens per second
    is yielded.

    Parameters:
    - user_message: The prompt passed to llama.cpp via ``-p``.

    Yields:
    - str: The accumulated output plus a status line.
    """
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]

    # stderr goes to a temporary file rather than a pipe: nothing reads stderr
    # while stdout is being streamed, so a pipe could fill up and block the
    # subprocess. The file can still be read back afterwards for diagnostics.
    with tempfile.TemporaryFile(mode="w+", errors="replace") as stderr_file:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=stderr_file,
            text=True,
            bufsize=1,
        )

        try:
            process_monitor = psutil.Process(process.pid)
            # The first cpu_percent() call always returns a meaningless 0.0;
            # prime it here so the first logged value is a real measurement.
            process_monitor.cpu_percent()
        except psutil.Error:
            process_monitor = None  # Monitoring is optional; keep generating.

        start_time = time.time()
        monitor_start_time = time.time()
        alltokens = ""
        token_buffer = ""
        tokencount = 0

        try:
            while True:
                # Read one character at a time so tokens can be streamed as
                # soon as they are produced.
                char = process.stdout.read(1)
                if char == "":
                    # EOF: the subprocess closed stdout. Its exit is awaited in
                    # the ``finally`` clause instead of spinning on read().
                    break

                token_buffer += char
                if char == " " or char == "\n":  # Token delimiters
                    elapsed_time = time.time() - start_time
                    alltokens += token_buffer
                    tokencount += 1
                    yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Tokens: {tokencount}]"
                    token_buffer = ""  # Reset token buffer

                # Log resource usage every minute
                if time.time() - monitor_start_time > _MONITOR_INTERVAL_SECONDS:
                    if process_monitor is not None:
                        _log_resource_usage(process_monitor)
                    monitor_start_time = time.time()  # Reset the timer

            # Yield the last token if there is any
            if token_buffer:
                elapsed_time = time.time() - start_time
                alltokens += token_buffer
                tokens_per_second = round(tokencount / elapsed_time, 2) if elapsed_time > 0 else 0.0
                yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Average Tokens per second: {tokens_per_second}]"
        finally:
            try:
                # Wait for the process to complete, with a timeout
                process.wait(timeout=_PROCESS_EXIT_TIMEOUT_SECONDS)
            except subprocess.TimeoutExpired:
                print("Process didn't complete within the timeout. Killing it.")
                process.kill()
                process.wait()  # Ensure proper cleanup

            process.stdout.close()

            # Report captured stderr only when the subprocess failed.
            if process.returncode != 0:
                stderr_file.seek(0)
                error_message = stderr_file.read()
                print(f"Error: {error_message}")


def custom_generate_response(cust_user_message, prompt_index):
    """
    Generates a custom response based on the user message and the selected prompt,
    including a custom ending specific to the prompt.

    Parameters:
    - cust_user_message: The message input from the user.
    - prompt_index: The index of the custom prompt to use.

    Yields:
    - str: The values yielded by generate_response for the combined prompt.
    """
    prompt, ending = CustomPrompts[prompt_index]  # Unpack the prompt and its ending
    cust_user_message = f"{prompt}\n\n{cust_user_message}\n\n{ending}"
    yield from generate_response(cust_user_message)


# Each entry is (instruction placed before the user text, ending placed after it).
CustomPrompts = [
    ("Write a Class Diagram based on the following text:", "Class Diagram:"),
    ("Write a Pydot code based on the following text:", "Pydot Code:"),
    ("Describe what a standard happy scene in any movie would be planned in great detail, based on the following text:", "Scene Details"),
    ("Explain a teardown of the product mentioned in the following text:", "Teardown Details:"),
    ("Explain the manufacturing of the product mentioned in the following text:", "Manufacturing Details:"),
    ("Explain the marketing considerations of the product mentioned in the following text:", "Considerations:"),
    ("Explain the target users considerations of the product mentioned in the following text:", "Target Users Considerations:"),
    ("My problem to solve is", "- please make 10 sub problems have to solve from this:"),
]

with gr.Blocks() as iface:
    gr.HTML("Stabilityai's demo - https://huggingface.co/spaces/stabilityai/stablelm-2-1_6b-zephyr")
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) LLama.cpp Interface Test (Inconsistent Performance - 100 tokens in 50 secs (when this HF space is updated) or 800+ secs(HF space open for long))",
        description="No Prompt template used yet (Essentially autocomplete). No Message History for now - Enter your message and get a response.",
        flagging_dir="/usr/src/app/flagged",
    )

    with gr.Group():
        gr.HTML("Test for wrapping generator (Instead of buttons tabs and dropdowns?)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')

        # Dynamically create buttons and assign actions
        for index, (prompt, _) in enumerate(CustomPrompts):
            button = gr.Button(prompt)
            # gr.State(index) carries the prompt index to the handler for this button.
            button.click(custom_generate_response, inputs=[CustomButtonInput, gr.State(index)], outputs=MainOutput)

iface.queue().launch(server_name="0.0.0.0", share=True)