kwabs22 committed on
Commit
7e3fb58
·
1 Parent(s): ee032a8

After the bufsize=1 change, exploring word-level or token-level streaming

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -3,6 +3,7 @@ import random
3
  import subprocess
4
  import time
5
 
 
6
  def generate_response(user_message): #Figure Out the parameters later and find a way to get the ram usage
7
  cmd = [
8
  "/app/llama.cpp/main", # Path to the executable
@@ -32,9 +33,9 @@ def generate_response(user_message): #Figure Out the parameters later and find a
32
  if process.returncode != 0:
33
  error_message = process.stderr.read()
34
  print(f"Error: {error_message}")
 
35
 
36
-
37
- def generate_response_token_by_token(user_message):
38
  cmd = [
39
  "/app/llama.cpp/main", # Path to the executable
40
  "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
@@ -58,13 +59,14 @@ def generate_response_token_by_token(user_message):
58
  if char == ' ' or char == '\n': # Token delimiters
59
  elapsed_time = time.time() - start_time # Calculate elapsed time
60
  alltokens += token_buffer
61
- yield f"{alltokens} [Inference time: {elapsed_time:.2f} seconds]"
62
  token_buffer = '' # Reset token buffer
63
 
64
  # Yield the last token if there is any
65
  if token_buffer:
66
  elapsed_time = time.time() - start_time # Calculate elapsed time
67
- yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"
 
68
 
69
  # Wait for the subprocess to finish if it hasn't already
70
  process.wait()
@@ -93,7 +95,7 @@ with gr.Blocks() as iface:
93
  description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
94
  flagging_dir="/usr/src/app/flagged",
95
  )
96
- gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More Responsive streaming test")
97
  with gr.Group():
98
  gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
99
  MainOutput = gr.TextArea(placeholder='Output will show here')
 
3
  import subprocess
4
  import time
5
 
6
+ """
7
  def generate_response(user_message): #Figure Out the parameters later and find a way to get the ram usage
8
  cmd = [
9
  "/app/llama.cpp/main", # Path to the executable
 
33
  if process.returncode != 0:
34
  error_message = process.stderr.read()
35
  print(f"Error: {error_message}")
36
+ """
37
 
38
+ def generate_response(user_message): #generate_response_token_by_token
 
39
  cmd = [
40
  "/app/llama.cpp/main", # Path to the executable
41
  "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
 
59
  if char == ' ' or char == '\n': # Token delimiters
60
  elapsed_time = time.time() - start_time # Calculate elapsed time
61
  alltokens += token_buffer
62
+ yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds]"
63
  token_buffer = '' # Reset token buffer
64
 
65
  # Yield the last token if there is any
66
  if token_buffer:
67
  elapsed_time = time.time() - start_time # Calculate elapsed time
68
+ alltokens += token_buffer
69
+ yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds]"
70
 
71
  # Wait for the subprocess to finish if it hasn't already
72
  process.wait()
 
95
  description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
96
  flagging_dir="/usr/src/app/flagged",
97
  )
98
+ #gr.Interface(fn=generate_response_token_by_token, inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'), outputs="text", description="More Responsive streaming test")
99
  with gr.Group():
100
  gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
101
  MainOutput = gr.TextArea(placeholder='Output will show here')