kwabs22 commited on
Commit
e3894fb
1 Parent(s): 5dd2646

Working, but added inference times

Browse files
Files changed (2) hide show
  1. README.md +4 -0
  2. app.py +5 -3
README.md CHANGED
@@ -5,6 +5,10 @@ colorFrom: blue
5
  colorTo: purple
6
  sdk: docker
7
  pinned: false
 
 
8
  ---
9
 
 
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
5
  colorTo: purple
6
  sdk: docker
7
  pinned: false
8
+ models:
9
+ - stabilityai/stablelm-2-zephyr-1_6b
10
  ---
11
 
12
+ Example of running llama.cpp (and, by extension, simple C++ programs) from Python without pip package dependency issues
13
+
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
2
  #from llama_cpp import Llama
3
  import random
4
  import subprocess
 
 
5
 
6
  # Initialize model
7
  #llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
@@ -46,21 +48,21 @@ def generate_response(user_message):
46
  "-e"
47
  ]
48
 
49
- print("Before request")
50
  # Start the subprocess
51
  process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
52
 
 
53
  alllines = ""
54
 
55
  # Yield each line of output as it becomes available
56
  for line in process.stdout:
57
  alllines += " " + line
58
- yield alllines
 
59
 
60
  # Wait for the subprocess to finish if it hasn't already
61
  process.wait()
62
 
63
- print("After response")
64
  # Check for any errors
65
  if process.returncode != 0:
66
  error_message = process.stderr.read()
 
2
  #from llama_cpp import Llama
3
  import random
4
  import subprocess
5
+ import time
6
+
7
 
8
  # Initialize model
9
  #llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
 
48
  "-e"
49
  ]
50
 
 
51
  # Start the subprocess
52
  process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
53
 
54
+ start_time = time.time()
55
  alllines = ""
56
 
57
  # Yield each line of output as it becomes available
58
  for line in process.stdout:
59
  alllines += " " + line
60
+ elapsed_time = time.time() - start_time # Calculate elapsed time
61
+ yield f"{alllines} [Inference time: {elapsed_time:.2f} seconds]"
62
 
63
  # Wait for the subprocess to finish if it hasn't already
64
  process.wait()
65
 
 
66
  # Check for any errors
67
  if process.returncode != 0:
68
  error_message = process.stderr.read()