KingNish committed on
Commit
d3ffa5e
·
verified ·
1 Parent(s): 66b33d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -3
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
  import subprocess
 
3
  from llama_cpp import Llama
4
  from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
5
  from llama_cpp_agent.providers import LlamaCppPythonProvider
@@ -40,9 +41,9 @@ def respond(
40
  if llm is None or llm_model != model:
41
  llm = Llama(
42
  model_path=f"models/{model}",
43
- n_gpu_layers=0,
44
- n_batch=32000,
45
- n_ctx=2048,
46
  )
47
  llm_model = model
48
 
@@ -77,6 +78,9 @@ def respond(
77
  messages.add_message(user)
78
  messages.add_message(assistant)
79
 
 
 
 
80
  stream = agent.get_chat_response(
81
  message,
82
  llm_sampling_settings=settings,
@@ -88,8 +92,15 @@ def respond(
88
  outputs = ""
89
  for output in stream:
90
  outputs += output
 
91
  yield outputs
92
 
 
 
 
 
 
 
93
  description = """<p><center>
94
  <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
95
 
 
1
  import json
2
  import subprocess
3
+ import time
4
  from llama_cpp import Llama
5
  from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
6
  from llama_cpp_agent.providers import LlamaCppPythonProvider
 
41
  if llm is None or llm_model != model:
42
  llm = Llama(
43
  model_path=f"models/{model}",
44
+ n_gpu_layers=4, # Adjust based on your GPU
45
+ n_batch=64000, # Adjust based on your RAM
46
+ n_ctx=1024, # Adjust based on your RAM and desired context length
47
  )
48
  llm_model = model
49
 
 
78
  messages.add_message(user)
79
  messages.add_message(assistant)
80
 
81
+ start_time = time.time()
82
+ token_count = 0
83
+
84
  stream = agent.get_chat_response(
85
  message,
86
  llm_sampling_settings=settings,
 
92
  outputs = ""
93
  for output in stream:
94
  outputs += output
95
+ token_count += len(output.split())
96
  yield outputs
97
 
98
+ end_time = time.time()
99
+ latency = end_time - start_time
100
+ speed = token_count / (end_time - start_time)
101
+ print(f"Latency: {latency} seconds")
102
+ print(f"Speed: {speed} tokens/second")
103
+
104
  description = """<p><center>
105
  <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
106