Cran-May committed on
Commit b6a6743 · verified · 1 Parent(s): e07a8ca

Update app.py

Files changed (1)
  1. app.py +24 -27
app.py CHANGED
@@ -10,20 +10,23 @@ from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
+from llama_cpp_agent.chat_history.messages import Roles, ChatMessage
 import gradio as gr
 from huggingface_hub import hf_hub_download
 
 llm = None
 llm_model = None
 
-# Download the new model
+# Download the model (ensure the directory exists)
+model_dir = "./models"
+os.makedirs(model_dir, exist_ok=True)
 hf_hub_download(
     repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
     filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
-    local_dir="./models"
+    local_dir=model_dir
 )
 
+
 def get_messages_formatter_type(model_name):
     return MessagesFormatterType.LLAMA_3
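Review note on this hunk: the new code calls os.makedirs here (and os.cpu_count below), but no import os is added anywhere in the diff, so the import is presumably already present in app.py. A minimal, self-contained sketch of the download step under that assumption; hf_hub_download returns the local path of the fetched file, which could be handed straight to Llama(model_path=...):

import os
from huggingface_hub import hf_hub_download

model_dir = "./models"
os.makedirs(model_dir, exist_ok=True)  # create the target directory up front

# hf_hub_download returns the path of the downloaded (or cached) file
model_path = hf_hub_download(
    repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
    filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
    local_dir=model_dir,
)
print(model_path)  # e.g. models/openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf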
 
@@ -31,36 +34,29 @@ def chat_fn(message, history, model, system_message, max_tokens, temperature, to
     history_list = history or []
     response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)
 
-    full_response = ""
     for current_history in response_generator:
-        full_response = current_history[-1][1]  # grab the latest reply
-        yield current_history, history  # yield the current history and the history on each iteration
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    model,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    top_k,
-    repeat_penalty,
-):
+        messages = []
+        for user_msg, bot_msg in current_history:
+            messages.append(ChatMessage(role="user", content=user_msg))
+            messages.append(ChatMessage(role="assistant", content=bot_msg))
+
+        yield messages, history
+
+
+def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     global llm
     global llm_model
-
+
     chat_template = get_messages_formatter_type(model)
 
     if llm is None or llm_model != model:
         llm = Llama(
-            model_path=f"models/{model}",
+            model_path=f"{model_dir}/{model}",
             n_gpu_layers=0,
-            n_batch=4096,  # increase batch size to speed things up
-            n_ctx=8192,  # increase the context length to 8192
-            n_threads=2,  # use all available CPU cores
-            f16_kv=True,  # use FP16 to reduce memory usage
+            n_batch=4096,
+            n_ctx=8192,
+            n_threads=os.cpu_count(),
+            f16_kv=True,
         )
         llm_model = model
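The rewritten chat_fn flattens the (user, bot) tuple history into an alternating role/content list before yielding it. A minimal standalone sketch of that same pattern, with plain dicts standing in for llama_cpp_agent's ChatMessage class:

def tuples_to_messages(history):
    """Flatten [(user, bot), ...] pairs into role/content records."""
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    return messages

assert tuples_to_messages([("hi", "hello!")]) == [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello!"},
]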
 
@@ -77,7 +73,7 @@ def respond(
     settings.temperature = temperature
     settings.top_k = top_k
     settings.top_p = top_p
-    settings.max_tokens = min(max_tokens, 8192)  # ensure max_tokens does not exceed n_ctx
+    settings.max_tokens = min(max_tokens, 8192)
     settings.repeat_penalty = repeat_penalty
     settings.stream = True
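Capping max_tokens at 8192 keeps the request inside n_ctx, although the plain min() ignores however many tokens the prompt already occupies. A stricter, hypothetical variant (not in this commit) would subtract the prompt length:

def effective_max_tokens(requested, prompt_tokens, n_ctx=8192):
    # leave room for the prompt inside the context window, never below 1
    return max(1, min(requested, n_ctx - prompt_tokens))

print(effective_max_tokens(8192, prompt_tokens=500))  # 7692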
 
@@ -121,6 +117,7 @@ def respond(
     print(f"Latency: {latency} seconds")
     print(f"Speed: {speed} tokens/second")
 
+
 description = """<p><center>
 <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
 Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
@@ -141,7 +138,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet"
141
  code_background_fill_dark="#292733",
142
  )) as demo:
143
 
144
- chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages') # 修改 chatbot 类型
145
  message = gr.Textbox(label="Your message")
146
  model_dropdown = gr.Dropdown(
147
  ["openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"],
 