Cran-May committed
Commit 039937f · verified · 1 Parent(s): 0f4d06b

Update app.py

Files changed (1)
  1. app.py +43 -25
app.py CHANGED
@@ -10,19 +10,18 @@ from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
+from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
 from huggingface_hub import hf_hub_download

 llm = None
 llm_model = None

-# Download the model (ensure the directory exists)
-model_dir = "./models"
-os.makedirs(model_dir, exist_ok=True)
+# Download the new model
 hf_hub_download(
     repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
     filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
-    local_dir=model_dir
+    local_dir="./models"
 )

 def get_messages_formatter_type(model_name):
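
For reference, hf_hub_download creates local_dir if it does not exist and returns the resolved path of the downloaded file, so the explicit os.makedirs call removed above is not needed. A minimal sketch of capturing that path directly (the model_path name is illustrative, not part of this commit):

    from huggingface_hub import hf_hub_download

    # Returns the local path of the downloaded file; local_dir is created as needed.
    model_path = hf_hub_download(
        repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
        filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
        local_dir="./models",
    )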
@@ -32,24 +31,36 @@ def chat_fn(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     history_list = history or []
     response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)

-    for chat_history in response_generator:
-        yield chat_history.get_messages(), history
-
-
-def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
+    full_response = ""
+    for current_history in response_generator:
+        full_response = current_history[-1][1]  # grab the latest reply
+        yield current_history, history  # emit the current and base history on each iteration
+
+
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    model,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repeat_penalty,
+):
     global llm
     global llm_model
-
+
     chat_template = get_messages_formatter_type(model)

     if llm is None or llm_model != model:
         llm = Llama(
-            model_path=f"{model_dir}/{model}",
+            model_path=f"models/{model}",
             n_gpu_layers=0,
-            n_batch=4096,
-            n_ctx=8192,
-            n_threads=os.cpu_count(),
-            f16_kv=True,
+            n_batch=4096,  # larger batch size for speed
+            n_ctx=8192,  # context window of 8192 tokens
+            n_threads=2,  # limit to 2 CPU threads
+            f16_kv=True,  # FP16 KV cache to reduce memory use
         )
         llm_model = model

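
The branch above lazily (re)initializes the model: the Llama instance is rebuilt only when nothing is loaded yet or a different model was selected. A standalone sketch of the same caching pattern, assuming the same models/ layout (names are illustrative):

    from llama_cpp import Llama

    _llm = None
    _llm_model = None

    def get_llm(model: str) -> Llama:
        global _llm, _llm_model
        # Reload only when no model is loaded or the selection changed.
        if _llm is None or _llm_model != model:
            _llm = Llama(model_path=f"models/{model}", n_ctx=8192, n_gpu_layers=0)
            _llm_model = model
        return _llm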
@@ -66,18 +77,24 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     settings.temperature = temperature
     settings.top_k = top_k
     settings.top_p = top_p
-    settings.max_tokens = min(max_tokens, 8192)
+    settings.max_tokens = min(max_tokens, 8192)  # keep max_tokens within n_ctx
     settings.repeat_penalty = repeat_penalty
     settings.stream = True

     messages = BasicChatHistory()

-    for user_msg, bot_msg in history:
-        messages.add_message({"role": "user", "content": user_msg})
-        messages.add_message({"role": "assistant", "content": bot_msg})
+    for msn in history:
+        user = {
+            'role': Roles.user,
+            'content': msn[0]
+        }
+        assistant = {
+            'role': Roles.assistant,
+            'content': msn[1]
+        }
+        messages.add_message(user)
+        messages.add_message(assistant)

-    messages.add_message({"role": "user", "content": message})
-
     start_time = time.time()
     token_count = 0

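
The new loop converts Gradio's (user, assistant) tuples into llama_cpp_agent message dicts via the Roles enum. An equivalent, slightly more compact form using tuple unpacking (a sketch with the same behavior, not part of the commit):

    for user_msg, bot_msg in history:
        messages.add_message({'role': Roles.user, 'content': user_msg})
        messages.add_message({'role': Roles.assistant, 'content': bot_msg})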
@@ -90,12 +107,13 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     )

     outputs = ""
-
+    current_history = list(history)
+
     for output in stream:
         outputs += output
         token_count += len(output.split())
-        messages.add_message({"role": "assistant", "content": output})
-        yield messages
+        current_history = history + [(message, outputs)]
+        yield current_history

     end_time = time.time()
     latency = end_time - start_time
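
Note that token_count += len(output.split()) counts whitespace-separated words, not model tokens, so the measured throughput is approximate. If an exact figure were wanted, llama-cpp-python's tokenizer could be used after the stream completes; a hedged sketch reusing the llm and latency values above:

    # Approximate fix-up: tokenize the full response with the loaded model.
    exact_tokens = len(llm.tokenize(outputs.encode("utf-8")))
    tokens_per_second = exact_tokens / max(latency, 1e-6)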
@@ -123,7 +141,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet"
     code_background_fill_dark="#292733",
 )) as demo:

-    chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages')
+    chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages')  # messages-type chatbot
     message = gr.Textbox(label="Your message")
     model_dropdown = gr.Dropdown(
         ["openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"],
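
One caveat: gr.Chatbot(type='messages') renders OpenAI-style role/content dicts, while respond yields (user, assistant) tuples in Gradio's older tuples format, so the two formats are worth reconciling. For reference, the shape the messages type expects (illustrative values only):

    history_messages = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi, how can I help?"},
    ]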