Update app.py

app.py CHANGED

@@ -10,19 +10,18 @@ from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
+from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
 from huggingface_hub import hf_hub_download

 llm = None
 llm_model = None

-# Download the model
-model_dir = "./models"
-os.makedirs(model_dir, exist_ok=True)
+# Download the new model
 hf_hub_download(
     repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
     filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
-    local_dir=
+    local_dir="./models"
 )

 def get_messages_formatter_type(model_name):
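
For reference, hf_hub_download returns the local path of the downloaded file, so local_dir="./models" lines up with the model_path=f"models/{model}" used later in the file. A minimal sketch with the same repo and filename (the printed path is illustrative):

import os
from huggingface_hub import hf_hub_download

model_file = "openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"
local_path = hf_hub_download(
    repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
    filename=model_file,
    local_dir="./models",
)
print(local_path)                              # e.g. ./models/openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf
print(os.path.exists(f"models/{model_file}"))  # True once the download has finished
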
@@ -32,24 +31,36 @@ def chat_fn(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     history_list = history or []
     response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)

-
-
-
-
-
+    full_response = ""
+    for current_history in response_generator:
+        full_response = current_history[-1][1]  # grab the latest reply
+        yield current_history, history  # yield the current history and the original history on each iteration
+
+
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    model,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repeat_penalty,
+):
     global llm
     global llm_model
-
+
     chat_template = get_messages_formatter_type(model)

     if llm is None or llm_model != model:
         llm = Llama(
-            model_path=f"
+            model_path=f"models/{model}",
             n_gpu_layers=0,
-            n_batch=4096,
-            n_ctx=8192,
-            n_threads=
-            f16_kv=True,
+            n_batch=4096,  # larger batch size to improve speed
+            n_ctx=8192,    # increase the context length to 8192
+            n_threads=2,   # limit inference to 2 CPU threads
+            f16_kv=True,   # use FP16 for the KV cache to reduce memory usage
         )
         llm_model = model

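
As a rough guide to what the Llama(...) initialisation above does on its own, here is a hedged, standalone sketch outside the agent wrapper; the parameter values mirror the diff, and whether f16_kv is still accepted depends on the installed llama-cpp-python version:

from llama_cpp import Llama

llm = Llama(
    model_path="models/openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
    n_gpu_layers=0,   # CPU-only inference
    n_batch=4096,     # prompt-processing batch size
    n_ctx=8192,       # context window in tokens
    n_threads=2,      # CPU threads used for generation
    f16_kv=True,      # FP16 KV cache to reduce memory use (version-dependent)
)

# Stream a short completion; each chunk carries a text fragment.
for chunk in llm("User: Hello!\nAssistant:", max_tokens=64, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
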
@@ -66,18 +77,24 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     settings.temperature = temperature
     settings.top_k = top_k
     settings.top_p = top_p
-    settings.max_tokens = min(max_tokens, 8192)
+    settings.max_tokens = min(max_tokens, 8192)  # make sure max_tokens does not exceed n_ctx
     settings.repeat_penalty = repeat_penalty
    settings.stream = True

     messages = BasicChatHistory()

-    for
-
-
+    for msn in history:
+        user = {
+            'role': Roles.user,
+            'content': msn[0]
+        }
+        assistant = {
+            'role': Roles.assistant,
+            'content': msn[1]
+        }
+        messages.add_message(user)
+        messages.add_message(assistant)

-    messages.add_message({"role": "user", "content": message})
-
     start_time = time.time()
     token_count = 0

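
The settings object configured here presumably comes from the provider earlier in the file (that code is outside this hunk); a hedged sketch of the usual llama-cpp-agent pattern, with placeholder sampling values:

from llama_cpp import Llama
from llama_cpp_agent.providers import LlamaCppPythonProvider

llm = Llama(model_path="models/openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf", n_ctx=8192)
provider = LlamaCppPythonProvider(llm)

settings = provider.get_provider_default_settings()
settings.temperature = 0.7
settings.top_k = 40
settings.top_p = 0.95
settings.max_tokens = min(2048, 8192)  # clamp to the 8192-token context
settings.repeat_penalty = 1.1
settings.stream = True
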
@@ -90,12 +107,13 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     )

     outputs = ""
-
+    current_history = list(history)
+
     for output in stream:
         outputs += output
         token_count += len(output.split())
-
-        yield
+        current_history = history + [(message, outputs)]
+        yield current_history

     end_time = time.time()
     latency = end_time - start_time
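
The start_time/token_count bookkeeping, together with the latency computed here, presumably feeds a throughput figure later in the function; note that len(output.split()) counts whitespace-separated chunks, so the number is approximate. A small helper sketch of that arithmetic:

import time

def approx_tokens_per_second(token_count: int, start_time: float) -> float:
    """Rough tokens/sec from a whitespace-based token count and a start timestamp."""
    latency = time.time() - start_time
    return token_count / latency if latency > 0 else 0.0

start = time.time()
# ... generation happens here ...
print(f"~{approx_tokens_per_second(42, start):.2f} tok/s")
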
@@ -123,7 +141,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet"
     code_background_fill_dark="#292733",
 )) as demo:

-    chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages')
+    chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages')  # change the chatbot type
     message = gr.Textbox(label="Your message")
     model_dropdown = gr.Dropdown(
         ["openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"],
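
With type='messages', Gradio expects openai-style role/content dicts rather than (user, assistant) tuples, so any history pushed into this component needs that shape. A hedged sketch of the expected format:

import gradio as gr

history = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi! How can I help?"},
]

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(value=history, type='messages', show_copy_button=True)

demo.launch()
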