Cran-May committed on
Commit b6a6743 · verified · 1 Parent(s): e07a8ca

Update app.py

Files changed (1)
  1. app.py +24 -27
app.py CHANGED
@@ -10,20 +10,23 @@ from llama_cpp import Llama
 from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
+from llama_cpp_agent.chat_history.messages import Roles, ChatMessage
 import gradio as gr
 from huggingface_hub import hf_hub_download
 
 llm = None
 llm_model = None
 
-# Download the new model
+# Download the model (ensure the directory exists)
+model_dir = "./models"
+os.makedirs(model_dir, exist_ok=True)
 hf_hub_download(
     repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
     filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
-    local_dir="./models"
+    local_dir=model_dir
 )
 
+
 def get_messages_formatter_type(model_name):
     return MessagesFormatterType.LLAMA_3
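Review note on this hunk: the new code calls os.makedirs here (and os.cpu_count below), but no import os is added anywhere in the diff, so the import is presumably already present in app.py. A minimal, self-contained sketch of the download step under that assumption; hf_hub_download returns the local path of the fetched file, which could be handed straight to Llama(model_path=...):

import os
from huggingface_hub import hf_hub_download

model_dir = "./models"
os.makedirs(model_dir, exist_ok=True)  # create the target directory up front

# hf_hub_download returns the path of the downloaded (or cached) file
model_path = hf_hub_download(
    repo_id="Cran-May/openbuddy-llama3.2-3b-v23.2-131k-Q5_K_M-GGUF",
    filename="openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf",
    local_dir=model_dir,
)
print(model_path)  # e.g. models/openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf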
 
@@ -31,36 +34,29 @@ def chat_fn(message, history, model, system_message, max_tokens, temperature, to
     history_list = history or []
     response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)
 
-    full_response = ""
     for current_history in response_generator:
-        full_response = current_history[-1][1]  # grab the latest reply
-        yield current_history, history  # yield the current history and the history on each iteration
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    model,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    top_k,
-    repeat_penalty,
-):
+        messages = []
+        for user_msg, bot_msg in current_history:
+            messages.append(ChatMessage(role="user", content=user_msg))
+            messages.append(ChatMessage(role="assistant", content=bot_msg))
+
+        yield messages, history
+
+
+def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     global llm
     global llm_model
-
+
     chat_template = get_messages_formatter_type(model)
 
     if llm is None or llm_model != model:
         llm = Llama(
-            model_path=f"models/{model}",
+            model_path=f"{model_dir}/{model}",
             n_gpu_layers=0,
-            n_batch=4096,  # increase batch size to speed things up
-            n_ctx=8192,  # increase the context length to 8192
-            n_threads=2,  # use all available CPU cores
-            f16_kv=True,  # use FP16 to reduce memory usage
+            n_batch=4096,
+            n_ctx=8192,
+            n_threads=os.cpu_count(),
+            f16_kv=True,
         )
         llm_model = model
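The rewritten chat_fn flattens the (user, bot) tuple history into an alternating role/content list before yielding it. A minimal standalone sketch of that same pattern, with plain dicts standing in for llama_cpp_agent's ChatMessage class:

def tuples_to_messages(history):
    """Flatten [(user, bot), ...] pairs into role/content records."""
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    return messages

assert tuples_to_messages([("hi", "hello!")]) == [
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "hello!"},
]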
 
@@ -77,7 +73,7 @@ def respond(
     settings.temperature = temperature
     settings.top_k = top_k
     settings.top_p = top_p
-    settings.max_tokens = min(max_tokens, 8192)  # ensure max_tokens does not exceed n_ctx
+    settings.max_tokens = min(max_tokens, 8192)
     settings.repeat_penalty = repeat_penalty
     settings.stream = True
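Capping max_tokens at 8192 keeps the request inside n_ctx, although the plain min() ignores however many tokens the prompt already occupies. A stricter, hypothetical variant (not in this commit) would subtract the prompt length:

def effective_max_tokens(requested, prompt_tokens, n_ctx=8192):
    # leave room for the prompt inside the context window, never below 1
    return max(1, min(requested, n_ctx - prompt_tokens))

print(effective_max_tokens(8192, prompt_tokens=500))  # 7692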
 
@@ -121,6 +117,7 @@ def respond(
     print(f"Latency: {latency} seconds")
     print(f"Speed: {speed} tokens/second")
 
+
 description = """<p><center>
 <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
 Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
@@ -141,7 +138,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", secondary_hue="violet"
141
  code_background_fill_dark="#292733",
142
  )) as demo:
143
 
144
- chatbot = gr.Chatbot(scale=1, show_copy_button=True, type='messages') # 修改 chatbot 类型
145
  message = gr.Textbox(label="Your message")
146
  model_dropdown = gr.Dropdown(
147
  ["openbuddy-llama3.2-3b-v23.2-131k-q5_k_m-imat.gguf"],
 