Cran-May committed
Commit 447fdae · verified · 1 Parent(s): b6a6743

Update app.py

Files changed (1)
  1. app.py +10 -24
app.py CHANGED
@@ -26,7 +26,6 @@ hf_hub_download(
     local_dir=model_dir
 )
 
-
 def get_messages_formatter_type(model_name):
     return MessagesFormatterType.LLAMA_3
 
@@ -34,13 +33,8 @@ def chat_fn(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     history_list = history or []
     response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)
 
-    for current_history in response_generator:
-        messages = []
-        for user_msg, bot_msg in current_history:
-            messages.append(ChatMessage(role="user", content=user_msg))
-            messages.append(ChatMessage(role="assistant", content=bot_msg))
-
-        yield messages, history
+    for chat_history in response_generator:
+        yield chat_history.get_messages(), history  # yield the BasicChatHistory object directly
 
 
 def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
@@ -79,18 +73,12 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
 
     messages = BasicChatHistory()
 
-    for msn in history:
-        user = {
-            'role': Roles.user,
-            'content': msn[0]
-        }
-        assistant = {
-            'role': Roles.assistant,
-            'content': msn[1]
-        }
-        messages.add_message(user)
-        messages.add_message(assistant)
+    for user_msg, bot_msg in history:
+        messages.add_message(ChatMessage(role="user", content=user_msg))
+        messages.add_message(ChatMessage(role="assistant", content=bot_msg))
 
+    messages.add_message(ChatMessage(role="user", content=message))  # add the user's current message
+
     start_time = time.time()
     token_count = 0
 
@@ -103,13 +91,12 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     )
 
     outputs = ""
-    current_history = list(history)
-
+
     for output in stream:
         outputs += output
         token_count += len(output.split())
-        current_history = history + [(message, outputs)]
-        yield current_history
+        messages.add_message(ChatMessage(role="assistant", content=output))
+        yield messages
 
     end_time = time.time()
     latency = end_time - start_time
@@ -117,7 +104,6 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     print(f"Latency: {latency} seconds")
     print(f"Speed: {speed} tokens/second")
 
-
 description = """<p><center>
 <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
 Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
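For context, here is a minimal, dependency-free sketch of the streaming relay pattern this commit moves to: respond() replays prior turns into a history, appends the current user message, then yields a snapshot of that history after every streamed chunk, and chat_fn() forwards each snapshot to the UI layer. Everything below is a stand-in, not the committed code: fake_stream() replaces the real llama.cpp token stream, and plain dicts replace llama_cpp_agent's BasicChatHistory / ChatMessage objects. One deliberate difference: this sketch folds chunks into a single growing assistant message, whereas the committed loop calls add_message once per chunk.

def fake_stream():
    # Stand-in for the llama.cpp token stream.
    yield from ["Hel", "lo", ", ", "world", "!"]

def respond(message, history):
    # Replay prior (user, assistant) turns, mirroring the new loop in the diff.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})  # current user turn

    outputs = ""
    for output in fake_stream():
        outputs += output
        # Yield a full snapshot after every chunk so the caller can re-render;
        # the chunks are accumulated into one growing assistant message.
        yield messages + [{"role": "assistant", "content": outputs}]

def chat_fn(message, history):
    # Relay each snapshot from the generator to the caller (the UI layer).
    for snapshot in respond(message, history or []):
        yield snapshot

for snapshot in chat_fn("Hi there", []):
    print(snapshot[-1]["content"])  # prints the partial reply as it grows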
 
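The first hunk touches only the tail of the hf_hub_download(...) call, so for completeness here is a hedged sketch of what that download step plausibly looks like in full, using the model repo linked in the description. The filename and model_dir values are assumptions; the diff shows only the local_dir=model_dir argument.

from huggingface_hub import hf_hub_download

model_dir = "models"  # assumed value; only local_dir=model_dir appears in the diff

hf_hub_download(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",  # repo from the description link
    filename="llama-3.2-1b-instruct-q4_k_m.gguf",  # assumed GGUF filename
    local_dir=model_dir,
)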