Cran-May committed
Commit 447fdae · verified · 1 Parent(s): b6a6743

Update app.py

Files changed (1)
  1. app.py +10 -24
app.py CHANGED
@@ -26,7 +26,6 @@ hf_hub_download(
     local_dir=model_dir
 )
 
-
 def get_messages_formatter_type(model_name):
     return MessagesFormatterType.LLAMA_3
 
@@ -34,13 +33,8 @@ def chat_fn(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     history_list = history or []
     response_generator = respond(message, history_list, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty)
 
-    for current_history in response_generator:
-        messages = []
-        for user_msg, bot_msg in current_history:
-            messages.append(ChatMessage(role="user", content=user_msg))
-            messages.append(ChatMessage(role="assistant", content=bot_msg))
-
-        yield messages, history
+    for chat_history in response_generator:
+        yield chat_history.get_messages(), history  # yield the BasicChatHistory object directly
 
 
 def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
@@ -79,18 +73,12 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
 
     messages = BasicChatHistory()
 
-    for msn in history:
-        user = {
-            'role': Roles.user,
-            'content': msn[0]
-        }
-        assistant = {
-            'role': Roles.assistant,
-            'content': msn[1]
-        }
-        messages.add_message(user)
-        messages.add_message(assistant)
+    for user_msg, bot_msg in history:
+        messages.add_message(ChatMessage(role="user", content=user_msg))
+        messages.add_message(ChatMessage(role="assistant", content=bot_msg))
 
+    messages.add_message(ChatMessage(role="user", content=message))  # add the user's current message
+
     start_time = time.time()
     token_count = 0
 
@@ -103,13 +91,12 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     )
 
     outputs = ""
-    current_history = list(history)
-
+
     for output in stream:
         outputs += output
         token_count += len(output.split())
-        current_history = history + [(message, outputs)]
-        yield current_history
+        messages.add_message(ChatMessage(role="assistant", content=output))
+        yield messages
 
     end_time = time.time()
     latency = end_time - start_time
@@ -117,7 +104,6 @@ def respond(message, history, model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty):
     print(f"Latency: {latency} seconds")
     print(f"Speed: {speed} tokens/second")
 
-
 description = """<p><center>
 <a href="https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF" target="_blank">[Meta Llama 3.2 (1B)]</a>
 Meta Llama 3.2 (1B) is a multilingual large language model (LLM) optimized for conversational dialogue use cases, including agentic retrieval and summarization tasks. It outperforms many open-source and closed chat models on industry benchmarks, and is intended for commercial and research use in multiple languages.
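For context, here is a minimal, dependency-free sketch of the streaming relay pattern this commit moves to: respond() replays prior turns into a history, appends the current user message, then yields a snapshot of that history after every streamed chunk, and chat_fn() forwards each snapshot to the UI layer. Everything below is a stand-in, not the committed code: fake_stream() replaces the real llama.cpp token stream, and plain dicts replace llama_cpp_agent's BasicChatHistory / ChatMessage objects. One deliberate difference: this sketch folds chunks into a single growing assistant message, whereas the committed loop calls add_message once per chunk.

def fake_stream():
    # Stand-in for the llama.cpp token stream.
    yield from ["Hel", "lo", ", ", "world", "!"]

def respond(message, history):
    # Replay prior (user, assistant) turns, mirroring the new loop in the diff.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})  # current user turn

    outputs = ""
    for output in fake_stream():
        outputs += output
        # Yield a full snapshot after every chunk so the caller can re-render;
        # the chunks are accumulated into one growing assistant message.
        yield messages + [{"role": "assistant", "content": outputs}]

def chat_fn(message, history):
    # Relay each snapshot from the generator to the caller (the UI layer).
    for snapshot in respond(message, history or []):
        yield snapshot

for snapshot in chat_fn("Hi there", []):
    print(snapshot[-1]["content"])  # prints the partial reply as it grows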
 
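The first hunk touches only the tail of the hf_hub_download(...) call, so for completeness here is a hedged sketch of what that download step plausibly looks like in full, using the model repo linked in the description. The filename and model_dir values are assumptions; the diff shows only the local_dir=model_dir argument.

from huggingface_hub import hf_hub_download

model_dir = "models"  # assumed value; only local_dir=model_dir appears in the diff

hf_hub_download(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF",  # repo from the description link
    filename="llama-3.2-1b-instruct-q4_k_m.gguf",  # assumed GGUF filename
    local_dir=model_dir,
)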