duzx16 committed · Commit d493e51 · 1 Parent(s): dba7772

Update apply_chat_template

Files changed: tokenization_chatglm.py (+6 -6)

tokenization_chatglm.py CHANGED
@@ -63,22 +63,22 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
         """
         Converts a sequence of tokens in a single string.
         """
         text = ""
         temp = b""
         for t in tokens:
-            if isinstance(t, int):
-                t = chr(t)
             if isinstance(t, str):
                 if temp:
                     text += temp.decode("utf-8", errors="replace")
+                    temp = b""
+                text += t
             elif isinstance(t, bytes):
                 temp += t
             else:
-                raise TypeError("token should only be of type int, bytes or str")
+                raise TypeError("token should only be of type types or str")
         if temp:
             text += temp.decode("utf-8", errors="replace")
         return text
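Per the removed and added lines above, the patched convert_tokens_to_string no longer coerces int tokens via chr(t); it now clears the byte buffer after flushing it and appends str tokens explicitly, so pending bytes are decoded once and string tokens are kept. A minimal standalone sketch of that behaviour (the tokens_to_string name and the sample tokens are illustrative, not part of the repository):

from typing import List, Union


def tokens_to_string(tokens: List[Union[bytes, str]]) -> str:
    text = ""
    temp = b""
    for t in tokens:
        if isinstance(t, str):
            if temp:
                # Flush pending byte fragments before appending the string token,
                # then clear the buffer so the same bytes are not decoded twice.
                text += temp.decode("utf-8", errors="replace")
                temp = b""
            text += t
        elif isinstance(t, bytes):
            # Byte-level tokens are accumulated until the next string token.
            temp += t
        else:
            raise TypeError("token should only be of type bytes or str")
    if temp:
        text += temp.decode("utf-8", errors="replace")
    return text


# Mixed byte fragments and string tokens are merged in order:
print(tokens_to_string([b"\xe4\xbd", b"\xa0", "好", " world"]))  # -> "你好 world"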
@@ -168,7 +168,8 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
             for item in conversation:
                 if item.get("tools"):
                     tools = item["tools"]
-                    content = "你是一个名为
+                    content = "你是一个名为 GhatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
+                    content += "\n\n# 可用工具"
                     for tool in tools:
                         if tool["type"] == "function":
                             function = tool["function"]
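This hunk rewrites the tool-handling branch of apply_chat_template: the system prompt (roughly, "You are an AI assistant named GhatGLM, developed on the GLM-4 language model trained by Zhipu AI; your task is to provide appropriate answers and support for the user's questions and requests.") is now built in two statements, with the "\n\n# 可用工具" ("# Available tools") header appended separately before the per-tool sections. A hedged usage sketch follows; the checkpoint id "THUDM/glm-4-9b-chat", the get_weather tool, and the exact message shape are assumptions, only the fact that the override scans messages for a "tools" list is taken from the diff above.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",  # illustrative tool, not from the repository
            "description": "Look up the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

conversation = [
    {"role": "system", "tools": tools, "content": ""},  # message shape assumed from the item.get("tools") branch
    {"role": "user", "content": "What is the weather like in Beijing today?"},
]

# With tokenize=False the override returns the rendered prompt string, which now
# opens with the system prompt followed by the "# 可用工具" tools section.
prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
print(prompt)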
@@ -203,7 +204,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
                     input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
                 else:
                     input_message += "<|assistant|>"
-
             return input_ids if tokenize else input_message
 
         # Main logic to handle different conversation formats
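The last hunk only drops a blank line before the return, but it shows how the generation prompt is emitted: as the <|assistant|> token id when tokenize is true, or as the literal string otherwise. A hedged check of that behaviour, assuming the branch is gated by add_generation_prompt as in the standard apply_chat_template API, that the single-conversation path returns the plain id list shown in the hunk, and reusing the loading assumptions from the previous sketch:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat", trust_remote_code=True)  # checkpoint id assumed
conversation = [{"role": "user", "content": "Hello"}]

# tokenize=True returns the id list built by the override; tokenize=False returns the prompt string.
ids = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=True)
text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)

# Per the branch above, the generation prompt is the final element in both modes.
assert ids[-1] == tokenizer.convert_tokens_to_ids("<|assistant|>")
assert text.endswith("<|assistant|>")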