zRzRzRzRzRzRzR ksuriuri committed on
Commit
dad3715
·
verified ·
1 Parent(s): d74db5f

Update tokenization_chatglm.py (#3)

Browse files

- Update tokenization_chatglm.py (3f7e063eb632a7f8f93a7eab49a8e76fc59e638c)


Co-authored-by: haoyuanhuang <ksuriuri@users.noreply.huggingface.co>

Files changed (1) hide show
  1. tokenization_chatglm.py +5 -3
tokenization_chatglm.py CHANGED
@@ -62,14 +62,16 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
62
  vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
-
66
- def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
67
  """
68
  Converts a sequence of tokens in a single string.
69
  """
70
  text = ""
71
  temp = b""
72
  for t in tokens:
 
 
73
  if isinstance(t, str):
74
  if temp:
75
  text += temp.decode("utf-8", errors="replace")
@@ -78,7 +80,7 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
78
  elif isinstance(t, bytes):
79
  temp += t
80
  else:
81
- raise TypeError("token should only be of type types or str")
82
  if temp:
83
  text += temp.decode("utf-8", errors="replace")
84
  return text
 
62
  vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
63
  vocab.update(self.added_tokens_encoder)
64
  return vocab
65
+
66
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
67
  """
68
  Converts a sequence of tokens in a single string.
69
  """
70
  text = ""
71
  temp = b""
72
  for t in tokens:
73
+ if isinstance(t, int):
74
+ t = chr(t)
75
  if isinstance(t, str):
76
  if temp:
77
  text += temp.decode("utf-8", errors="replace")
 
80
  elif isinstance(t, bytes):
81
  temp += t
82
  else:
83
+ raise TypeError("token should only be of type int, bytes or str")
84
  if temp:
85
  text += temp.decode("utf-8", errors="replace")
86
  return text