yangapku committed
Commit acdaf68
1 Parent(s): ff3a904

update tokenization and readme

README.md CHANGED
@@ -35,8 +35,6 @@ For more details about the open-source model of Qwen-7B, please refer to the [Gi
 * python 3.8及以上版本
 * pytorch 1.12及以上版本,推荐2.0及以上版本
 * 建议使用CUDA 11.4及以上(GPU用户、flash-attention用户等需考虑此选项)
-
-
 * python 3.8 and above
 * pytorch 1.12 and above, 2.0 and above are recommended
 * CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.)
@@ -58,6 +56,8 @@ In addition, it is recommended to install the `flash-attention` library for high
 ```bash
 git clone -b v1.0.8 https://github.com/Dao-AILab/flash-attention
 cd flash-attention && pip install .
+ # 下方安装可选,安装可能比较缓慢。
+ # Below are optional. Installing them might be slow.
 pip install csrc/layer_norm
 pip install csrc/rotary
 ```
@@ -73,8 +73,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
 
 # Note: The default behavior now has injection attack prevention off.
- # To remove the strategy, you can add `allowed_special`, which accepts the string "all" or a `set` of special tokens.
- # For example: tokens = tokenizer(text, allowed_special="all")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
 
 # use bf16
@@ -114,6 +112,14 @@ print(response)
 
 For more information, please refer to our [Github repo](https://github.com/QwenLM/Qwen-7B) for more information.
 
+ ## Tokenizer
+
+ > 注:作为术语的“tokenization”在中文中尚无共识的概念对应,本文档采用英文表达以利说明。
+
+ 基于tiktoken的分词器有别于其他分词器,比如sentencepiece分词器。尤其在微调阶段,需要特别注意特殊token的使用。关于tokenizer的更多信息,以及微调时涉及的相关使用,请参阅[文档](https://github.com/QwenLM/Qwen-7B/blob/main/tokenization_note_zh.md)。
+
+ Our tokenizer based on tiktoken is different from other tokenizers, e.g., sentencepiece tokenizer. You need to pay attention to special tokens, especially in finetuning. For more detailed information on the tokenizer and related use in fine-tuning, please refer to the [documentation](https://github.com/QwenLM/Qwen-7B/blob/main/tokenization_note.md).
+
 ## 模型细节(Model)
 
 与Qwen-7B预训练模型相同,Qwen-7B-Chat模型规模基本情况如下所示
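As a quick companion to the new README section, the snippet below is a minimal sketch of loading the repo's tiktoken-based tokenizer and using the `errors` decode option this commit adds; the sample input text is arbitrary, everything else follows the diff above.

```python
from transformers import AutoTokenizer

# Load the tiktoken-based tokenizer that ships with this repo.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

ids = tokenizer("你好,Qwen")["input_ids"]
# After this commit, decode() accepts an optional `errors` argument and falls
# back to the tokenizer's configured error handling when it is omitted.
print(tokenizer.decode(ids, errors="replace"))
```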
modeling_qwen.py CHANGED
@@ -1021,6 +1021,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             context_length=len(context_tokens),
             chat_format=self.generation_config.chat_format,
             verbose=False,
+            errors='replace'
         )
 
         if append_history:
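Why `chat()` now decodes with `errors='replace'`: a byte-level BPE token boundary can fall in the middle of a multi-byte UTF-8 character, and strict decoding of such a prefix raises. A plain-Python illustration of the two error modes (not repo code):

```python
# "你好" is 6 bytes in UTF-8; cutting after 4 bytes splits the second character.
chunk = "你好".encode("utf-8")[:4]
print(chunk.decode("utf-8", errors="replace"))  # -> '你\ufffd' (U+FFFD marks the bad bytes)
# chunk.decode("utf-8", errors="strict") would raise UnicodeDecodeError instead.
```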
qwen_generation_utils.py CHANGED
@@ -198,8 +198,9 @@ def _decode_default(
     raw_text_len: int,
     verbose: bool = False,
     return_end_reason: bool = False,
+    errors: str='replace',
 ):
-    trim_decode_tokens = tokenizer.decode(tokens)[raw_text_len:]
+    trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
     if verbose:
         print("\nRaw Generate: ", trim_decode_tokens)
 
@@ -231,6 +232,7 @@ def _decode_chatml(
     context_length: int,
     verbose: bool = False,
     return_end_reason: bool = False,
+    errors: str='replace'
 ):
     end_reason = f"Gen length {len(tokens)}"
     eod_token_idx = context_length
@@ -239,9 +241,9 @@ def _decode_chatml(
             end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
             break
 
-    trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx])[raw_text_len:]
+    trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
     if verbose:
-        print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens)[raw_text_len:])
+        print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
         print("\nRaw Generate:", trim_decode_tokens)
         print("\nEnd Reason:", end_reason)
     for stop_word in stop_words:
@@ -264,6 +266,7 @@ def decode_tokens(
     chat_format: str,
     verbose: bool = False,
     return_end_reason: bool = False,
+    errors: str="replace",
 ) -> str:
     if torch.is_tensor(tokens):
         tokens = tokens.cpu().numpy().tolist()
@@ -278,6 +281,7 @@ def decode_tokens(
             context_length=context_length,
             verbose=verbose,
             return_end_reason=return_end_reason,
+            errors=errors,
         )
     elif chat_format == "raw":
         return _decode_default(
@@ -288,6 +292,7 @@ def decode_tokens(
             raw_text_len=raw_text_len,
             verbose=verbose,
             return_end_reason=return_end_reason,
+            errors=errors,
         )
     else:
         raise NotImplementedError(f"Unknown chat format {chat_format!r}")
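For readers following the `_decode_chatml` change: the function decodes only up to the first end-of-dialog token and then strips the prompt by its raw text length. Below is a toy, self-contained sketch of that trimming pattern; the stand-in vocabulary, token ids, and end token (99) are invented for illustration, whereas the real code uses the Qwen tokenizer.

```python
# Toy stand-in for the trimming logic in _decode_chatml.
tokens = [10, 11, 12, 99, 13]
vocab = {10: "Hi", 11: " there", 12: "!", 99: "<eod>", 13: "junk"}

def decode(ids):
    return "".join(vocab[i] for i in ids)

context_length = 1                               # the prompt occupied the first token
raw_text_len = len(decode(tokens[:context_length]))
eod_token_idx = context_length
for eod_token_idx in range(context_length, len(tokens)):
    if tokens[eod_token_idx] == 99:              # stop at the end-of-dialog token
        break

trim_decode_tokens = decode(tokens[:eod_token_idx])[raw_text_len:]
print(trim_decode_tokens)                        # -> " there!"
```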
tokenization_qwen.py CHANGED
@@ -35,7 +35,8 @@ SPECIAL_TOKENS = (
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    contents = open(tiktoken_bpe_file, "rb").read()
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
     return {
         base64.b64decode(token): int(rank)
         for token, rank in (line.split() for line in contents.splitlines() if line)
@@ -217,10 +218,11 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
+        errors: str = None,
         **kwargs,
     ) -> str:
         if isinstance(token_ids, int):
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=self.errors)
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
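For context on `_load_tiktoken_bpe`: the tiktoken BPE vocabulary file it reads stores one `<base64-encoded token bytes> <rank>` pair per line. The standalone sketch below applies the same parsing logic to two invented entries.

```python
import base64

# Two invented entries in the on-disk format: base64-encoded token bytes, then rank.
contents = b"SGVsbG8= 0\nIHdvcmxk 1\n"
ranks = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in contents.splitlines() if line)
}
print(ranks)  # {b'Hello': 0, b' world': 1}
```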