yangapku committed
Commit acdaf68
1 Parent(s): ff3a904

update tokenization and readme

README.md CHANGED
@@ -35,8 +35,6 @@ For more details about the open-source model of Qwen-7B, please refer to the [Gi
 * python 3.8及以上版本
 * pytorch 1.12及以上版本,推荐2.0及以上版本
 * 建议使用CUDA 11.4及以上(GPU用户、flash-attention用户等需考虑此选项)
-
-
 * python 3.8 and above
 * pytorch 1.12 and above, 2.0 and above are recommended
 * CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.)
@@ -58,6 +56,8 @@ In addition, it is recommended to install the `flash-attention` library for high
 ```bash
 git clone -b v1.0.8 https://github.com/Dao-AILab/flash-attention
 cd flash-attention && pip install .
+ # 下方安装可选,安装可能比较缓慢。
+ # Below are optional. Installing them might be slow.
 pip install csrc/layer_norm
 pip install csrc/rotary
 ```
@@ -73,8 +73,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig
 
 # Note: The default behavior now has injection attack prevention off.
- # To remove the strategy, you can add `allowed_special`, which accepts the string "all" or a `set` of special tokens.
- # For example: tokens = tokenizer(text, allowed_special="all")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
 
 # use bf16
@@ -114,6 +112,14 @@ print(response)
 
 For more information, please refer to our [Github repo](https://github.com/QwenLM/Qwen-7B) for more information.
 
+ ## Tokenizer
+
+ > 注:作为术语的“tokenization”在中文中尚无共识的概念对应,本文档采用英文表达以利说明。
+
+ 基于tiktoken的分词器有别于其他分词器,比如sentencepiece分词器。尤其在微调阶段,需要特别注意特殊token的使用。关于tokenizer的更多信息,以及微调时涉及的相关使用,请参阅[文档](https://github.com/QwenLM/Qwen-7B/blob/main/tokenization_note_zh.md)。
+
+ Our tokenizer based on tiktoken is different from other tokenizers, e.g., sentencepiece tokenizer. You need to pay attention to special tokens, especially in finetuning. For more detailed information on the tokenizer and related use in fine-tuning, please refer to the [documentation](https://github.com/QwenLM/Qwen-7B/blob/main/tokenization_note.md).
+
 ## 模型细节(Model)
 
 与Qwen-7B预训练模型相同,Qwen-7B-Chat模型规模基本情况如下所示
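As a quick companion to the new README section, the snippet below is a minimal sketch of loading the repo's tiktoken-based tokenizer and using the `errors` decode option this commit adds; the sample input text is arbitrary, everything else follows the diff above.

```python
from transformers import AutoTokenizer

# Load the tiktoken-based tokenizer that ships with this repo.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

ids = tokenizer("你好,Qwen")["input_ids"]
# After this commit, decode() accepts an optional `errors` argument and falls
# back to the tokenizer's configured error handling when it is omitted.
print(tokenizer.decode(ids, errors="replace"))
```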
modeling_qwen.py CHANGED
@@ -1021,6 +1021,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             context_length=len(context_tokens),
             chat_format=self.generation_config.chat_format,
             verbose=False,
+            errors='replace'
         )
 
         if append_history:
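Why `chat()` now decodes with `errors='replace'`: a byte-level BPE token boundary can fall in the middle of a multi-byte UTF-8 character, and strict decoding of such a prefix raises. A plain-Python illustration of the two error modes (not repo code):

```python
# "你好" is 6 bytes in UTF-8; cutting after 4 bytes splits the second character.
chunk = "你好".encode("utf-8")[:4]
print(chunk.decode("utf-8", errors="replace"))  # -> '你\ufffd' (U+FFFD marks the bad bytes)
# chunk.decode("utf-8", errors="strict") would raise UnicodeDecodeError instead.
```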
qwen_generation_utils.py CHANGED
@@ -198,8 +198,9 @@ def _decode_default(
     raw_text_len: int,
     verbose: bool = False,
     return_end_reason: bool = False,
+    errors: str='replace',
 ):
-    trim_decode_tokens = tokenizer.decode(tokens)[raw_text_len:]
+    trim_decode_tokens = tokenizer.decode(tokens, errors=errors)[raw_text_len:]
     if verbose:
         print("\nRaw Generate: ", trim_decode_tokens)
 
@@ -231,6 +232,7 @@ def _decode_chatml(
     context_length: int,
     verbose: bool = False,
     return_end_reason: bool = False,
+    errors: str='replace'
 ):
     end_reason = f"Gen length {len(tokens)}"
     eod_token_idx = context_length
@@ -239,9 +241,9 @@ def _decode_chatml(
             end_reason = f"Gen {tokenizer.decode([tokens[eod_token_idx]])!r}"
             break
 
-    trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx])[raw_text_len:]
+    trim_decode_tokens = tokenizer.decode(tokens[:eod_token_idx], errors=errors)[raw_text_len:]
     if verbose:
-        print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens)[raw_text_len:])
+        print("\nRaw Generate w/o EOD:", tokenizer.decode(tokens, errors=errors)[raw_text_len:])
         print("\nRaw Generate:", trim_decode_tokens)
         print("\nEnd Reason:", end_reason)
     for stop_word in stop_words:
@@ -264,6 +266,7 @@ def decode_tokens(
     chat_format: str,
     verbose: bool = False,
     return_end_reason: bool = False,
+    errors: str="replace",
 ) -> str:
     if torch.is_tensor(tokens):
         tokens = tokens.cpu().numpy().tolist()
@@ -278,6 +281,7 @@ def decode_tokens(
             context_length=context_length,
             verbose=verbose,
             return_end_reason=return_end_reason,
+            errors=errors,
         )
     elif chat_format == "raw":
         return _decode_default(
@@ -288,6 +292,7 @@ def decode_tokens(
             raw_text_len=raw_text_len,
             verbose=verbose,
             return_end_reason=return_end_reason,
+            errors=errors,
         )
     else:
         raise NotImplementedError(f"Unknown chat format {chat_format!r}")
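For readers following the `_decode_chatml` change: the function decodes only up to the first end-of-dialog token and then strips the prompt by its raw text length. Below is a toy, self-contained sketch of that trimming pattern; the stand-in vocabulary, token ids, and end token (99) are invented for illustration, whereas the real code uses the Qwen tokenizer.

```python
# Toy stand-in for the trimming logic in _decode_chatml.
tokens = [10, 11, 12, 99, 13]
vocab = {10: "Hi", 11: " there", 12: "!", 99: "<eod>", 13: "junk"}

def decode(ids):
    return "".join(vocab[i] for i in ids)

context_length = 1                               # the prompt occupied the first token
raw_text_len = len(decode(tokens[:context_length]))
eod_token_idx = context_length
for eod_token_idx in range(context_length, len(tokens)):
    if tokens[eod_token_idx] == 99:              # stop at the end-of-dialog token
        break

trim_decode_tokens = decode(tokens[:eod_token_idx])[raw_text_len:]
print(trim_decode_tokens)                        # -> " there!"
```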
tokenization_qwen.py CHANGED
@@ -35,7 +35,8 @@ SPECIAL_TOKENS = (
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    contents = open(tiktoken_bpe_file, "rb").read()
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
     return {
         base64.b64decode(token): int(rank)
         for token, rank in (line.split() for line in contents.splitlines() if line)
@@ -217,10 +218,11 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
+        errors: str = None,
         **kwargs,
     ) -> str:
         if isinstance(token_ids, int):
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=self.errors)
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
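For context on `_load_tiktoken_bpe`: the tiktoken BPE vocabulary file it reads stores one `<base64-encoded token bytes> <rank>` pair per line. The standalone sketch below applies the same parsing logic to two invented entries.

```python
import base64

# Two invented entries in the on-disk format: base64-encoded token bytes, then rank.
contents = b"SGVsbG8= 0\nIHdvcmxk 1\n"
ranks = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in contents.splitlines() if line)
}
print(ranks)  # {b'Hello': 0, b' world': 1}
```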