Add more tokenizers
Browse files
- vocab/__init__.py +13 -3
- vocab/internlm2_chat_7b/__init__.py +4 -0
- vocab/internlm2_math_7b/__init__.py +4 -0
- vocab/internlm_xcomposer_7b/__init__.py +4 -0
- vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/special_tokens_map.json +0 -0
- vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer.json +0 -0
- vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer.model +0 -0
- vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer_config.json +0 -0
- vocab/{mistral → mistral_7b}/README.md +0 -0
- vocab/{mistral → mistral_7b}/__init__.py +0 -0
- vocab/mixtral_8_7b/__init__.py +2 -0
- vocab/orion_14b_chat/__init__.py +4 -0
- vocab/phi_1/__init__.py +4 -0
- vocab/phi_2/__init__.py +4 -0
- vocab/solar_10_7b/__init__.py +4 -0
- vocab/yi_34b/__init__.py +4 -0
- vocab/yi_6b/__init__.py +3 -0
- vocab/yi_vl34b/__init__.py +9 -0
vocab/__init__.py
CHANGED
@@ -46,7 +46,7 @@ tokenizer.special_tokens_map
|
|
46 |
tokenizer.dependency [sentencepiece, tiktoken, icetk]
|
47 |
"""
|
48 |
|
49 |
-
Animal = Enum('Animal', 'ANT BEE CAT DOG')
|
50 |
|
51 |
uniq_tokenizers = [
|
52 |
""
|
@@ -95,6 +95,9 @@ all_tokenizers = [
|
|
95 |
"baichuan",
|
96 |
"baichuan2",
|
97 |
"internlm_chat_7b",
|
|
|
|
|
|
|
98 |
"falcon_7b",
|
99 |
"falcon_180b",
|
100 |
# "goat",
|
@@ -111,7 +114,8 @@ all_tokenizers = [
|
|
111 |
# 未分类
|
112 |
"skywork_13b_base",
|
113 |
"skywork_13b_math",
|
114 |
-
"
|
|
|
115 |
"t5_small",
|
116 |
"t5_base",
|
117 |
"t5_large",
|
@@ -119,6 +123,13 @@ all_tokenizers = [
|
|
119 |
"fastchat_t5_3b",
|
120 |
"pko_t5_large",
|
121 |
"wizardcoder_15b_v1",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
"wizardcoder_python_7b_v1",
|
123 |
"wizardlm_7b_v1",
|
124 |
"wizardmath_70b_v1",
|
@@ -128,7 +139,6 @@ all_tokenizers = [
|
|
128 |
"deepseek_llm_7b_base",
|
129 |
|
130 |
|
131 |
-
|
132 |
]
|
133 |
|
134 |
all_tokenizers = sorted(all_tokenizers)
|
|
|
46 |
tokenizer.dependency [sentencepiece, tiktoken, icetk]
|
47 |
"""
|
48 |
|
49 |
+
# Animal = Enum('Animal', 'ANT BEE CAT DOG')
|
50 |
|
51 |
uniq_tokenizers = [
|
52 |
""
|
|
|
95 |
"baichuan",
|
96 |
"baichuan2",
|
97 |
"internlm_chat_7b",
|
98 |
+
"internlm2_chat_7b",
|
99 |
+
"internlm2_math_7b",
|
100 |
+
"internlm_xcomposer_7b",
|
101 |
"falcon_7b",
|
102 |
"falcon_180b",
|
103 |
# "goat",
|
|
|
114 |
# 未分类
|
115 |
"skywork_13b_base",
|
116 |
"skywork_13b_math",
|
117 |
+
"mistral_7b",
|
118 |
+
"mixtral_8_7b",
|
119 |
"t5_small",
|
120 |
"t5_base",
|
121 |
"t5_large",
|
|
|
123 |
"fastchat_t5_3b",
|
124 |
"pko_t5_large",
|
125 |
"wizardcoder_15b_v1",
|
126 |
+
"yi_6b",
|
127 |
+
"yi_34b",
|
128 |
+
"yi_vl34b",
|
129 |
+
"orion_14b_chat",
|
130 |
+
"phi_1",
|
131 |
+
"phi_2",
|
132 |
+
"solar_10_7b",
|
133 |
"wizardcoder_python_7b_v1",
|
134 |
"wizardlm_7b_v1",
|
135 |
"wizardmath_70b_v1",
|
|
|
139 |
"deepseek_llm_7b_base",
|
140 |
|
141 |
|
|
|
142 |
]
|
143 |
|
144 |
all_tokenizers = sorted(all_tokenizers)
|
vocab/internlm2_chat_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
|
vocab/internlm2_math_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-math-7b", trust_remote_code=True)
|
vocab/internlm_xcomposer_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-xcomposer-7b", trust_remote_code=True)
|
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/special_tokens_map.json
RENAMED
File without changes
|
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer.json
RENAMED
File without changes
|
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer.model
RENAMED
File without changes
|
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer_config.json
RENAMED
File without changes
|
vocab/{mistral → mistral_7b}/README.md
RENAMED
File without changes
|
vocab/{mistral → mistral_7b}/__init__.py
RENAMED
File without changes
|
vocab/mixtral_8_7b/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer
|
2 |
+
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1", trust_remote_code=True)
|
vocab/orion_14b_chat/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("OrionStarAI/Orion-14B-Chat", trust_remote_code=True)
|
vocab/phi_1/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1", trust_remote_code=True)
|
vocab/phi_2/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
|
vocab/solar_10_7b/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-v1.0")
|
vocab/yi_34b/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-34B", trust_remote_code=True)
|
vocab/yi_6b/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-6B", trust_remote_code=True)
|
vocab/yi_vl34b/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
|
3 |
+
Yi-VL adopts the LLaVA architecture.
|
4 |
+
"""
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
from transformers import AutoTokenizer
|
9 |
+
tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-VL-34B", trust_remote_code=True)
|