Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -12,17 +12,33 @@ def tokenize(input_text):
|
|
12 |
mistral_tokens = len(
|
13 |
mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
14 |
)
|
15 |
-
gpt2_tokens = len(
|
|
|
|
|
16 |
gpt_neox_tokens = len(
|
17 |
gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
18 |
)
|
19 |
falcon_tokens = len(
|
20 |
falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
21 |
)
|
22 |
-
phi2_tokens = len(
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
results = {
|
28 |
"LLaMa-1/LLaMa-2": llama_tokens,
|
@@ -31,10 +47,12 @@ def tokenize(input_text):
|
|
31 |
"GPT-2/GPT-J": gpt2_tokens,
|
32 |
"GPT-NeoX": gpt_neox_tokens,
|
33 |
"Falcon": falcon_tokens,
|
34 |
-
"Phi": phi2_tokens,
|
35 |
"T5": t5_tokens,
|
36 |
"Gemma": gemma_tokens,
|
37 |
-
"Command-R": command_r_tokens
|
|
|
|
|
38 |
}
|
39 |
|
40 |
# Sort the results in descending order based on token length
|
@@ -44,16 +62,44 @@ def tokenize(input_text):
|
|
44 |
|
45 |
|
46 |
if __name__ == "__main__":
|
47 |
-
llama_tokenizer = AutoTokenizer.from_pretrained(
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
iface.launch()
|
|
|
12 |
mistral_tokens = len(
|
13 |
mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
14 |
)
|
15 |
+
gpt2_tokens = len(
|
16 |
+
gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
17 |
+
)
|
18 |
gpt_neox_tokens = len(
|
19 |
gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
20 |
)
|
21 |
falcon_tokens = len(
|
22 |
falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
23 |
)
|
24 |
+
phi2_tokens = len(
|
25 |
+
phi2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
26 |
+
)
|
27 |
+
t5_tokens = len(
|
28 |
+
t5_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
29 |
+
)
|
30 |
+
gemma_tokens = len(
|
31 |
+
gemma_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
32 |
+
)
|
33 |
+
command_r_tokens = len(
|
34 |
+
command_r_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
35 |
+
)
|
36 |
+
qwen_tokens = len(
|
37 |
+
qwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
38 |
+
)
|
39 |
+
codeqwen_tokens = len(
|
40 |
+
codeqwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
|
41 |
+
)
|
42 |
|
43 |
results = {
|
44 |
"LLaMa-1/LLaMa-2": llama_tokens,
|
|
|
47 |
"GPT-2/GPT-J": gpt2_tokens,
|
48 |
"GPT-NeoX": gpt_neox_tokens,
|
49 |
"Falcon": falcon_tokens,
|
50 |
+
"Phi-1/Phi-2": phi2_tokens,
|
51 |
"T5": t5_tokens,
|
52 |
"Gemma": gemma_tokens,
|
53 |
+
"Command-R": command_r_tokens,
|
54 |
+
"Qwen/Qwen1.5": qwen_tokens,
|
55 |
+
"CodeQwen": codeqwen_tokens,
|
56 |
}
|
57 |
|
58 |
# Sort the results in descending order based on token length
|
|
|
62 |
|
63 |
|
64 |
if __name__ == "__main__":
|
65 |
+
llama_tokenizer = AutoTokenizer.from_pretrained(
|
66 |
+
"TheBloke/Llama-2-7B-fp16"
|
67 |
+
)
|
68 |
+
llama3_tokenizer = AutoTokenizer.from_pretrained(
|
69 |
+
"unsloth/llama-3-8b"
|
70 |
+
)
|
71 |
+
mistral_tokenizer = AutoTokenizer.from_pretrained(
|
72 |
+
"mistral-community/Mistral-7B-v0.2"
|
73 |
+
)
|
74 |
+
gpt2_tokenizer = AutoTokenizer.from_pretrained(
|
75 |
+
"gpt2"
|
76 |
+
)
|
77 |
+
gpt_neox_tokenizer = AutoTokenizer.from_pretrained(
|
78 |
+
"EleutherAI/gpt-neox-20b"
|
79 |
+
)
|
80 |
+
falcon_tokenizer = AutoTokenizer.from_pretrained(
|
81 |
+
"tiiuae/falcon-7b"
|
82 |
+
)
|
83 |
+
phi2_tokenizer = AutoTokenizer.from_pretrained(
|
84 |
+
"microsoft/phi-2"
|
85 |
+
)
|
86 |
+
t5_tokenizer = AutoTokenizer.from_pretrained(
|
87 |
+
"google/flan-t5-xxl"
|
88 |
+
)
|
89 |
+
gemma_tokenizer = AutoTokenizer.from_pretrained(
|
90 |
+
"alpindale/gemma-2b"
|
91 |
+
)
|
92 |
+
command_r_tokenizer = AutoTokenizer.from_pretrained(
|
93 |
+
"CohereForAI/c4ai-command-r-plus"
|
94 |
+
)
|
95 |
+
qwen_tokenizer = AutoTokenizer.from_pretrained(
|
96 |
+
"Qwen/Qwen1.5-7B"
|
97 |
+
)
|
98 |
+
codeqwen_tokenizer = AutoTokenizer.from_pretrained(
|
99 |
+
"Qwen/CodeQwen1.5-7B"
|
100 |
+
)
|
101 |
+
|
102 |
+
iface = gr.Interface(
|
103 |
+
fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=12), outputs="text"
|
104 |
+
)
|
105 |
iface.launch()
|