xzuyn committed on
Commit
6475fdc
·
verified ·
1 Parent(s): 082f539

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -19
app.py CHANGED
@@ -12,17 +12,33 @@ def tokenize(input_text):
12
  mistral_tokens = len(
13
  mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"]
14
  )
15
- gpt2_tokens = len(gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"])
 
 
16
  gpt_neox_tokens = len(
17
  gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"]
18
  )
19
  falcon_tokens = len(
20
  falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"]
21
  )
22
- phi2_tokens = len(phi2_tokenizer(input_text, add_special_tokens=True)["input_ids"])
23
- t5_tokens = len(t5_tokenizer(input_text, add_special_tokens=True)["input_ids"])
24
- gemma_tokens = len(gemma_tokenizer(input_text, add_special_tokens=True)["input_ids"])
25
- command_r_tokens = len(command_r_tokenizer(input_text, add_special_tokens=True)["input_ids"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  results = {
28
  "LLaMa-1/LLaMa-2": llama_tokens,
@@ -31,10 +47,12 @@ def tokenize(input_text):
31
  "GPT-2/GPT-J": gpt2_tokens,
32
  "GPT-NeoX": gpt_neox_tokens,
33
  "Falcon": falcon_tokens,
34
- "Phi": phi2_tokens,
35
  "T5": t5_tokens,
36
  "Gemma": gemma_tokens,
37
- "Command-R": command_r_tokens
 
 
38
  }
39
 
40
  # Sort the results in descending order based on token length
@@ -44,16 +62,44 @@ def tokenize(input_text):
44
 
45
 
46
  if __name__ == "__main__":
47
- llama_tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-fp16")
48
- llama3_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b")
49
- mistral_tokenizer = AutoTokenizer.from_pretrained("unsloth/Mistral-7B-v0.2")
50
- gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
51
- gpt_neox_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
52
- falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
53
- phi2_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
54
- t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
55
- gemma_tokenizer = AutoTokenizer.from_pretrained("alpindale/gemma-2b")
56
- command_r_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
57
-
58
- iface = gr.Interface(fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=10), outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  iface.launch()
 
12
  mistral_tokens = len(
13
  mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"]
14
  )
15
+ gpt2_tokens = len(
16
+ gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
17
+ )
18
  gpt_neox_tokens = len(
19
  gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"]
20
  )
21
  falcon_tokens = len(
22
  falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"]
23
  )
24
+ phi2_tokens = len(
25
+ phi2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
26
+ )
27
+ t5_tokens = len(
28
+ t5_tokenizer(input_text, add_special_tokens=True)["input_ids"]
29
+ )
30
+ gemma_tokens = len(
31
+ gemma_tokenizer(input_text, add_special_tokens=True)["input_ids"]
32
+ )
33
+ command_r_tokens = len(
34
+ command_r_tokenizer(input_text, add_special_tokens=True)["input_ids"]
35
+ )
36
+ qwen_tokens = len(
37
+ qwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
38
+ )
39
+ codeqwen_tokens = len(
40
+ codeqwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
41
+ )
42
 
43
  results = {
44
  "LLaMa-1/LLaMa-2": llama_tokens,
 
47
  "GPT-2/GPT-J": gpt2_tokens,
48
  "GPT-NeoX": gpt_neox_tokens,
49
  "Falcon": falcon_tokens,
50
+ "Phi-1/Phi-2": phi2_tokens,
51
  "T5": t5_tokens,
52
  "Gemma": gemma_tokens,
53
+ "Command-R": command_r_tokens,
54
+ "Qwen/Qwen1.5": qwen_tokens,
55
+ "CodeQwen": codeqwen_tokens,
56
  }
57
 
58
  # Sort the results in descending order based on token length
 
62
 
63
 
64
  if __name__ == "__main__":
65
+ llama_tokenizer = AutoTokenizer.from_pretrained(
66
+ "TheBloke/Llama-2-7B-fp16"
67
+ )
68
+ llama3_tokenizer = AutoTokenizer.from_pretrained(
69
+ "unsloth/llama-3-8b"
70
+ )
71
+ mistral_tokenizer = AutoTokenizer.from_pretrained(
72
+ "mistral-community/Mistral-7B-v0.2"
73
+ )
74
+ gpt2_tokenizer = AutoTokenizer.from_pretrained(
75
+ "gpt2"
76
+ )
77
+ gpt_neox_tokenizer = AutoTokenizer.from_pretrained(
78
+ "EleutherAI/gpt-neox-20b"
79
+ )
80
+ falcon_tokenizer = AutoTokenizer.from_pretrained(
81
+ "tiiuae/falcon-7b"
82
+ )
83
+ phi2_tokenizer = AutoTokenizer.from_pretrained(
84
+ "microsoft/phi-2"
85
+ )
86
+ t5_tokenizer = AutoTokenizer.from_pretrained(
87
+ "google/flan-t5-xxl"
88
+ )
89
+ gemma_tokenizer = AutoTokenizer.from_pretrained(
90
+ "alpindale/gemma-2b"
91
+ )
92
+ command_r_tokenizer = AutoTokenizer.from_pretrained(
93
+ "CohereForAI/c4ai-command-r-plus"
94
+ )
95
+ qwen_tokenizer = AutoTokenizer.from_pretrained(
96
+ "Qwen/Qwen1.5-7B"
97
+ )
98
+ codeqwen_tokenizer = AutoTokenizer.from_pretrained(
99
+ "Qwen/CodeQwen1.5-7B"
100
+ )
101
+
102
+ iface = gr.Interface(
103
+ fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=12), outputs="text"
104
+ )
105
  iface.launch()