Files changed (1) hide show
  1. app.py +39 -48
app.py CHANGED
@@ -1,23 +1,30 @@
1
  from transformers import AutoTokenizer
2
  import gradio as gr
3
 
 
4
  def formatarr(input):
5
  return "["+",".join(str(x) for x in input)+"]"
6
 
 
7
  def tokenize(input_text):
8
  llama_tokens = llama_tokenizer(input_text, add_special_tokens=True)["input_ids"]
9
  llama3_tokens = llama3_tokenizer(input_text, add_special_tokens=True)["input_ids"]
10
  mistral_tokens = mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"]
11
  gpt2_tokens = gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
12
- gpt_neox_tokens = gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"]
13
- falcon_tokens = falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"]
14
- phi2_tokens = phi2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
15
- phi3_tokens = phi3_tokenizer(input_text, add_special_tokens=True)["input_ids"]
16
- t5_tokens = t5_tokenizer(input_text, add_special_tokens=True)["input_ids"]
17
- gemma_tokens = gemma_tokenizer(input_text, add_special_tokens=True)["input_ids"]
18
- command_r_tokens = command_r_tokenizer(input_text, add_special_tokens=True)["input_ids"]
19
- qwen_tokens = qwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
20
  codeqwen_tokens = codeqwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
 
 
 
 
 
21
 
22
 
23
  results = {
@@ -34,6 +41,11 @@ def tokenize(input_text):
34
  "Command-R": command_r_tokens,
35
  "Qwen/Qwen1.5": qwen_tokens,
36
  "CodeQwen": codeqwen_tokens,
 
 
 
 
 
37
  }
38
 
39
  toks = ""
@@ -43,47 +55,26 @@ def tokenize(input_text):
43
 
44
 
45
  if __name__ == "__main__":
46
- llama_tokenizer = AutoTokenizer.from_pretrained(
47
- "TheBloke/Llama-2-7B-fp16"
48
- )
49
- llama3_tokenizer = AutoTokenizer.from_pretrained(
50
- "unsloth/llama-3-8b"
51
- )
52
- mistral_tokenizer = AutoTokenizer.from_pretrained(
53
- "mistral-community/Mistral-7B-v0.2"
54
- )
55
- gpt2_tokenizer = AutoTokenizer.from_pretrained(
56
- "gpt2"
57
- )
58
- gpt_neox_tokenizer = AutoTokenizer.from_pretrained(
59
- "EleutherAI/gpt-neox-20b"
60
- )
61
- falcon_tokenizer = AutoTokenizer.from_pretrained(
62
- "tiiuae/falcon-7b"
63
- )
64
- phi2_tokenizer = AutoTokenizer.from_pretrained(
65
- "microsoft/phi-2"
66
- )
67
- phi3_tokenizer = AutoTokenizer.from_pretrained(
68
- "microsoft/Phi-3-mini-4k-instruct"
69
- )
70
- t5_tokenizer = AutoTokenizer.from_pretrained(
71
- "google/flan-t5-xxl"
72
- )
73
- gemma_tokenizer = AutoTokenizer.from_pretrained(
74
- "alpindale/gemma-2b"
75
- )
76
- command_r_tokenizer = AutoTokenizer.from_pretrained(
77
- "CohereForAI/c4ai-command-r-plus"
78
- )
79
- qwen_tokenizer = AutoTokenizer.from_pretrained(
80
- "Qwen/Qwen1.5-7B"
81
- )
82
- codeqwen_tokenizer = AutoTokenizer.from_pretrained(
83
- "Qwen/CodeQwen1.5-7B"
84
- )
85
 
86
  iface = gr.Interface(
87
- fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=12), outputs="text"
88
  )
89
  iface.launch()
 
1
  from transformers import AutoTokenizer
2
  import gradio as gr
3
 
4
+
5
  def formatarr(input):
6
  return "["+",".join(str(x) for x in input)+"]"
7
 
8
+
9
  def tokenize(input_text):
10
  llama_tokens = llama_tokenizer(input_text, add_special_tokens=True)["input_ids"]
11
  llama3_tokens = llama3_tokenizer(input_text, add_special_tokens=True)["input_ids"]
12
  mistral_tokens = mistral_tokenizer(input_text, add_special_tokens=True)["input_ids"]
13
  gpt2_tokens = gpt2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
14
+ gpt_neox_tokens = gpt_neox_tokenizer(input_text, add_special_tokens=True)["input_ids"]
15
+ falcon_tokens = falcon_tokenizer(input_text, add_special_tokens=True)["input_ids"]
16
+ phi2_tokens = phi2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
17
+ phi3_tokens = phi3_tokenizer(input_text, add_special_tokens=True)["input_ids"]
18
+ t5_tokens = t5_tokenizer(input_text, add_special_tokens=True)["input_ids"]
19
+ gemma_tokens = gemma_tokenizer(input_text, add_special_tokens=True)["input_ids"]
20
+ command_r_tokens = command_r_tokenizer(input_text, add_special_tokens=True)["input_ids"]
21
+ qwen_tokens = qwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
22
  codeqwen_tokens = codeqwen_tokenizer(input_text, add_special_tokens=True)["input_ids"]
23
+ rwkv4_tokens = rwkv4_tokenizer(input_text, add_special_tokens=True)["input_ids"]
24
+ rwkv5_tokens = rwkv5_tokenizer(input_text, add_special_tokens=True)["input_ids"]
25
+ deepseek_tokens = deepseek_tokenizer(input_text, add_special_tokens=True)["input_ids"]
26
+ internlm_tokens = internlm_tokenizer(input_text, add_special_tokens=True)["input_ids"]
27
+ internlm2_tokens = internlm2_tokenizer(input_text, add_special_tokens=True)["input_ids"]
28
 
29
 
30
  results = {
 
41
  "Command-R": command_r_tokens,
42
  "Qwen/Qwen1.5": qwen_tokens,
43
  "CodeQwen": codeqwen_tokens,
44
+ "RWKV-v4": rwkv4_tokens,
45
+ "RWKV-v5/RWKV-v6": rwkv5_tokens,
46
+ "DeepSeek": deepseek_tokens,
47
+ "InternLM": internlm_tokens,
48
+ "InternLM2": internlm2_tokens
49
  }
50
 
51
  toks = ""
 
55
 
56
 
57
  if __name__ == "__main__":
58
+ llama_tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-7B-fp16")
59
+ llama3_tokenizer = AutoTokenizer.from_pretrained("unsloth/llama-3-8b")
60
+ mistral_tokenizer = AutoTokenizer.from_pretrained("mistral-community/Mistral-7B-v0.2")
61
+ gpt2_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
62
+ gpt_neox_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
63
+ falcon_tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
64
+ phi2_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
65
+ phi3_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
66
+ t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xxl")
67
+ gemma_tokenizer = AutoTokenizer.from_pretrained("alpindale/gemma-2b")
68
+ command_r_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")
69
+ qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B")
70
+ codeqwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/CodeQwen1.5-7B")
71
+ rwkv4_tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-14b-pile", trust_remote_code=True)
72
+ rwkv5_tokenizer = AutoTokenizer.from_pretrained("RWKV/v5-EagleX-v2-7B-HF", trust_remote_code=True)
73
+ deepseek_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V2", trust_remote_code=True)
74
+ internlm_tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-20b", trust_remote_code=True)
75
+ internlm2_tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-20b", trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  iface = gr.Interface(
78
+ fn=tokenize, inputs=gr.Textbox(label="Input Text", lines=19), outputs="text"
79
  )
80
  iface.launch()