helenai committed on
Commit
fbd4fd8
1 Parent(s): 2acbb98

Add more preset tokenizers

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -9,27 +9,35 @@ from datasets import load_dataset
9
  from PIL import Image
10
  from transformers import AutoTokenizer
11
 
 
12
  tokenizers = {
13
  "bert": "google-bert/bert-base-uncased",
 
14
  "bloom": "bigscience/bloom-560m",
15
- "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
16
  "chatglm3": "THUDM/chatglm3-6b",
17
  "falcon": "tiiuae/falcon-7b",
 
18
  "gpt-neox": "EleutherAI/gpt-neox-20b",
19
  "llama": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
20
  "magicoder": "ise-uiuc/Magicoder-S-DS-6.7B",
21
  "mistral": "mistralai/Mistral-7B-v0.1",
 
22
  "opt": "facebook/opt-2.7b",
23
  "phi-2": "microsoft/phi-2",
24
  "pythia": "EleutherAI/pythia-1.4b-deduped",
25
- "roberta": "FacebookAI/roberta-base",
26
  "qwen": "Qwen/Qwen1.5-7B-Chat",
 
 
27
  "starcoder": "bigcode/starcoder2-7b",
28
  "t5": "google-t5/t5-base",
 
 
29
  }
30
 
31
  tokenizers = list(tokenizers.values())
32
 
 
33
  def plot_histogram(data):
34
  plt.hist(data)
35
  plt.title("Histogram of number of tokens per dataset item")
@@ -98,7 +106,7 @@ demo = gr.Interface(
98
  ["mistralai/Mistral-7B-v0.1", "wikitext", "wikitext-2-v1", "validation", "text"],
99
  ["mistralai/Mistral-7B-v0.1", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
100
  ],
101
- cache_examples=True
102
  )
103
 
104
  demo.launch()
 
9
  from PIL import Image
10
  from transformers import AutoTokenizer
11
 
12
+
13
  tokenizers = {
14
  "bert": "google-bert/bert-base-uncased",
15
+ "blenderbot": "facebook/blenderbot-3B",
16
  "bloom": "bigscience/bloom-560m",
17
+ "bloomz": "bigscience/bloomz-7b1",
18
  "chatglm3": "THUDM/chatglm3-6b",
19
  "falcon": "tiiuae/falcon-7b",
20
+ "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
21
  "gpt-neox": "EleutherAI/gpt-neox-20b",
22
  "llama": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
23
  "magicoder": "ise-uiuc/Magicoder-S-DS-6.7B",
24
  "mistral": "mistralai/Mistral-7B-v0.1",
25
+ "mpt": "mosaicml/mpt-7b",
26
  "opt": "facebook/opt-2.7b",
27
  "phi-2": "microsoft/phi-2",
28
  "pythia": "EleutherAI/pythia-1.4b-deduped",
 
29
  "qwen": "Qwen/Qwen1.5-7B-Chat",
30
+ "redpajama": "togethercomputer/RedPajama-INCITE-Chat-3B-v1",
31
+ "roberta": "FacebookAI/roberta-base",
32
  "starcoder": "bigcode/starcoder2-7b",
33
  "t5": "google-t5/t5-base",
34
+ "vicuna": "lmsys/vicuna-7b-v1.5",
35
+ "zephyr": "HuggingFaceH4/zephyr-7b-beta",
36
  }
37
 
38
  tokenizers = list(tokenizers.values())
39
 
40
+
41
  def plot_histogram(data):
42
  plt.hist(data)
43
  plt.title("Histogram of number of tokens per dataset item")
 
106
  ["mistralai/Mistral-7B-v0.1", "wikitext", "wikitext-2-v1", "validation", "text"],
107
  ["mistralai/Mistral-7B-v0.1", "zeroshot/twitter-financial-news-sentiment", "", "validation", "text"],
108
  ],
109
+ cache_examples=True,
110
  )
111
 
112
  demo.launch()