Jordan Legg committed on
Commit
b39e76c
1 Parent(s): 5b879f4

unified the approach to not rely on HF models, just input text.

Browse files
Files changed (1) hide show
  1. app.py +13 -45
app.py CHANGED
@@ -1,61 +1,29 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer
3
- import json
4
- from huggingface_hub import hf_hub_download
5
 
6
- def get_tokenizer_names(model_name):
7
- try:
8
- # First attempt: Try to get names from model_index.json
9
- model_info_path = hf_hub_download(model_name, filename="model_index.json")
10
- with open(model_info_path, "r") as f:
11
- model_info = json.load(f)
12
-
13
- # Extract tokenizer class names from the JSON
14
- tokenizer_1_class = model_info.get("tokenizer", ["", "Unknown"])[1]
15
- tokenizer_2_class = model_info.get("tokenizer_2", ["", "Unknown"])[1]
16
-
17
- return tokenizer_1_class, tokenizer_2_class
18
-
19
- except Exception:
20
- # Second attempt: Fall back to original method
21
- try:
22
- model_info = AutoTokenizer.from_pretrained(model_name, subfolder="tokenizer", _from_auto=True)
23
- config = model_info.init_kwargs
24
- return config.get('tokenizer_class', 'Unknown'), config.get('tokenizer_2_class', 'Unknown')
25
- except Exception:
26
- return "Unknown", "Unknown"
27
-
28
- def count_tokens(model_name, text):
29
- # Load the tokenizers from the specified model
30
- tokenizer_1 = AutoTokenizer.from_pretrained(f"{model_name}", subfolder="tokenizer")
31
- tokenizer_2 = AutoTokenizer.from_pretrained(f"{model_name}", subfolder="tokenizer_2")
32
-
33
- # Get tokenizer names
34
- tokenizer_1_name, tokenizer_2_name = get_tokenizer_names(model_name)
35
-
36
- # Tokenize the input text
37
- tokens_1 = tokenizer_1.tokenize(text)
38
- tokens_2 = tokenizer_2.tokenize(text)
39
 
40
- # Count the tokens
41
- count_1 = len(tokens_1)
42
- count_2 = len(tokens_2)
43
 
44
- return f"{tokenizer_1_name}: {count_1} tokens", f"{tokenizer_2_name}: {count_2} tokens"
45
 
46
  # Create a Gradio interface
47
  iface = gr.Interface(
48
  fn=count_tokens,
49
  inputs=[
50
- gr.Textbox(label="Model Name", placeholder="e.g., black-forest-labs/FLUX.1-dev"),
51
  gr.Textbox(label="Text", placeholder="Enter text here...")
52
  ],
53
  outputs=[
54
- gr.Textbox(label="Tokenizer 1"),
55
- gr.Textbox(label="Tokenizer 2")
56
  ],
57
- title="Token Counter",
58
- description="Enter a Hugging Face model name and text to count tokens using the model's tokenizers."
59
  )
60
 
61
  # Launch the app
 
1
  import gradio as gr
2
+ from transformers import T5TokenizerFast, CLIPTokenizer
 
 
3
 
4
def count_tokens(text):
    """Tokenize *text* with the T5 and CLIP tokenizers and report the counts.

    Parameters:
        text: the input string to tokenize.

    Returns:
        A pair of display strings, e.g. ``("T5: 5 tokens", "CLIP: 4 tokens")``.
        Counts come from ``tokenizer.encode`` and therefore include any
        special tokens the tokenizer adds.
    """
    # Lazily load the tokenizers ONCE and cache them on the function object.
    # from_pretrained downloads/parses tokenizer files — far too expensive
    # to repeat on every Gradio request (the original reloaded per call).
    if not hasattr(count_tokens, "_tokenizers"):
        count_tokens._tokenizers = (
            T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl", legacy=False),
            CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32"),
        )
    t5_tokenizer, clip_tokenizer = count_tokens._tokenizers

    # Count tokens directly via encode (includes special tokens).
    t5_count = len(t5_tokenizer.encode(text))
    clip_count = len(clip_tokenizer.encode(text))

    return f"T5: {t5_count} tokens", f"CLIP: {clip_count} tokens"
14
 
15
# Build the Gradio UI: a single text input feeding count_tokens,
# with one output box per tokenizer's count.
iface = gr.Interface(
    fn=count_tokens,
    inputs=[gr.Textbox(label="Text", placeholder="Enter text here...")],
    outputs=[
        gr.Textbox(label="T5 Tokenizer"),
        gr.Textbox(label="CLIP Tokenizer"),
    ],
    title="Common Diffusion Model Token Counter",
    description="Enter text to count tokens using T5 and CLIP tokenizers, commonly used in diffusion models.",
)
28
 
29
  # Launch the app