Omartificial-Intelligence-Space committed on
Commit
71be925
·
verified ·
1 Parent(s): 76d1dbc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -22
app.py CHANGED
@@ -1,22 +1,13 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer
3
 
4
- chart_html = gr.HTML(label="Token Frequency Chart")
5
-
6
- # Define a function to tokenize text and create visualization
7
  def tokenize_text(text, tokenizer_name):
8
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
9
- tokenized_text = tokenizer.tokenize(text)
10
- input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
11
- decoded_text = tokenizer.decode(input_ids)
12
-
13
- # Create visualization HTML
14
- chart_html = create_token_frequency_chart(tokenized_text)
15
-
16
- return (
17
- f"Tokenized Text: {tokenized_text}\nInput IDs: {input_ids}\nDecoded Text: {decoded_text}",
18
- chart_html,
19
- )
20
 
21
 
22
  # Define available tokenizers
@@ -40,14 +31,9 @@ iface = gr.Interface(
40
  gr.Textbox(label="Enter Text"),
41
  gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
42
  ],
43
- outputs=[
44
- gr.Textbox(label="Tokenized Text"),
45
- gr.Textbox(label="Input IDs"),
46
- gr.Textbox(label="Decoded Text"),
47
- gr.HTML(label="Token Frequency Chart"), # Include chart_html
48
- ]
49
  title="Kalemat: Explore Arabic Tokenizers",
50
- description="This interactive tool allows you to experiment with different Arabic tokenizers and see how they break down text into individual units. Try out various tokenizers and observe the tokenized form, input IDs, and decoded text to gain insights into the tokenization process",
51
  )
52
 
53
  # Launch the app
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer
3
 
4
+ # Define a function to tokenize text with a selected tokenizer
 
 
5
  def tokenize_text(text, tokenizer_name):
6
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
7
+ tokenized_text = tokenizer.tokenize(text)
8
+ input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
9
+ decoded_text = tokenizer.decode(input_ids) # Decode the input IDs
10
+ return f"Tokenized Text: {tokenized_text}\nInput IDs: {input_ids}\nDecoded Text: {decoded_text}"
 
 
 
 
 
 
 
11
 
12
 
13
  # Define available tokenizers
 
31
  gr.Textbox(label="Enter Text"),
32
  gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
33
  ],
34
+ outputs="text",
 
 
 
 
 
35
  title="Kalemat: Explore Arabic Tokenizers",
36
+ description="This interactive tool allows you to experiment with different Arabic tokenizers and see how they break down text into individual units. Try out various tokenizers and observe the tokenized form, input IDs, and decoded text to gain insights into the tokenization process.",
37
  )
38
 
39
  # Launch the app