HeshamHaroon committed on
Commit
80ccea0
1 Parent(s): cd02b60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -17
app.py CHANGED
@@ -3,28 +3,38 @@ import gradio as gr
3
  import aranizer
4
  from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
5
 
6
- # Load all available tokenizers
 
 
 
 
 
7
  tokenizers = {
8
- "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer(),
9
- "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer(),
10
- "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer(),
11
- "aranizer_sp32k": aranizer_sp32k.get_tokenizer(),
12
- "aranizer_sp50k": aranizer_sp50k.get_tokenizer(),
13
- "aranizer_sp64k": aranizer_sp64k.get_tokenizer(),
14
- "aranizer_sp86k": aranizer_sp86k.get_tokenizer(),
15
  }
16
 
17
- def compare_tokenizers(text):
18
- results = []
19
- for name, tokenizer in tokenizers.items():
20
- tokens = tokenizer.tokenize(text)
21
- encoded_output = tokenizer.encode(text, add_special_tokens=True)
22
- decoded_text = tokenizer.decode(encoded_output)
23
- results.append((name, tokens, encoded_output, decoded_text))
 
 
24
  return results
25
 
26
- # Define the Gradio interface components using correct syntax
27
- inputs_component = gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
 
 
 
28
  outputs_component = gr.Dataframe(headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"], label="Results")
29
 
30
  # Setting up the interface
 
3
  import aranizer
4
  from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
5
 
6
# Names of the AraNizer tokenizer variants offered in the UI dropdown.
tokenizer_options = [
    "aranizer_bpe50k",
    "aranizer_bpe64k",
    "aranizer_bpe86k",
    "aranizer_sp32k",
    "aranizer_sp50k",
    "aranizer_sp64k",
    "aranizer_sp86k",
]
11
+
12
# Map each tokenizer name to a zero-argument loader callable.  Loading is
# deferred until a tokenizer is actually selected in the UI, so the app
# starts without paying for all seven models up front.
tokenizers = dict(
    aranizer_bpe50k=aranizer_bpe50k.get_tokenizer,
    aranizer_bpe64k=aranizer_bpe64k.get_tokenizer,
    aranizer_bpe86k=aranizer_bpe86k.get_tokenizer,
    aranizer_sp32k=aranizer_sp32k.get_tokenizer,
    aranizer_sp50k=aranizer_sp50k.get_tokenizer,
    aranizer_sp64k=aranizer_sp64k.get_tokenizer,
    aranizer_sp86k=aranizer_sp86k.get_tokenizer,
)
21
 
22
from functools import lru_cache


@lru_cache(maxsize=None)
def _get_cached_tokenizer(tokenizer_name):
    """Load the tokenizer for *tokenizer_name* once and reuse it.

    The loaders in the module-level ``tokenizers`` mapping rebuild the
    tokenizer from scratch on every call, which is wasteful when the UI
    issues repeated requests for the same model; caching keeps one
    instance per name for the process lifetime.  The key space is the
    fixed set of tokenizer names, so an unbounded cache cannot grow past
    seven entries.
    """
    return tokenizers[tokenizer_name]()


def compare_tokenizers(tokenizer_name, text):
    """Tokenize, encode, and decode *text* with the selected tokenizer.

    Parameters:
        tokenizer_name: key into the module-level ``tokenizers`` mapping
            (one of the dropdown choices).
        text: the input string (expected to be Arabic, but any str works).

    Returns:
        A one-element list of ``(name, tokens, encoded ids, decoded text)``
        tuples, matching the columns of the Dataframe output component.

    Raises:
        KeyError: if ``tokenizer_name`` is not a known tokenizer.
    """
    tokenizer = _get_cached_tokenizer(tokenizer_name)
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    decoded_text = tokenizer.decode(encoded_output)
    # One row per call: the UI compares a single selected tokenizer at a time.
    return [(tokenizer_name, tokens, encoded_output, decoded_text)]
32
 
33
# UI inputs: a dropdown to pick the tokenizer plus a free-text box for the
# Arabic text to analyze.
tokenizer_dropdown = gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer")
text_input = gr.Textbox(lines=2, placeholder="Enter Arabic text here...", label="Input Text")
inputs_component = [tokenizer_dropdown, text_input]

# UI output: a table whose columns mirror the tuples compare_tokenizers returns.
outputs_component = gr.Dataframe(
    headers=["Tokenizer", "Tokens", "Encoded Output", "Decoded Text"],
    label="Results",
)
39
 
40
  # Setting up the interface