import gradio as gr
import aranizer
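# aranizer bundles Arabic BPE and SentencePiece tokenizers at several vocabulary sizes;
# each submodule below exposes a get_tokenizer() factory.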
from aranizer import aranizer_bpe50k, aranizer_bpe64k, aranizer_bpe86k, aranizer_sp32k, aranizer_sp50k, aranizer_sp64k, aranizer_sp86k
from transformers import AutoTokenizer
from huggingface_hub import login
import logging  # standard-library logging, used for the warning below
import os

# Retrieve your Hugging Face token from the environment variable
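# The token is needed to download gated checkpoints (e.g. meta-llama/Meta-Llama-3-8B and, possibly, the Cohere models below).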
HF_TOKEN = os.getenv('HF_TOKEN')

if HF_TOKEN:
    HF_TOKEN = HF_TOKEN.strip()  # Remove any leading or trailing whitespace/newlines
    login(token=HF_TOKEN)

# Pre-load tokenizers from the Hugging Face Hub via transformers
gpt_13b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-13B")
gpt_7b_tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/AceGPT-7B")
jais_13b_tokenizer = AutoTokenizer.from_pretrained("inception-mbzuai/jais-13b")
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# Try to load the gated Meta-Llama-3 tokenizer; it requires approved access on the Hub
try:
    meta_llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
except Exception as e:
    meta_llama_tokenizer = None
    logging.warning(f"Could not load meta-llama/Meta-Llama-3-8B tokenizer: {e}")

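# Note: these Cohere repos may also be gated on the Hub; if loading fails, they can be
# guarded with a try/except like the Meta-Llama tokenizer above.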
cohere_command_r_v01_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
cohere_command_r_plus_tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-plus")

# List of available tokenizers and a dictionary to load them
tokenizer_options = [
    "aranizer_bpe50k", "aranizer_bpe64k", "aranizer_bpe86k",
    "aranizer_sp32k", "aranizer_sp50k", "aranizer_sp64k", "aranizer_sp86k",
    "FreedomIntelligence/AceGPT-13B",  
    "FreedomIntelligence/AceGPT-7B",   
    "inception-mbzuai/jais-13b",
    "aubmindlab/bert-base-arabertv2",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus"
]

if meta_llama_tokenizer:
    tokenizer_options.append("meta-llama/Meta-Llama-3-8B")

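# Map each dropdown option to a zero-argument callable: aranizer tokenizers are built on
# demand via get_tokenizer, while the transformers tokenizers loaded above are wrapped in
# lambdas, so compare_tokenizers can fetch any of them the same way.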
tokenizers = {
    "aranizer_bpe50k": aranizer_bpe50k.get_tokenizer,
    "aranizer_bpe64k": aranizer_bpe64k.get_tokenizer,
    "aranizer_bpe86k": aranizer_bpe86k.get_tokenizer,
    "aranizer_sp32k": aranizer_sp32k.get_tokenizer,
    "aranizer_sp50k": aranizer_sp50k.get_tokenizer,
    "aranizer_sp64k": aranizer_sp64k.get_tokenizer,
    "aranizer_sp86k": aranizer_sp86k.get_tokenizer,
    "FreedomIntelligence/AceGPT-13B": lambda: gpt_13b_tokenizer,
    "FreedomIntelligence/AceGPT-7B": lambda: gpt_7b_tokenizer,
    "inception-mbzuai/jais-13b": lambda: jais_13b_tokenizer,
    "aubmindlab/bert-base-arabertv2": lambda: arabert_tokenizer,
    "CohereForAI/c4ai-command-r-v01": lambda: cohere_command_r_v01_tokenizer,
    "CohereForAI/c4ai-command-r-plus": lambda: cohere_command_r_plus_tokenizer
}

if meta_llama_tokenizer:
    tokenizers["meta-llama/Meta-Llama-3-8B"] = lambda: meta_llama_tokenizer

def compare_tokenizers(tokenizer_index, text):
    tokenizer_name = tokenizer_options[tokenizer_index]
    tokenizer = tokenizers[tokenizer_name]()
    tokens = tokenizer.tokenize(text)
    encoded_output = tokenizer.encode(text, add_special_tokens=True)
    decoded_text = tokenizer.decode(encoded_output, skip_special_tokens=True)

    # Decode any byte-level tokens so they display as readable strings
    tokens_display = [token.decode('utf-8', errors='replace') if isinstance(token, bytes) else token for token in tokens]

    # Prepare the results to be displayed in HTML format
    tokens_html = "".join([f"<span style='background-color:#f0f0f0; padding:4px; margin:2px; border-radius:3px; border:1px solid #ccc;'>{token}</span>" for token in tokens_display])
    encoded_html = "".join([f"<span style='background-color:#e0f7fa; padding:4px; margin:2px; border-radius:3px; border:1px solid #00acc1;'>{token}</span>" for token in encoded_output])
    decoded_html = f"<div style='background-color:#e8f5e9; padding:10px; border-radius:3px; border:1px solid #4caf50;'>{decoded_text}</div>"

    results_html = f"""
    <div style='font-family: Arial, sans-serif;'>
        <h3 style='color: #00796b;'>Tokenizer: {tokenizer_name}</h3>
        <p><strong>Tokens:</strong> {tokens_html}</p>
        <p><strong>Encoded:</strong> {encoded_html}</p>
        <p><strong>Decoded:</strong> {decoded_html}</p>
    </div>
    """
    return results_html
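# Quick sanity check outside the UI (commented out); index 0 corresponds to
# "aranizer_bpe50k" in tokenizer_options and assumes the aranizer package is installed.
# print(compare_tokenizers(0, "مرحبا بالعالم"))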

# Define the Gradio interface components with a dropdown for model selection
inputs_component = [
    gr.Dropdown(choices=tokenizer_options, label="Select Tokenizer", type="index"),
    gr.Textbox(lines=2, placeholder="اكتب النص هنا...", label="Input Text")
]

outputs_component = gr.HTML(label="Results")

# Setting up the interface
iface = gr.Interface(
    fn=compare_tokenizers,
    inputs=inputs_component,
    outputs=outputs_component,
    title="Arabic Tokenizer Arena",
    live=True
)

# Launching the Gradio app
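# Note: passing share=True to launch() creates a temporary public link when running
# locally; if the app is hosted on Hugging Face Spaces, the plain launch() below is enough.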
iface.launch()