MohamedRashad
committed on
Commit
·
0c0efc4
1
Parent(s):
e4cac44
Add refresh functionality to update dataframe
Browse files
app.py
CHANGED
@@ -17,6 +17,7 @@ initial_list_of_models = [
|
|
17 |
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
|
18 |
|
19 |
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
|
|
20 |
if dataframe_path.exists():
|
21 |
df = pd.read_json(dataframe_path, lines=True)
|
22 |
else:
|
@@ -161,8 +162,14 @@ def tokenize_text(text, chosen_model, better_tokenization=False):
|
|
161 |
|
162 |
return gr.HighlightedText(output, color_map)
|
163 |
|
|
|
|
|
|
|
|
|
|
|
164 |
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
|
165 |
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.
|
|
|
166 |
A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
|
167 |
"""
|
168 |
|
@@ -188,7 +195,9 @@ with gr.Blocks() as demo:
|
|
188 |
model_name = gr.Textbox(
|
189 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
190 |
)
|
191 |
-
|
|
|
|
|
192 |
with gr.Tab(label="Try tokenizers"):
|
193 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
194 |
dropdown = gr.Dropdown(
|
@@ -202,6 +211,7 @@ with gr.Blocks() as demo:
|
|
202 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
203 |
|
204 |
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
|
|
205 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
|
206 |
|
207 |
|
|
|
17 |
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
|
18 |
|
19 |
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
|
20 |
+
|
21 |
if dataframe_path.exists():
|
22 |
df = pd.read_json(dataframe_path, lines=True)
|
23 |
else:
|
|
|
162 |
|
163 |
return gr.HighlightedText(output, color_map)
|
164 |
|
165 |
+
def refresh():
    """Reload the leaderboard from disk and rebuild the components that show it.

    Re-reads the JSON-lines file at ``dataframe_path`` into the module-level
    ``df`` and returns fresh Gradio components built from it.

    Returns:
        A ``(gr.Dataframe, gr.BarPlot, gr.Dropdown)`` tuple; the dropdown's
        choices are the values of the ``"📛 Models"`` column of ``df``.
    """
    global df
    # The startup code only reads the file when it exists; mirror that guard so
    # a refresh before the file exists keeps the in-memory frame rather than
    # raising inside the click handler.
    if dataframe_path.exists():
        df = pd.read_json(dataframe_path, lines=True)
    model_names = df["📛 Models"].tolist()
    return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=model_names)
|
169 |
+
|
170 |
leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens summed on the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset.
|
171 |
This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner.
|
172 |
+
|
173 |
A tokenizer that scores high in this leaderboard will be efficient in parsing Arabic in its different dialects and forms.
|
174 |
"""
|
175 |
|
|
|
195 |
model_name = gr.Textbox(
|
196 |
label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
|
197 |
)
|
198 |
+
with gr.Row():
|
199 |
+
submit_new_model_btn = gr.Button(value="Submit New Model", variant="primary", scale=3)
|
200 |
+
refresh_btn = gr.Button(value="Refresh", variant="secondary", scale=1)
|
201 |
with gr.Tab(label="Try tokenizers"):
|
202 |
text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
|
203 |
dropdown = gr.Dropdown(
|
|
|
211 |
tokenized_textbox = gr.HighlightedText(label="Tokenized text")
|
212 |
|
213 |
submit_new_model_btn.click(submit, model_name, outputs=[dataframe, barplot, dropdown])
|
214 |
+
refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
|
215 |
submit_text_btn.click(tokenize_text, inputs=[text, dropdown, checkbox], outputs=[tokenized_textbox])
|
216 |
|
217 |
|