zhuohan-7 commited on
Commit
e9e9e4c
1 Parent(s): 969c3ae

Upload folder using huggingface_hub

Browse files
app/__pycache__/draw_diagram.cpython-312.pyc CHANGED
Binary files a/app/__pycache__/draw_diagram.cpython-312.pyc and b/app/__pycache__/draw_diagram.cpython-312.pyc differ
 
app/__pycache__/pages.cpython-312.pyc CHANGED
Binary files a/app/__pycache__/pages.cpython-312.pyc and b/app/__pycache__/pages.cpython-312.pyc differ
 
app/draw_diagram.py CHANGED
@@ -2,77 +2,15 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  from streamlit_echarts import st_echarts
5
- # from streamlit_echarts import JsCode
6
- # from streamlit_javascript import st_javascript
7
- # from PIL import Image
8
-
9
- # links_dic = {"random": "https://seaeval.github.io/",
10
- # "meta_llama_3_8b": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",
11
- # "mistral_7b_instruct_v0_2": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2",
12
- # "sailor_0_5b": "https://huggingface.co/sail/Sailor-0.5B",
13
- # "sailor_1_8b": "https://huggingface.co/sail/Sailor-1.8B",
14
- # "sailor_4b": "https://huggingface.co/sail/Sailor-4B",
15
- # "sailor_7b": "https://huggingface.co/sail/Sailor-7B",
16
- # "sailor_0_5b_chat": "https://huggingface.co/sail/Sailor-0.5B-Chat",
17
- # "sailor_1_8b_chat": "https://huggingface.co/sail/Sailor-1.8B-Chat",
18
- # "sailor_4b_chat": "https://huggingface.co/sail/Sailor-4B-Chat",
19
- # "sailor_7b_chat": "https://huggingface.co/sail/Sailor-7B-Chat",
20
- # "sea_mistral_highest_acc_inst_7b": "https://seaeval.github.io/",
21
- # "meta_llama_3_8b_instruct": "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct",
22
- # "flan_t5_base": "https://huggingface.co/google/flan-t5-base",
23
- # "flan_t5_large": "https://huggingface.co/google/flan-t5-large",
24
- # "flan_t5_xl": "https://huggingface.co/google/flan-t5-xl",
25
- # "flan_t5_xxl": "https://huggingface.co/google/flan-t5-xxl",
26
- # "flan_ul2": "https://huggingface.co/google/flan-t5-ul2",
27
- # "flan_t5_small": "https://huggingface.co/google/flan-t5-small",
28
- # "mt0_xxl": "https://huggingface.co/bigscience/mt0-xxl",
29
- # "seallm_7b_v2": "https://huggingface.co/SeaLLMs/SeaLLM-7B-v2",
30
- # "gpt_35_turbo_1106": "https://openai.com/blog/chatgpt",
31
- # "meta_llama_3_70b": "https://huggingface.co/meta-llama/Meta-Llama-3-70B",
32
- # "meta_llama_3_70b_instruct": "https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct",
33
- # "sea_lion_3b": "https://huggingface.co/aisingapore/sea-lion-3b",
34
- # "sea_lion_7b": "https://huggingface.co/aisingapore/sea-lion-7b",
35
- # "qwen1_5_110b": "https://huggingface.co/Qwen/Qwen1.5-110B",
36
- # "qwen1_5_110b_chat": "https://huggingface.co/Qwen/Qwen1.5-110B-Chat",
37
- # "llama_2_7b_chat": "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf",
38
- # "gpt4_1106_preview": "https://openai.com/blog/chatgpt",
39
- # "gemma_2b": "https://huggingface.co/google/gemma-2b",
40
- # "gemma_7b": "https://huggingface.co/google/gemma-7b",
41
- # "gemma_2b_it": "https://huggingface.co/google/gemma-2b-it",
42
- # "gemma_7b_it": "https://huggingface.co/google/gemma-7b-it",
43
- # "qwen_1_5_7b": "https://huggingface.co/Qwen/Qwen1.5-7B",
44
- # "qwen_1_5_7b_chat": "https://huggingface.co/Qwen/Qwen1.5-7B-Chat",
45
- # "sea_lion_7b_instruct": "https://huggingface.co/aisingapore/sea-lion-7b-instruct",
46
- # "sea_lion_7b_instruct_research": "https://huggingface.co/aisingapore/sea-lion-7b-instruct-research",
47
- # "LLaMA_3_Merlion_8B": "https://seaeval.github.io/",
48
- # "LLaMA_3_Merlion_8B_v1_1": "https://seaeval.github.io/"}
49
-
50
- # links_dic = {k.lower().replace('_', '-') : v for k, v in links_dic.items()}
51
-
52
- # # huggingface_image = Image.open('style/huggingface.jpg')
53
-
54
- # def nav_to(value):
55
- # try:
56
- # url = links_dic[str(value).lower()]
57
- # js = f'window.open("{url}", "_blank").then(r => window.parent.location.href);'
58
- # st_javascript(js)
59
- # except:
60
- # pass
61
-
62
- # # nav_script = """
63
- # # <meta http-equiv="refresh" content="0; url='%s'">
64
- # # """ % (url)
65
- # # st.write(nav_script, unsafe_allow_html=True)
66
-
67
- # def highlight_table_line(model_name):
68
-
69
- # st.write(model_name)
70
-
71
 
72
- def draw_cross_lingual(category_one, category_two, sort, sorted):
 
73
 
74
- folder = "./results/cross_lingual/"
75
- subtitle = ''
 
76
  data_path = f'{folder}/{category_one}/{category_two}.csv'
77
  chart_data = pd.read_csv(data_path).dropna(axis='columns').round(3)
78
  st.markdown("""
@@ -86,386 +24,55 @@ def draw_cross_lingual(category_one, category_two, sort, sorted):
86
  }
87
  </style>
88
  """, unsafe_allow_html=True)
89
- models = st.multiselect("Please choose the models", chart_data['Model'].tolist(), default = chart_data['Model'].tolist())
90
- chart_data = chart_data[chart_data['Model'].isin(models)]
91
-
92
- if sorted == 'Ascending':
93
- ascend = True
94
- else:
95
- ascend = False
96
-
97
- chart_data = chart_data.sort_values(by=[sort], ascending=ascend)
98
-
99
- min_value = round(chart_data.iloc[:, 1::].min().min() - 0.1, 1)
100
- max_value = round(chart_data.iloc[:, 1::].max().max() + 0.1, 1)
101
-
102
-
103
-
104
- if category_two in ['cross_mmlu', 'cross_logiqa']:
105
- # print(category_two)
106
-
107
- if category_two == 'cross_mmlu':
108
- subtitle = 'Cross-MMLU'
109
-
110
- elif category_two == 'cross_logiqa':
111
- subtitle = 'Cross-LogiQA'
112
-
113
- options = {
114
- "title": {"text": f"{subtitle}"},
115
- "tooltip": {
116
- "trigger": "axis",
117
- "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
118
- "triggerOn": 'mousemove',
119
- },
120
- "legend": {"data": ['Overall Accuracy','Cross-Lingual Consistency', 'AC3',
121
- 'English', 'Chinese', 'Spanish', 'Vietnamese', 'Indonesian', 'Malay', 'Filipino']},
122
- "toolbox": {"feature": {"saveAsImage": {}}},
123
- "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
124
- "xAxis": [
125
- {
126
- "type": "category",
127
- "boundaryGap": True,
128
- "triggerEvent": True,
129
- "data": chart_data['Model'].tolist(),
130
- }
131
- ],
132
- "yAxis": [{"type": "value",
133
- "min": min_value,
134
- "max": max_value,
135
- "boundaryGap": True
136
- # "splitNumber": 10
137
- }],
138
- "series": [
139
- {
140
- "name": "Overall Accuracy",
141
- "type": "bar", # "line"
142
- "data": chart_data['Accuracy'].tolist(),
143
- },
144
- {
145
- "name": "Cross-Lingual Consistency",
146
- "type": "bar",
147
- "data": chart_data["Cross-Lingual Consistency"].tolist(),
148
- },
149
- {
150
- "name": "AC3",
151
- "type": "bar",
152
- "data": chart_data["AC3"].tolist(),
153
- },
154
- {
155
- "name": "English",
156
- "type": "bar",
157
- "data": chart_data["English"].tolist(),
158
- },
159
- {
160
- "name": "Chinese",
161
- "type": "bar",
162
- "data": chart_data["Chinese"].tolist(),
163
- },
164
- {
165
- "name": "Spanish",
166
- "type": "bar",
167
- "data": chart_data["Spanish"].tolist(),
168
- },
169
- {
170
- "name": "Vietnamese",
171
- "type": "bar",
172
- "data": chart_data["Vietnamese"].tolist(),
173
- },
174
- {
175
- "name": "Indonesian",
176
- "type": "bar",
177
- "data": chart_data["Indonesian"].tolist(),
178
- },
179
- {
180
- "name": "Malay",
181
- "type": "bar",
182
- "data": chart_data["Malay"].tolist(),
183
- },
184
- {
185
- "name": "Filipino",
186
- "type": "bar",
187
- "data": chart_data["Filipino"].tolist(),
188
- },
189
- ],
190
- }
191
-
192
- # events = {
193
- # "click": "function(params) { return params.value }",
194
- # # "dblclick": "function(params) { return params.value }"
195
- # }
196
-
197
- value = st_echarts(options=options, height="500px") #events=events,
198
-
199
-
200
- # if value != None:
201
- # # print(value)
202
- # nav_to(value)
203
-
204
- # if value != None:
205
- # highlight_table_line(value)
206
-
207
-
208
- elif category_two == 'cross_xquad':
209
-
210
- subtitle = 'Cross-XQUAD'
211
-
212
- options = {
213
- "title": {"text": f"{subtitle}"},
214
- "tooltip": {
215
- "trigger": "axis",
216
- "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
217
- "triggerOn": 'mousemove',
218
- },
219
- "legend": {"data": ['Overall Accuracy','Cross-Lingual Consistency', 'AC3',
220
- 'English', 'Chinese', 'Spanish', 'Vietnamese', 'Indonesian', 'Malay', 'Filipino']},
221
- "toolbox": {"feature": {"saveAsImage": {}}},
222
- "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
223
- "xAxis": [
224
- {
225
- "type": "category",
226
- "boundaryGap": True,
227
- "data": chart_data['Model'].tolist(),
228
- }
229
- ],
230
- "yAxis": [{"type": "value",
231
- "min": min_value,
232
- "max": max_value,
233
- "boundaryGap": True
234
- # "splitNumber": 10
235
- }],
236
- "series": [
237
- {
238
- "name": "Overall Accuracy",
239
- "type": "bar",
240
- "data": chart_data['Accuracy'].tolist(),
241
- },
242
- {
243
- "name": "Cross-Lingual Consistency",
244
- "type": "bar",
245
- "data": chart_data["Cross-Lingual Consistency"].tolist(),
246
- },
247
- {
248
- "name": "AC3",
249
- "type": "bar",
250
- "data": chart_data["AC3"].tolist(),
251
- },
252
- {
253
- "name": "English",
254
- "type": "bar",
255
- "data": chart_data["English"].tolist(),
256
- },
257
- {
258
- "name": "Chinese",
259
- "type": "bar",
260
- "data": chart_data["Chinese"].tolist(),
261
- },
262
- {
263
- "name": "Spanish",
264
- "type": "bar",
265
- "data": chart_data["Spanish"].tolist(),
266
- },
267
- {
268
- "name": "Vietnamese",
269
- "type": "bar",
270
- "data": chart_data["Vietnamese"].tolist(),
271
- },
272
- ],
273
- }
274
-
275
- # events = {
276
- # "click": "function(params) { return params.value }"
277
- # }
278
-
279
- value = st_echarts(options=options, height="500px")
280
-
281
- # if value != None:
282
- # # print(value)
283
- # nav_to(value)
284
-
285
- # if value != None:
286
- # highlight_table_line(value)
287
-
288
- ### create table
289
- st.divider()
290
- # chart_data['Link'] = chart_data['Model'].map(links_dic)
291
- st.dataframe(chart_data,
292
- # column_config = {
293
- # "Link": st.column_config.LinkColumn(
294
- # display_text= st.image(huggingface_image)
295
- # ),
296
- # },
297
- hide_index = True,
298
- use_container_width=True)
299
-
300
 
 
 
 
 
301
 
302
- def draw_only_acc(folder_name, category_one, category_two, sorted):
303
- # Cultural Reasonling / General Reasoning / Emotion / Fundamental NLP Tasks
304
-
305
- folder = f"./results/{folder_name}/"
306
- category_two_dict = {}
307
-
308
- if folder_name == 'cultural_reasoning':
309
- category_two_dict = {'SG EVAL': 'sg_eval',
310
- 'SG EVAL V1 Cleaned': 'sg_eval_v1_cleaned',
311
- 'SG EVAL V2 MCQ': 'sg_eval_v2_mcq',
312
- 'SG EVAL V2 Open Ended': 'sg_eval_v2_open',
313
- 'US EVAL': 'us_eval',
314
- 'CN EVAL': 'cn_eval',
315
- 'PH EVAL': 'ph_eval'}
316
- elif folder_name == 'general_reasoning':
317
- category_two_dict = {'MMLU': 'mmlu',
318
- 'C Eval': 'c_eval',
319
- 'CMMLU': 'cmmlu',
320
- 'ZBench': 'zbench',
321
- 'IndoMMLU': 'indommlu'}
322
-
323
- elif folder_name == 'emotion':
324
- category_two_dict = {'Indonesian Emotion Classification': 'ind_emotion',
325
- 'SST2': 'sst2'}
326
-
327
- elif folder_name == 'fundamental_nlp_tasks':
328
- category_two_dict = {'OCNLI': 'ocnli',
329
- 'C3': 'c3',
330
- 'COLA': 'cola',
331
- 'QQP': 'qqp',
332
- 'MNLI': 'mnli',
333
- 'QNLI': 'qnli',
334
- 'WNLI': 'wnli',
335
- 'RTE': 'rte',
336
- 'MRPC': 'mrpc'}
337
 
338
- subtitle = category_two_dict[category_two]
339
- data_path = f'{folder}/{category_one}/{subtitle}.csv'
340
- chart_data = pd.read_csv(data_path).round(3)
341
 
342
- st.markdown("""
343
- <style>
344
- .stMultiSelect [data-baseweb=select] span {
345
- max-width: 800px;
346
- font-size: 0.9rem;
347
- background-color: #3C6478 !important; /* Background color for selected items */
348
- color: white; /* Change text color */
349
- back
350
- }
351
- </style>
352
- """, unsafe_allow_html=True)
353
- models = st.multiselect("Please choose the models", chart_data['Model'].tolist(), default = chart_data['Model'].tolist())
354
- chart_data = chart_data[chart_data['Model'].isin(models)]
355
-
356
- if sorted == 'Ascending':
357
  ascend = True
358
  else:
359
  ascend = False
360
 
361
- chart_data = chart_data.sort_values(by=['Accuracy'], ascending=ascend)
362
-
363
- min_value = round(chart_data.iloc[:, 1::].min().min() - 0.1, 1)
364
- max_value = round(chart_data.iloc[:, 1::].max().max() + 0.1, 1)
365
-
366
- options = {
367
- "title": {"text": f"{category_two}"},
368
- "tooltip": {
369
- "trigger": "axis",
370
- "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
371
- "triggerOn": 'mousemove',
372
- },
373
- "legend": {"data": ['Overall Accuracy']},
374
- "toolbox": {"feature": {"saveAsImage": {}}},
375
- "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
376
- "xAxis": [
377
- {
378
- "type": "category",
379
- "boundaryGap": True,
380
- "triggerEvent": True,
381
- "data": chart_data['Model'].tolist(),
382
- }
383
- ],
384
- "yAxis": [{"type": "value",
385
- "min": min_value,
386
- "max": max_value,
387
- "boundaryGap": True
388
- # "splitNumber": 10
389
- }],
390
- "series": [
391
- {
392
- "name": "Overall Accuracy",
393
- "type": "bar",
394
- "data": chart_data['Accuracy'].tolist(),
395
- },
396
-
397
- ],
398
  }
399
-
400
- # events = {
401
- # "click": "function(params) { return params.value }"
402
- # }
403
-
404
- value = st_echarts(options=options, height="500px")
405
-
406
- # if value != None:
407
- # # print(value)
408
- # nav_to(value)
409
-
410
- # if value != None:
411
- # highlight_table_line(value)
412
-
413
- ### create table
414
- st.divider()
415
- # chart_data['Link'] = chart_data['Model'].map(links_dic)
416
- st.dataframe(chart_data,
417
- # column_config = {
418
- # "Link": st.column_config.LinkColumn(
419
- # display_text= st.image(huggingface_image)
420
- # ),
421
- # },
422
- hide_index = True,
423
- use_container_width=True)
424
 
425
- def draw_flores_translation(category_one, category_two, sorted):
426
- folder = "./results/flores_translation/"
427
- category_two_dict = {'Indonesian to English': 'ind2eng',
428
- 'Vitenamese to English': 'vie2eng',
429
- 'Chinese to English': 'zho2eng',
430
- 'Malay to English': 'zsm2eng'}
431
-
432
- subtitle = category_two_dict[category_two]
433
-
434
- data_path = f'{folder}/{category_one}/{subtitle}.csv'
435
- chart_data = pd.read_csv(data_path).round(3)
436
- st.markdown("""
437
- <style>
438
- .stMultiSelect [data-baseweb=select] span {
439
- max-width: 800px;
440
- font-size: 0.9rem;
441
- background-color: #3C6478 !important; /* Background color for selected items */
442
- color: white; /* Change text color */
443
- back
444
- }
445
-
446
- </style>
447
- """, unsafe_allow_html=True)
448
- models = st.multiselect("Please choose the models", chart_data['Model'].tolist(), default = chart_data['Model'].tolist())
449
- chart_data = chart_data[chart_data['Model'].isin(models)]
450
-
451
- if sorted == 'Ascending':
452
- ascend = True
453
- else:
454
- ascend = False
455
-
456
- chart_data = chart_data.sort_values(by=['BLEU'], ascending=ascend)
457
 
458
- min_value = round(chart_data.iloc[:, 1::].min().min() - 0.1, 1)
459
- max_value = round(chart_data.iloc[:, 1::].max().max() + 0.1, 1)
460
-
461
  options = {
462
- "title": {"text": f"{category_two}"},
463
  "tooltip": {
464
  "trigger": "axis",
465
  "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
466
  "triggerOn": 'mousemove',
467
  },
468
- "legend": {"data": ['BLEU']},
469
  "toolbox": {"feature": {"saveAsImage": {}}},
470
  "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
471
  "xAxis": [
@@ -473,7 +80,7 @@ def draw_flores_translation(category_one, category_two, sorted):
473
  "type": "category",
474
  "boundaryGap": True,
475
  "triggerEvent": True,
476
- "data": chart_data['Model'].tolist(),
477
  }
478
  ],
479
  "yAxis": [{"type": "value",
@@ -482,181 +89,50 @@ def draw_flores_translation(category_one, category_two, sorted):
482
  "boundaryGap": True
483
  # "splitNumber": 10
484
  }],
485
- "series": [
486
- {
487
- "name": "BLEU",
488
  "type": "bar",
489
- "data": chart_data['BLEU'].tolist(),
490
- },
491
-
492
- ],
493
  }
494
 
495
- # events = {
496
- # "click": "function(params) { return params.value }"
497
- # }
498
-
499
- value = st_echarts(options=options, height="500px")
500
-
501
- # if value != None:
502
- # # print(value)
503
- # nav_to(value)
504
-
505
-
506
- ### create table
507
- st.divider()
508
- # chart_data['Link'] = chart_data['Model'].map(links_dic)
509
- st.dataframe(chart_data,
510
- # column_config = {
511
- # "Link": st.column_config.LinkColumn(
512
- # display_text= st.image(huggingface_image)
513
- # ),
514
- # },
515
- hide_index = True,
516
- use_container_width=True)
517
-
518
 
519
- def draw_dialogue(category_one, category_two, sort, sorted):
520
- folder = "./results/dialogue"
521
- category_two_dict = {'DREAM': 'dream',
522
- 'SAMSum': 'samsum',
523
- 'DialogSum': 'dialogsum'}
524
-
525
- subtitle = category_two_dict[category_two]
526
-
527
- data_path = f'{folder}/{category_one}/{subtitle}.csv'
528
- chart_data = pd.read_csv(data_path).round(3)
 
 
 
 
 
529
 
530
- st.markdown("""
531
- <style>
532
- .stMultiSelect [data-baseweb=select] span {
533
- max-width: 800px;
534
- font-size: 0.9rem;
535
- background-color: #3C6478 !important; /* Background color for selected items */
536
- color: white; /* Change text color */
537
- back
538
- }
539
- </style>
540
- """, unsafe_allow_html=True)
541
- models = st.multiselect("Please choose the models", chart_data['Model'].tolist(), default = chart_data['Model'].tolist())
542
- chart_data = chart_data[chart_data['Model'].isin(models)]
543
-
544
- if sorted == 'Ascending':
545
- ascend = True
546
- else:
547
- ascend = False
548
-
549
- chart_data = chart_data.sort_values(by=[sort], ascending=ascend)
550
 
551
- min_value = round(chart_data.iloc[:, 1::].min().min() - 0.1, 1)
552
- max_value = round(chart_data.iloc[:, 1::].max().max() + 0.1, 1)
553
-
554
 
555
- options = {}
556
- if category_two in ['SAMSum', 'DialogSum']:
557
- options = {
558
- "title": {"text": f"{category_two}"},
559
- "tooltip": {
560
- "trigger": "axis",
561
- "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
562
- "triggerOn": 'mousemove',
563
- },
564
- "legend": {"data": list(chart_data.columns)},
565
- "toolbox": {"feature": {"saveAsImage": {}}},
566
- "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
567
- "xAxis": [
568
- {
569
- "type": "category",
570
- "boundaryGap": True,
571
- "triggerEvent": True,
572
- "data": chart_data['Model'].tolist(),
573
- }
574
- ],
575
- "yAxis": [{"type": "value",
576
- "min": min_value,
577
- "max": max_value,
578
- "boundaryGap": True
579
- # "splitNumber": 10
580
- }],
581
- "series": [
582
- {
583
- "name": "Average",
584
- "type": "bar",
585
- "data": chart_data['Average'].tolist(),
586
- },
587
- {
588
- "name": "ROUGE-1",
589
- "type": "bar",
590
- "data": chart_data["ROUGE-1"].tolist(),
591
- },
592
- {
593
- "name": "ROUGE-2",
594
- "type": "bar",
595
- "data": chart_data["ROUGE-2"].tolist(),
596
- },
597
- {
598
- "name": "ROUGE-L",
599
- "type": "bar",
600
- "data": chart_data["ROUGE-L"].tolist(),
601
- },
602
 
603
- ],
604
- }
605
-
606
- elif category_two == 'DREAM':
607
- options = {
608
- "title": {"text": f"{category_two}"},
609
- "tooltip": {
610
- "trigger": "axis",
611
- "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
612
- "triggerOn": 'mousemove',
613
- },
614
- "legend": {"data": list(chart_data.columns)},
615
- "toolbox": {"feature": {"saveAsImage": {}}},
616
- "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
617
- "xAxis": [
618
- {
619
- "type": "category",
620
- "boundaryGap": True,
621
- "triggerEvent": True,
622
- "data": chart_data['Model'].tolist(),
623
- }
624
- ],
625
- "yAxis": [{"type": "value",
626
- "min": min_value,
627
- "max": max_value,
628
- # "splitNumber": 10
629
- "boundaryGap": True
630
- }],
631
- "series": [
632
- {
633
- "name": "Accuracy",
634
- "type": "bar",
635
- "data": chart_data['Accuracy'].tolist(),
636
  },
 
 
 
637
 
638
- ],
639
- }
640
-
641
- # events = {
642
- # "click": "function(params) { return params.value }"
643
- # }
644
-
645
- value = st_echarts(options=options, height="500px")
646
-
647
- # if value != None:
648
- # # print(value)
649
- # nav_to(value)
650
-
651
-
652
- ### create table
653
- st.divider()
654
- # chart_data['Link'] = chart_data['Model'].map(links_dic)
655
- st.dataframe(chart_data,
656
- # column_config = {
657
- # "Link": st.column_config.LinkColumn(
658
- # display_text= st.image(huggingface_image)
659
- # ),
660
- # },
661
- hide_index = True,
662
- use_container_width=True)
 
2
  import pandas as pd
3
  import numpy as np
4
  from streamlit_echarts import st_echarts
5
+ from streamlit.components.v1 import html
6
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ path = "./style/Leaderboard-Rename-SeaEval.csv"
9
+ info_df = pd.read_csv(path).dropna(axis=0)
10
 
11
+ def draw(folder_name, category_one, category_two, sort, num_sort):
12
+
13
+ folder = f"./results/{folder_name}/"
14
  data_path = f'{folder}/{category_one}/{category_two}.csv'
15
  chart_data = pd.read_csv(data_path).dropna(axis='columns').round(3)
16
  st.markdown("""
 
24
  }
25
  </style>
26
  """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ # remap model names
29
+ display_model_names = {key.strip() :val.strip() for key, val in zip(info_df['Original Name'], info_df['Proper Display Name'])}
30
+ chart_data['model_show'] = chart_data['Model'].map(display_model_names)
31
+ chart_data['model_show'] = chart_data['model_show'].fillna(chart_data['Model'].apply(lambda x: x.replace('_', '-')))
32
 
33
+ st.session_state.models = st.multiselect("Please choose the model",
34
+ sorted(chart_data['model_show'].tolist()),
35
+ default = sorted(chart_data['model_show'].tolist()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ chart_data = chart_data[chart_data['model_show'].isin(st.session_state.models)]
 
 
38
 
39
+ if num_sort == 'Ascending':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  ascend = True
41
  else:
42
  ascend = False
43
 
44
+ chart_data = chart_data.sort_values(by=[sort], ascending=ascend).dropna(axis=0)
45
+
46
+ if len(chart_data) == 0:
47
+ return
48
+
49
+ min_value = round(min(chart_data.iloc[:, 1]) - 0.1*min(chart_data.iloc[:, 1]), 1)
50
+ max_value = round(max(chart_data.iloc[:, 1]) + 0.1*max(chart_data.iloc[:, 1]), 1)
51
+
52
+ display_names = {
53
+ 'cross_mmlu': 'Cross-MMLU',
54
+ 'cross_logiqa': 'Cross-LogiQA',
55
+ 'cross_xquad': 'Cross-XQUAD',
56
+ 'sg_eval': 'SG EVAL',
57
+ 'sg_eval_v1_cleaned': 'SG EVAL V1 Cleaned',
58
+ 'sg_eval_v2_mcq': 'SG EVAL V2 MCQ',
59
+ 'sg_eval_v2_open': 'SG EVAL V2 Open Ended',
60
+ 'us_eval': 'US EVAL',
61
+ 'cn_eval': 'CN EVAL',
62
+ 'ph_eval': 'PH EVAL'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ # breakpoint()
66
+ data_columns = [i for i in chart_data.columns if i not in ['Model', 'model_show']]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
 
 
68
  options = {
69
+ # "title": {"text": f"{display_names[category_two]}"},
70
  "tooltip": {
71
  "trigger": "axis",
72
  "axisPointer": {"type": "cross", "label": {"backgroundColor": "#6a7985"}},
73
  "triggerOn": 'mousemove',
74
  },
75
+ "legend": {"data": data_columns},
76
  "toolbox": {"feature": {"saveAsImage": {}}},
77
  "grid": {"left": "3%", "right": "4%", "bottom": "3%", "containLabel": True},
78
  "xAxis": [
 
80
  "type": "category",
81
  "boundaryGap": True,
82
  "triggerEvent": True,
83
+ "data": chart_data['model_show'].tolist(),
84
  }
85
  ],
86
  "yAxis": [{"type": "value",
 
89
  "boundaryGap": True
90
  # "splitNumber": 10
91
  }],
92
+ "series": [{
93
+ "name": f"{col}",
 
94
  "type": "bar",
95
+ "data": chart_data[f'{col}'].tolist(),
96
+ } for col in data_columns],
 
 
97
  }
98
 
99
+ events = {
100
+ "click": "function(params) { return params.value }"
101
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ value = st_echarts(options=options, events=events, height="500px")
104
+
105
+ '''
106
+ Show table
107
+ '''
108
+ # st.divider()
109
+ with st.container():
110
+ # st.write("")
111
+ st.markdown('##### TABLE')
112
+ # custom_css = """
113
+
114
+ # """
115
+ # st.markdown(custom_css, unsafe_allow_html=True)
116
+
117
+ model_link = {key.strip(): val for key, val in zip(info_df['Proper Display Name'], info_df['Link'])}
118
 
119
+ chart_data['model_link'] = chart_data['model_show'].map(model_link)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ # import pdb
122
+ # pdb.set_trace()
 
123
 
124
+ chart_data_table = chart_data[['model_show', 'model_link'] + data_columns]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ st.dataframe(
127
+ chart_data_table,
128
+ column_config={
129
+ 'model_show': 'Model',
130
+ chart_data_table.columns[1]: {'alignment': 'center'},
131
+ "model_link": st.column_config.LinkColumn(
132
+ "Model Link",
133
+ ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  },
135
+ hide_index=True,
136
+ use_container_width=True
137
+ )
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/pages.py CHANGED
@@ -90,15 +90,15 @@ def cross_lingual_consistency():
90
  sort = st.selectbox('Sort', ['Accuracy','Cross-Lingual Consistency', 'AC3',
91
  'English', 'Chinese', 'Spanish', 'Vietnamese'])
92
  with right:
93
- sorted = st.selectbox('by', ['Ascending', 'Descending'])
94
 
95
- if category_one or category_two or sort or sorted:
96
  category_one = category_one_dict[category_one]
97
  category_two = category_two_dict[category_two]
98
 
99
- draw_cross_lingual(category_one, category_two, sort, sorted)
100
- else:
101
- draw_cross_lingual('zero_shot', 'cross_mmlu', 'Accuracy', 'Descending')
102
 
103
  def cultural_reasoning():
104
  st.title("Cultural Reasoning")
@@ -116,6 +116,13 @@ def cultural_reasoning():
116
 
117
  category_one_dict = {'Zero Shot': 'zero_shot',
118
  'Few Shot': 'few_shot'}
 
 
 
 
 
 
 
119
 
120
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
121
  with left:
@@ -123,13 +130,14 @@ def cultural_reasoning():
123
  with center:
124
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
125
  with right:
126
- sorted = st.selectbox('sorted by', ['Ascending', 'Descending'])
127
 
128
- if category_one or category_two or sorted:
129
  category_one = category_one_dict[category_one]
130
- draw_only_acc('cultural_reasoning', category_one, category_two, sorted)
131
- else:
132
- draw_only_acc('cultural_reasoning', 'zero_shot', 'sg_eval', 'Descending')
 
133
 
134
 
135
  def general_reasoning():
@@ -146,6 +154,11 @@ def general_reasoning():
146
 
147
  category_one_dict = {'Zero Shot': 'zero_shot',
148
  'Few Shot': 'few_shot'}
 
 
 
 
 
149
 
150
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
151
  with left:
@@ -153,13 +166,14 @@ def general_reasoning():
153
  with center:
154
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
155
  with right:
156
- sorted = st.selectbox('sorted by', ['Ascending', 'Descending'])
157
 
158
- if category_one or category_two or sorted:
159
  category_one = category_one_dict[category_one]
160
- draw_only_acc('general_reasoning', category_one, category_two, sorted)
161
- else:
162
- draw_only_acc('general_reasoning', 'zero_shot', 'MMLU Full', 'Descending')
 
163
 
164
  def flores():
165
  st.title("FLORES-Translation")
@@ -173,6 +187,10 @@ def flores():
173
 
174
  category_one_dict = {'Zero Shot': 'zero_shot',
175
  'Few Shot': 'few_shot'}
 
 
 
 
176
 
177
 
178
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
@@ -181,13 +199,14 @@ def flores():
181
  with center:
182
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
183
  with right:
184
- sorted = st.selectbox('sorted by', ['Ascending', 'Descending'])
185
 
186
- if category_one or category_two or sorted:
187
  category_one = category_one_dict[category_one]
188
- draw_flores_translation(category_one, category_two, sorted)
189
- else:
190
- draw_flores_translation('zero_shot', 'Indonesian to English', 'Descending')
 
191
 
192
  def emotion():
193
  st.title("Emotion")
@@ -200,6 +219,8 @@ def emotion():
200
 
201
  category_one_dict = {'Zero Shot': 'zero_shot',
202
  'Few Shot': 'few_shot'}
 
 
203
 
204
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
205
  with left:
@@ -207,13 +228,14 @@ def emotion():
207
  with center:
208
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
209
  with right:
210
- sorted = st.selectbox('sorted by', ['Ascending', 'Descending'])
211
 
212
- if category_one or category_two or sorted:
213
  category_one = category_one_dict[category_one]
214
- draw_only_acc('emotion', category_one, category_two, sorted)
215
- else:
216
- draw_only_acc('emotion', 'zero_shot', 'Indonesian Emotion Classification', 'Descending')
 
217
 
218
  def dialogue():
219
  st.title("Dialogue")
@@ -227,6 +249,9 @@ def dialogue():
227
 
228
  category_one_dict = {'Zero Shot': 'zero_shot',
229
  'Few Shot': 'few_shot'}
 
 
 
230
 
231
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
232
  with left:
@@ -240,13 +265,14 @@ def dialogue():
240
  sort = st.selectbox('Sort', ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'])
241
 
242
  with right:
243
- sorted = st.selectbox('by', ['Ascending', 'Descending'])
244
 
245
- if category_one or category_two or sort or sorted:
246
  category_one = category_one_dict[category_one]
247
- draw_dialogue(category_one, category_two, sort, sorted)
248
- else:
249
- draw_dialogue('zero_shot', 'DREAM', sort[0],'Descending')
 
250
 
251
  def fundamental_nlp_tasks():
252
  st.title("Fundamental NLP Tasks")
@@ -256,6 +282,15 @@ def fundamental_nlp_tasks():
256
 
257
  category_one_dict = {'Zero Shot': 'zero_shot',
258
  'Few Shot': 'few_shot'}
 
 
 
 
 
 
 
 
 
259
 
260
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
261
  with left:
@@ -263,10 +298,11 @@ def fundamental_nlp_tasks():
263
  with center:
264
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
265
  with right:
266
- sorted = st.selectbox('sorted by', ['Ascending', 'Descending'])
267
 
268
- if category_one or category_two or sorted:
269
  category_one = category_one_dict[category_one]
270
- draw_only_acc('fundamental_nlp_tasks', category_one, category_two, sorted)
271
- else:
272
- draw_only_acc('fundamental_nlp_tasks', 'zero_shot', 'OCNLI', 'Descending')
 
 
90
  sort = st.selectbox('Sort', ['Accuracy','Cross-Lingual Consistency', 'AC3',
91
  'English', 'Chinese', 'Spanish', 'Vietnamese'])
92
  with right:
93
+ sortby = st.selectbox('by', ['Ascending', 'Descending'])
94
 
95
+ if category_one or category_two or sort or sortby:
96
  category_one = category_one_dict[category_one]
97
  category_two = category_two_dict[category_two]
98
 
99
+ draw('cross_lingual',category_one, category_two, sort, sortby)
100
+ # else:
101
+ # draw('zero_shot', 'cross_mmlu', 'Accuracy', 'Descending')
102
 
103
  def cultural_reasoning():
104
  st.title("Cultural Reasoning")
 
116
 
117
  category_one_dict = {'Zero Shot': 'zero_shot',
118
  'Few Shot': 'few_shot'}
119
+ category_two_dict = {'SG EVAL': 'sg_eval',
120
+ 'SG EVAL V1 Cleaned': 'sg_eval_v1_cleaned',
121
+ 'SG EVAL V2 MCQ': 'sg_eval_v2_mcq',
122
+ 'SG EVAL V2 Open Ended': 'sg_eval_v2_open',
123
+ 'US EVAL': 'us_eval',
124
+ 'CN EVAL': 'cn_eval',
125
+ 'PH EVAL': 'ph_eval'}
126
 
127
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
128
  with left:
 
130
  with center:
131
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
132
  with right:
133
+ sortby = st.selectbox('sorted by', ['Ascending', 'Descending'])
134
 
135
+ if category_one or category_two or sortby:
136
  category_one = category_one_dict[category_one]
137
+ category_two = category_two_dict[category_two]
138
+ draw('cultural_reasoning', category_one, category_two, 'Accuracy',sortby)
139
+ # else:
140
+ # draw_only_acc('cultural_reasoning', 'zero_shot', 'sg_eval', 'Descending')
141
 
142
 
143
  def general_reasoning():
 
154
 
155
  category_one_dict = {'Zero Shot': 'zero_shot',
156
  'Few Shot': 'few_shot'}
157
+ category_two_dict = {'MMLU': 'mmlu',
158
+ 'C Eval': 'c_eval',
159
+ 'CMMLU': 'cmmlu',
160
+ 'ZBench': 'zbench',
161
+ 'IndoMMLU': 'indommlu'}
162
 
163
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
164
  with left:
 
166
  with center:
167
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
168
  with right:
169
+ sortby = st.selectbox('sorted by', ['Ascending', 'Descending'])
170
 
171
+ if category_one or category_two or sortby:
172
  category_one = category_one_dict[category_one]
173
+ category_two = category_two_dict[category_two]
174
+ draw('general_reasoning', category_one, category_two, 'Accuracy',sortby)
175
+ # else:
176
+ # draw_only_acc('general_reasoning', 'zero_shot', 'MMLU Full', 'Descending')
177
 
178
  def flores():
179
  st.title("FLORES-Translation")
 
187
 
188
  category_one_dict = {'Zero Shot': 'zero_shot',
189
  'Few Shot': 'few_shot'}
190
+ category_two_dict = {'Indonesian to English': 'ind2eng',
191
+ 'Vitenamese to English': 'vie2eng',
192
+ 'Chinese to English': 'zho2eng',
193
+ 'Malay to English': 'zsm2eng'}
194
 
195
 
196
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
 
199
  with center:
200
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
201
  with right:
202
+ sortby = st.selectbox('sorted by', ['Ascending', 'Descending'])
203
 
204
+ if category_one or category_two or sortby:
205
  category_one = category_one_dict[category_one]
206
+ category_two = category_two_dict[category_two]
207
+ draw('flores_translation', category_one, category_two, 'BLEU',sortby)
208
+ # else:
209
+ # draw_flores_translation('zero_shot', 'Indonesian to English', 'Descending')
210
 
211
  def emotion():
212
  st.title("Emotion")
 
219
 
220
  category_one_dict = {'Zero Shot': 'zero_shot',
221
  'Few Shot': 'few_shot'}
222
+ category_two_dict = {'Indonesian Emotion Classification': 'ind_emotion',
223
+ 'SST2': 'sst2'}
224
 
225
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
226
  with left:
 
228
  with center:
229
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
230
  with right:
231
+ sortby = st.selectbox('sorted by', ['Ascending', 'Descending'])
232
 
233
+ if category_one or category_two or sortby:
234
  category_one = category_one_dict[category_one]
235
+ category_two = category_two_dict[category_two]
236
+ draw('emotion', category_one, category_two, 'Accuracy', sortby)
237
+ # else:
238
+ # draw_only_acc('emotion', 'zero_shot', 'Indonesian Emotion Classification', 'Descending')
239
 
240
  def dialogue():
241
  st.title("Dialogue")
 
249
 
250
  category_one_dict = {'Zero Shot': 'zero_shot',
251
  'Few Shot': 'few_shot'}
252
+ category_two_dict = {'DREAM': 'dream',
253
+ 'SAMSum': 'samsum',
254
+ 'DialogSum': 'dialogsum'}
255
 
256
  left, center, _, middle,right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
257
  with left:
 
265
  sort = st.selectbox('Sort', ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'])
266
 
267
  with right:
268
+ sortby = st.selectbox('by', ['Ascending', 'Descending'])
269
 
270
+ if category_one or category_two or sort or sortby:
271
  category_one = category_one_dict[category_one]
272
+ category_two = category_two_dict[category_two]
273
+ draw('dialogue', category_one, category_two, sort, sortby)
274
+ # else:
275
+ # draw_dialogue('zero_shot', 'DREAM', sort[0],'Descending')
276
 
277
  def fundamental_nlp_tasks():
278
  st.title("Fundamental NLP Tasks")
 
282
 
283
  category_one_dict = {'Zero Shot': 'zero_shot',
284
  'Few Shot': 'few_shot'}
285
+ category_two_dict = {'OCNLI': 'ocnli',
286
+ 'C3': 'c3',
287
+ 'COLA': 'cola',
288
+ 'QQP': 'qqp',
289
+ 'MNLI': 'mnli',
290
+ 'QNLI': 'qnli',
291
+ 'WNLI': 'wnli',
292
+ 'RTE': 'rte',
293
+ 'MRPC': 'mrpc'}
294
 
295
  left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])
296
  with left:
 
298
  with center:
299
  category_two = st.selectbox('Select the sub-category', filters_leveltwo)
300
  with right:
301
+ sortby = st.selectbox('sorted by', ['Ascending', 'Descending'])
302
 
303
+ if category_one or category_two or sortby:
304
  category_one = category_one_dict[category_one]
305
+ category_two = category_two_dict[category_two]
306
+ draw('fundamental_nlp_tasks', category_one, category_two, 'Accuracy', sortby)
307
+ # else:
308
+ # draw_only_acc('fundamental_nlp_tasks', 'zero_shot', 'OCNLI', 'Descending')