binwang committed on
Commit
14a87de
·
1 Parent(s): 1ad7fa2
Files changed (1) hide show
  1. app.py +3 -43
app.py CHANGED
@@ -2204,12 +2204,9 @@ block = gr.Blocks(theme='rottenlittlecreature/Moon_Goblin')
2204
  with block:
2205
  gr.Markdown(f"""
2206
  ### SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
2207
- - **Number of Datasets**: > 30
2208
- - **Number of Languages**: > 8
2209
- - **Number of Models**: {NUM_MODELS}
2210
- - **Mode of Evaluation**: Zero-Shot, Five-Shot
2211
 
2212
- ### Know Issues:
2213
  - For base models, the output of base model is not truncated as no EOS detected. Evaluation could be affected, especially with length-aware metrics.
2214
 
2215
  ### The following table shows the performance of the models on the SeaEval benchmark.
@@ -2261,7 +2258,6 @@ with block:
2261
  with gr.Row():
2262
  gr.Markdown("""
2263
  **Cross-XQUAD Leaderboard** 🔮
2264
-
2265
  - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2266
  - **Languages:** English, Chinese, Spanish, Vietnamese
2267
  """)
@@ -2305,7 +2301,6 @@ with block:
2305
  with gr.Row():
2306
  gr.Markdown("""
2307
  **Cross-MMLU Leaderboard** 🔮
2308
-
2309
  - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2310
  - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2311
  """)
@@ -2347,7 +2342,6 @@ with block:
2347
  with gr.Row():
2348
  gr.Markdown("""
2349
  **Cross-LogiQA Leaderboard** 🔮
2350
-
2351
  - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2352
  - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2353
  """)
@@ -2377,7 +2371,6 @@ with block:
2377
  with gr.Row():
2378
  gr.Markdown("""
2379
  **SG_EVAL Leaderboard** 🔮
2380
-
2381
  - **Metric:** Accuracy
2382
  - **Languages:** English
2383
  """)
@@ -2406,7 +2399,6 @@ with block:
2406
  with gr.Row():
2407
  gr.Markdown("""
2408
  **US_EVAL Leaderboard** 🔮
2409
-
2410
  - **Metric:** Accuracy
2411
  - **Languages:** English
2412
  """)
@@ -2434,7 +2426,6 @@ with block:
2434
  with gr.Row():
2435
  gr.Markdown("""
2436
  **CN_EVAL Leaderboard** 🔮
2437
-
2438
  - **Metric:** Accuracy
2439
  - **Languages:** Chinese
2440
  """)
@@ -2461,7 +2452,6 @@ with block:
2461
  with gr.Row():
2462
  gr.Markdown("""
2463
  **PH_EVAL Leaderboard** 🔮
2464
-
2465
  - **Metric:** Accuracy
2466
  - **Languages:** English
2467
  """)
@@ -2488,7 +2478,6 @@ with block:
2488
  with gr.Row():
2489
  gr.Markdown("""
2490
  **SING2ENG Leaderboard** 🔮
2491
-
2492
  - **Metric:** BLEU Avg.
2493
  - **Languages:** English
2494
  """)
@@ -2518,7 +2507,6 @@ with block:
2518
  with gr.Row():
2519
  gr.Markdown("""
2520
  **MMLU Leaderboard** 🔮
2521
-
2522
  - **Metric:** Accuracy.
2523
  - **Languages:** English
2524
  """)
@@ -2546,7 +2534,6 @@ with block:
2546
  with gr.Row():
2547
  gr.Markdown("""
2548
  **MMLU Full Leaderboard** 🔮
2549
-
2550
  - **Metric:** Accuracy.
2551
  - **Languages:** English
2552
  """)
@@ -2574,7 +2561,6 @@ with block:
2574
  with gr.Row():
2575
  gr.Markdown("""
2576
  **C_EVAL Leaderboard** 🔮
2577
-
2578
  - **Metric:** Accuracy.
2579
  - **Languages:** Chinese
2580
  """)
@@ -2602,7 +2588,6 @@ with block:
2602
  with gr.Row():
2603
  gr.Markdown("""
2604
  **C_EVAL Full Leaderboard** 🔮
2605
-
2606
  - **Metric:** Accuracy.
2607
  - **Languages:** Chinese
2608
  """)
@@ -2629,7 +2614,6 @@ with block:
2629
  with gr.Row():
2630
  gr.Markdown("""
2631
  **CMMLU Leaderboard** 🔮
2632
-
2633
  - **Metric:** Accuracy.
2634
  - **Languages:** Chinese
2635
  """)
@@ -2657,7 +2641,6 @@ with block:
2657
  with gr.Row():
2658
  gr.Markdown("""
2659
  **CMMLU Full Leaderboard** 🔮
2660
-
2661
  - **Metric:** Accuracy.
2662
  - **Languages:** Chinese
2663
  """)
@@ -2684,7 +2667,6 @@ with block:
2684
  with gr.Row():
2685
  gr.Markdown("""
2686
  **ZBench Leaderboard** 🔮
2687
-
2688
  - **Metric:** Accuracy.
2689
  - **Languages:** Chinese
2690
  """)
@@ -2710,7 +2692,6 @@ with block:
2710
  with gr.Row():
2711
  gr.Markdown("""
2712
  **IndoMMLU Leaderboard** 🔮
2713
-
2714
  - **Metric:** Accuracy.
2715
  - **Languages:** Bahasa Indonesian
2716
  """)
@@ -2740,7 +2721,6 @@ with block:
2740
  with gr.Row():
2741
  gr.Markdown("""
2742
  **flores_ind2eng Leaderboard** 🔮
2743
-
2744
  - **Metric:** BLEU Avg.
2745
  - **Languages:** English
2746
  """)
@@ -2767,7 +2747,6 @@ with block:
2767
  with gr.Row():
2768
  gr.Markdown("""
2769
  **flores_vie2eng Leaderboard** 🔮
2770
-
2771
  - **Metric:** BLEU Avg.
2772
  - **Languages:** English
2773
  """)
@@ -2795,7 +2774,6 @@ with block:
2795
  with gr.Row():
2796
  gr.Markdown("""
2797
  **flores_zho2eng Leaderboard** 🔮
2798
-
2799
  - **Metric:** BLEU Avg.
2800
  - **Languages:** English
2801
  """)
@@ -2822,7 +2800,6 @@ with block:
2822
  with gr.Row():
2823
  gr.Markdown("""
2824
  **flores_zsm2eng Leaderboard** 🔮
2825
-
2826
  - **Metric:** BLEU Avg.
2827
  - **Languages:** English
2828
  """)
@@ -2850,8 +2827,7 @@ with block:
2850
  )
2851
  with gr.Row():
2852
  gr.Markdown("""
2853
- **ind_emotion Leaderboard** 🔮
2854
-
2855
  - **Metric:** Accuracy.
2856
  - **Languages:** Indonesian
2857
  """)
@@ -2878,7 +2854,6 @@ with block:
2878
  with gr.Row():
2879
  gr.Markdown("""
2880
  **SST2 Leaderboard** 🔮
2881
-
2882
  - **Metric:** Accuracy.
2883
  - **Languages:** English
2884
  """)
@@ -2909,7 +2884,6 @@ with block:
2909
  with gr.Row():
2910
  gr.Markdown("""
2911
  **DREAM Leaderboard** 🔮
2912
-
2913
  - **Metric:** Accuracy.
2914
  - **Languages:** English
2915
  """)
@@ -2935,7 +2909,6 @@ with block:
2935
  with gr.Row():
2936
  gr.Markdown("""
2937
  **SAMSum Leaderboard** 🔮
2938
-
2939
  - **Metric:** ROUGE.
2940
  - **Languages:** English
2941
  """)
@@ -2962,7 +2935,6 @@ with block:
2962
  with gr.Row():
2963
  gr.Markdown("""
2964
  **DialogSum Leaderboard** 🔮
2965
-
2966
  - **Metric:** ROUGE.
2967
  - **Languages:** English
2968
  """)
@@ -2993,7 +2965,6 @@ with block:
2993
  with gr.Row():
2994
  gr.Markdown("""
2995
  **OCNLI Leaderboard** 🔮
2996
-
2997
  - **Metric:** Accuracy.
2998
  - **Languages:** Chinese
2999
  """)
@@ -3020,7 +2991,6 @@ with block:
3020
  with gr.Row():
3021
  gr.Markdown("""
3022
  **C3 Leaderboard** 🔮
3023
-
3024
  - **Metric:** Accuracy.
3025
  - **Languages:** Chinese
3026
  """)
@@ -3049,7 +3019,6 @@ with block:
3049
  with gr.Row():
3050
  gr.Markdown("""
3051
  **COLA Leaderboard** 🔮
3052
-
3053
  - **Metric:** Accuracy.
3054
  - **Languages:** English
3055
  """)
@@ -3076,7 +3045,6 @@ with block:
3076
  with gr.Row():
3077
  gr.Markdown("""
3078
  **QQP Leaderboard** 🔮
3079
-
3080
  - **Metric:** Accuracy.
3081
  - **Languages:** English
3082
  """)
@@ -3103,7 +3071,6 @@ with block:
3103
  with gr.Row():
3104
  gr.Markdown("""
3105
  **MNLI Leaderboard** 🔮
3106
-
3107
  - **Metric:** Accuracy.
3108
  - **Languages:** English
3109
  """)
@@ -3130,7 +3097,6 @@ with block:
3130
  with gr.Row():
3131
  gr.Markdown("""
3132
  **QNLI Leaderboard** 🔮
3133
-
3134
  - **Metric:** Accuracy.
3135
  - **Languages:** English
3136
  """)
@@ -3158,7 +3124,6 @@ with block:
3158
  with gr.Row():
3159
  gr.Markdown("""
3160
  **WNLI Leaderboard** 🔮
3161
-
3162
  - **Metric:** Accuracy.
3163
  - **Languages:** English
3164
  """)
@@ -3186,13 +3151,10 @@ with block:
3186
  with gr.Row():
3187
  gr.Markdown("""
3188
  **RTE Leaderboard** 🔮
3189
-
3190
  - **Metric:** Accuracy.
3191
  - **Languages:** English
3192
  """)
3193
 
3194
-
3195
-
3196
  # dataset
3197
  with gr.TabItem("MRPC"):
3198
  with gr.TabItem("Zero Shot"):
@@ -3214,12 +3176,10 @@ with block:
3214
  with gr.Row():
3215
  gr.Markdown("""
3216
  **MRPC Leaderboard** 🔮
3217
-
3218
  - **Metric:** Accuracy.
3219
  - **Languages:** English
3220
  """)
3221
 
3222
-
3223
  gr.Markdown(r"""
3224
  ### If our datasets and leaderboard are useful, please consider cite:
3225
  ```bibtex
 
2204
  with block:
2205
  gr.Markdown(f"""
2206
  ### SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
2207
+ - **Number of Datasets**: > 30, **Number of Languages**: > 8, **Number of Models**: {NUM_MODELS}, **Mode of Evaluation**: Zero-Shot, Five-Shot
 
 
 
2208
 
2209
+ ### Possible Issues:
2210
  - For base models, the output of base model is not truncated as no EOS detected. Evaluation could be affected, especially with length-aware metrics.
2211
 
2212
  ### The following table shows the performance of the models on the SeaEval benchmark.
 
2258
  with gr.Row():
2259
  gr.Markdown("""
2260
  **Cross-XQUAD Leaderboard** 🔮
 
2261
  - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2262
  - **Languages:** English, Chinese, Spanish, Vietnamese
2263
  """)
 
2301
  with gr.Row():
2302
  gr.Markdown("""
2303
  **Cross-MMLU Leaderboard** 🔮
 
2304
  - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2305
  - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2306
  """)
 
2342
  with gr.Row():
2343
  gr.Markdown("""
2344
  **Cross-LogiQA Leaderboard** 🔮
 
2345
  - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2346
  - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2347
  """)
 
2371
  with gr.Row():
2372
  gr.Markdown("""
2373
  **SG_EVAL Leaderboard** 🔮
 
2374
  - **Metric:** Accuracy
2375
  - **Languages:** English
2376
  """)
 
2399
  with gr.Row():
2400
  gr.Markdown("""
2401
  **US_EVAL Leaderboard** 🔮
 
2402
  - **Metric:** Accuracy
2403
  - **Languages:** English
2404
  """)
 
2426
  with gr.Row():
2427
  gr.Markdown("""
2428
  **CN_EVAL Leaderboard** 🔮
 
2429
  - **Metric:** Accuracy
2430
  - **Languages:** Chinese
2431
  """)
 
2452
  with gr.Row():
2453
  gr.Markdown("""
2454
  **PH_EVAL Leaderboard** 🔮
 
2455
  - **Metric:** Accuracy
2456
  - **Languages:** English
2457
  """)
 
2478
  with gr.Row():
2479
  gr.Markdown("""
2480
  **SING2ENG Leaderboard** 🔮
 
2481
  - **Metric:** BLEU Avg.
2482
  - **Languages:** English
2483
  """)
 
2507
  with gr.Row():
2508
  gr.Markdown("""
2509
  **MMLU Leaderboard** 🔮
 
2510
  - **Metric:** Accuracy.
2511
  - **Languages:** English
2512
  """)
 
2534
  with gr.Row():
2535
  gr.Markdown("""
2536
  **MMLU Full Leaderboard** 🔮
 
2537
  - **Metric:** Accuracy.
2538
  - **Languages:** English
2539
  """)
 
2561
  with gr.Row():
2562
  gr.Markdown("""
2563
  **C_EVAL Leaderboard** 🔮
 
2564
  - **Metric:** Accuracy.
2565
  - **Languages:** Chinese
2566
  """)
 
2588
  with gr.Row():
2589
  gr.Markdown("""
2590
  **C_EVAL Full Leaderboard** 🔮
 
2591
  - **Metric:** Accuracy.
2592
  - **Languages:** Chinese
2593
  """)
 
2614
  with gr.Row():
2615
  gr.Markdown("""
2616
  **CMMLU Leaderboard** 🔮
 
2617
  - **Metric:** Accuracy.
2618
  - **Languages:** Chinese
2619
  """)
 
2641
  with gr.Row():
2642
  gr.Markdown("""
2643
  **CMMLU Full Leaderboard** 🔮
 
2644
  - **Metric:** Accuracy.
2645
  - **Languages:** Chinese
2646
  """)
 
2667
  with gr.Row():
2668
  gr.Markdown("""
2669
  **ZBench Leaderboard** 🔮
 
2670
  - **Metric:** Accuracy.
2671
  - **Languages:** Chinese
2672
  """)
 
2692
  with gr.Row():
2693
  gr.Markdown("""
2694
  **IndoMMLU Leaderboard** 🔮
 
2695
  - **Metric:** Accuracy.
2696
  - **Languages:** Bahasa Indonesian
2697
  """)
 
2721
  with gr.Row():
2722
  gr.Markdown("""
2723
  **flores_ind2eng Leaderboard** 🔮
 
2724
  - **Metric:** BLEU Avg.
2725
  - **Languages:** English
2726
  """)
 
2747
  with gr.Row():
2748
  gr.Markdown("""
2749
  **flores_vie2eng Leaderboard** 🔮
 
2750
  - **Metric:** BLEU Avg.
2751
  - **Languages:** English
2752
  """)
 
2774
  with gr.Row():
2775
  gr.Markdown("""
2776
  **flores_zho2eng Leaderboard** 🔮
 
2777
  - **Metric:** BLEU Avg.
2778
  - **Languages:** English
2779
  """)
 
2800
  with gr.Row():
2801
  gr.Markdown("""
2802
  **flores_zsm2eng Leaderboard** 🔮
 
2803
  - **Metric:** BLEU Avg.
2804
  - **Languages:** English
2805
  """)
 
2827
  )
2828
  with gr.Row():
2829
  gr.Markdown("""
2830
+ **Ind_emotion Leaderboard** 🔮
 
2831
  - **Metric:** Accuracy.
2832
  - **Languages:** Indonesian
2833
  """)
 
2854
  with gr.Row():
2855
  gr.Markdown("""
2856
  **SST2 Leaderboard** 🔮
 
2857
  - **Metric:** Accuracy.
2858
  - **Languages:** English
2859
  """)
 
2884
  with gr.Row():
2885
  gr.Markdown("""
2886
  **DREAM Leaderboard** 🔮
 
2887
  - **Metric:** Accuracy.
2888
  - **Languages:** English
2889
  """)
 
2909
  with gr.Row():
2910
  gr.Markdown("""
2911
  **SAMSum Leaderboard** 🔮
 
2912
  - **Metric:** ROUGE.
2913
  - **Languages:** English
2914
  """)
 
2935
  with gr.Row():
2936
  gr.Markdown("""
2937
  **DialogSum Leaderboard** 🔮
 
2938
  - **Metric:** ROUGE.
2939
  - **Languages:** English
2940
  """)
 
2965
  with gr.Row():
2966
  gr.Markdown("""
2967
  **OCNLI Leaderboard** 🔮
 
2968
  - **Metric:** Accuracy.
2969
  - **Languages:** Chinese
2970
  """)
 
2991
  with gr.Row():
2992
  gr.Markdown("""
2993
  **C3 Leaderboard** 🔮
 
2994
  - **Metric:** Accuracy.
2995
  - **Languages:** Chinese
2996
  """)
 
3019
  with gr.Row():
3020
  gr.Markdown("""
3021
  **COLA Leaderboard** 🔮
 
3022
  - **Metric:** Accuracy.
3023
  - **Languages:** English
3024
  """)
 
3045
  with gr.Row():
3046
  gr.Markdown("""
3047
  **QQP Leaderboard** 🔮
 
3048
  - **Metric:** Accuracy.
3049
  - **Languages:** English
3050
  """)
 
3071
  with gr.Row():
3072
  gr.Markdown("""
3073
  **MNLI Leaderboard** 🔮
 
3074
  - **Metric:** Accuracy.
3075
  - **Languages:** English
3076
  """)
 
3097
  with gr.Row():
3098
  gr.Markdown("""
3099
  **QNLI Leaderboard** 🔮
 
3100
  - **Metric:** Accuracy.
3101
  - **Languages:** English
3102
  """)
 
3124
  with gr.Row():
3125
  gr.Markdown("""
3126
  **WNLI Leaderboard** 🔮
 
3127
  - **Metric:** Accuracy.
3128
  - **Languages:** English
3129
  """)
 
3151
  with gr.Row():
3152
  gr.Markdown("""
3153
  **RTE Leaderboard** 🔮
 
3154
  - **Metric:** Accuracy.
3155
  - **Languages:** English
3156
  """)
3157
 
 
 
3158
  # dataset
3159
  with gr.TabItem("MRPC"):
3160
  with gr.TabItem("Zero Shot"):
 
3176
  with gr.Row():
3177
  gr.Markdown("""
3178
  **MRPC Leaderboard** 🔮
 
3179
  - **Metric:** Accuracy.
3180
  - **Languages:** English
3181
  """)
3182
 
 
3183
  gr.Markdown(r"""
3184
  ### If our datasets and leaderboard are useful, please consider cite:
3185
  ```bibtex