binwang committed on
Commit
682595a
·
1 Parent(s): ee26773
Files changed (1) hide show
  1. app.py +1431 -194
app.py CHANGED
@@ -1202,287 +1202,1521 @@ def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
1202
  ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot")
1203
  ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot")
1204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1205
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1206
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1207
 
1208
- block = gr.Blocks()
1209
- with block:
1210
- gr.Markdown(f"""
1211
- SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a> Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1212
 
1213
- - **Total Datasets**: 31
1214
- - **Total Languages**: 8
1215
- - **Total Models**: {NUM_MODELS}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1216
  """)
 
1217
  with gr.Tabs():
1218
 
1219
 
1220
- # dataset 1: cross-mmlu
1221
- with gr.TabItem("Cross-MMLU"):
 
 
1222
  with gr.Row():
1223
  gr.Markdown("""
1224
- **Cross-MMLU Leaderboard** 🔮
1225
 
1226
- - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
1227
- - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
1228
  """)
1229
 
1230
  with gr.TabItem("zero_shot"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1231
 
 
 
 
 
 
 
 
 
 
1232
 
 
1233
  with gr.TabItem("Overall"):
 
 
 
 
 
 
1234
 
 
 
1235
  with gr.Row():
1236
- cross_mmlu_zero_shot_overall = gr.components.Dataframe(
1237
- CROSS_MMLU_ZERO_SHOT_OVERALL,
1238
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
1239
  type="pandas",
1240
  )
1241
 
1242
 
1243
- with gr.TabItem("Language Performance"):
1244
 
 
 
 
 
 
 
 
 
 
 
 
 
1245
  with gr.Row():
1246
- cross_mmlu_zero_shot_overall = gr.components.Dataframe(
1247
- CROSS_MMLU_ZERO_SHOT_LANGUAGE,
1248
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  type="pandas",
1250
  )
1251
 
1252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1253
  with gr.TabItem("five_shot"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1254
 
1255
 
 
 
1256
  with gr.TabItem("Overall"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1257
 
 
 
1258
  with gr.Row():
1259
- cross_mmlu_zero_shot_overall = gr.components.Dataframe(
1260
- CROSS_MMLU_FIVE_SHOT_OVERALL,
1261
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
1262
  type="pandas",
1263
  )
1264
 
1265
 
1266
- with gr.TabItem("Language Performance"):
1267
 
 
 
1268
  with gr.Row():
1269
  gr.components.Dataframe(
1270
- CROSS_MMLU_FIVE_SHOT_LANGUAGE,
1271
- datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
1272
  type="pandas",
1273
  )
1274
 
1275
 
1276
-
1277
- # dataset 2: cross-logiqa
1278
- with gr.TabItem("Cross-LogiQA"):
1279
  with gr.Row():
1280
  gr.Markdown("""
1281
- **Cross-LogiQA Leaderboard** 🔮
1282
 
1283
- - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
1284
- - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
1285
  """)
1286
 
1287
  with gr.TabItem("zero_shot"):
1288
-
1289
-
1290
  with gr.TabItem("Overall"):
1291
-
1292
  with gr.Row():
1293
  gr.components.Dataframe(
1294
- CROSS_LOGIQA_ZERO_SHOT_OVERALL,
1295
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
1296
  type="pandas",
1297
  )
1298
 
1299
 
1300
- with gr.TabItem("Language Performance"):
1301
 
 
 
1302
  with gr.Row():
1303
  gr.components.Dataframe(
1304
- CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
1305
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
1306
  type="pandas",
1307
  )
1308
 
 
 
 
 
 
 
 
 
 
1309
 
1310
- with gr.TabItem("five_shot"):
1311
-
1312
-
1313
  with gr.TabItem("Overall"):
1314
-
1315
  with gr.Row():
1316
  gr.components.Dataframe(
1317
- CROSS_LOGIQA_FIVE_SHOT_OVERALL,
1318
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
1319
  type="pandas",
1320
  )
1321
 
1322
 
1323
- with gr.TabItem("Language Performance"):
1324
 
 
 
1325
  with gr.Row():
1326
  gr.components.Dataframe(
1327
- CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
1328
- datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
1329
  type="pandas",
1330
  )
1331
 
1332
-
1333
- # dataset 3: SG_EVAL
1334
- with gr.TabItem("SG_EVAL"):
1335
  with gr.Row():
1336
  gr.Markdown("""
1337
- **SG_EVAL Leaderboard** 🔮
1338
 
1339
- - **Metric:** Accuracy
1340
- - **Languages:** English
1341
  """)
1342
 
1343
  with gr.TabItem("zero_shot"):
1344
  with gr.TabItem("Overall"):
1345
  with gr.Row():
1346
  gr.components.Dataframe(
1347
- SG_EVAL_ZERO_SHOT,
1348
- datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
1349
  type="pandas",
1350
  )
1351
 
 
 
1352
  with gr.TabItem("five_shot"):
1353
  with gr.TabItem("Overall"):
1354
  with gr.Row():
1355
  gr.components.Dataframe(
1356
- SG_EVAL_FIVE_SHOT,
1357
- datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
1358
  type="pandas",
1359
  )
1360
 
1361
-
1362
- # dataset 4:
1363
- with gr.TabItem("US_EVAL"):
1364
  with gr.Row():
1365
  gr.Markdown("""
1366
- **US_EVAL Leaderboard** 🔮
1367
 
1368
- - **Metric:** Accuracy
1369
- - **Languages:** English
1370
  """)
1371
 
1372
  with gr.TabItem("zero_shot"):
1373
  with gr.TabItem("Overall"):
1374
  with gr.Row():
1375
  gr.components.Dataframe(
1376
- US_EVAL_ZERO_SHOT,
1377
- datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
1378
  type="pandas",
1379
  )
1380
 
 
 
1381
  with gr.TabItem("five_shot"):
1382
  with gr.TabItem("Overall"):
1383
  with gr.Row():
1384
  gr.components.Dataframe(
1385
- US_EVAL_FIVE_SHOT,
1386
- datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
1387
  type="pandas",
1388
  )
1389
 
1390
 
1391
- # dataset 5:
1392
- with gr.TabItem("CN_EVAL"):
1393
  with gr.Row():
1394
  gr.Markdown("""
1395
- **CN_EVAL Leaderboard** 🔮
1396
 
1397
- - **Metric:** Accuracy
1398
- - **Languages:** Chinese
1399
  """)
1400
 
1401
  with gr.TabItem("zero_shot"):
1402
  with gr.TabItem("Overall"):
1403
  with gr.Row():
1404
  gr.components.Dataframe(
1405
- CN_EVAL_ZERO_SHOT,
1406
- datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
1407
  type="pandas",
1408
  )
1409
 
 
 
1410
  with gr.TabItem("five_shot"):
1411
  with gr.TabItem("Overall"):
1412
  with gr.Row():
1413
  gr.components.Dataframe(
1414
- CN_EVAL_FIVE_SHOT,
1415
- datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
1416
  type="pandas",
1417
  )
1418
 
1419
 
1420
-
1421
- # dataset 6:
1422
- with gr.TabItem("PH_EVAL"):
1423
  with gr.Row():
1424
  gr.Markdown("""
1425
- **PH_EVAL Leaderboard** 🔮
1426
 
1427
- - **Metric:** Accuracy
1428
- - **Languages:** English
1429
  """)
1430
 
1431
  with gr.TabItem("zero_shot"):
1432
  with gr.TabItem("Overall"):
1433
  with gr.Row():
1434
  gr.components.Dataframe(
1435
- PH_EVAL_ZERO_SHOT,
1436
- datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
1437
  type="pandas",
1438
  )
1439
 
 
 
1440
  with gr.TabItem("five_shot"):
1441
  with gr.TabItem("Overall"):
1442
  with gr.Row():
1443
  gr.components.Dataframe(
1444
- PH_EVAL_FIVE_SHOT,
1445
- datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
1446
  type="pandas",
1447
  )
1448
 
1449
-
1450
- # dataset 7:
1451
- with gr.TabItem("Singlish to English Translation"):
1452
  with gr.Row():
1453
  gr.Markdown("""
1454
- **SING2ENG Leaderboard** 🔮
1455
 
1456
- - **Metric:** BLEU Avg.
1457
- - **Languages:** English
1458
  """)
1459
 
1460
  with gr.TabItem("zero_shot"):
1461
  with gr.TabItem("Overall"):
1462
  with gr.Row():
1463
  gr.components.Dataframe(
1464
- SING2ENG_ZERO_SHOT,
1465
- datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
1466
  type="pandas",
1467
  )
1468
 
 
 
1469
  with gr.TabItem("five_shot"):
1470
  with gr.TabItem("Overall"):
1471
  with gr.Row():
1472
  gr.components.Dataframe(
1473
- SING2ENG_FIVE_SHOT,
1474
- datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
1475
  type="pandas",
1476
  )
1477
 
1478
 
1479
- # dataset 8:
1480
- with gr.TabItem("FLORES Indonesian to English Translation"):
1481
  with gr.Row():
1482
  gr.Markdown("""
1483
- **flores_ind2eng Leaderboard** 🔮
1484
 
1485
- - **Metric:** BLEU Avg.
1486
  - **Languages:** English
1487
  """)
1488
 
@@ -1490,28 +2724,29 @@ with block:
1490
  with gr.TabItem("Overall"):
1491
  with gr.Row():
1492
  gr.components.Dataframe(
1493
- FLORES_IND2ENG_ZERO_SHOT,
1494
- datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns),
1495
  type="pandas",
1496
  )
1497
 
 
 
1498
  with gr.TabItem("five_shot"):
1499
  with gr.TabItem("Overall"):
1500
  with gr.Row():
1501
  gr.components.Dataframe(
1502
- FLORES_IND2ENG_FIVE_SHOT,
1503
- datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns),
1504
  type="pandas",
1505
  )
1506
 
1507
-
1508
- # dataset 9:
1509
- with gr.TabItem("FLORES Vitenamese to English Translation"):
1510
  with gr.Row():
1511
  gr.Markdown("""
1512
- **flores_vie2eng Leaderboard** 🔮
1513
 
1514
- - **Metric:** BLEU Avg.
1515
  - **Languages:** English
1516
  """)
1517
 
@@ -1519,29 +2754,31 @@ with block:
1519
  with gr.TabItem("Overall"):
1520
  with gr.Row():
1521
  gr.components.Dataframe(
1522
- FLORES_VIE2ENG_ZERO_SHOT,
1523
- datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns),
1524
  type="pandas",
1525
  )
1526
 
 
 
1527
  with gr.TabItem("five_shot"):
1528
  with gr.TabItem("Overall"):
1529
  with gr.Row():
1530
  gr.components.Dataframe(
1531
- FLORES_VIE2ENG_FIVE_SHOT,
1532
- datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns),
1533
  type="pandas",
1534
  )
1535
 
1536
 
1537
 
1538
- # dataset 10:
1539
- with gr.TabItem("FLORES Chinese to English Translation"):
1540
  with gr.Row():
1541
  gr.Markdown("""
1542
- **flores_zho2eng Leaderboard** 🔮
1543
 
1544
- - **Metric:** BLEU Avg.
1545
  - **Languages:** English
1546
  """)
1547
 
@@ -1549,29 +2786,30 @@ with block:
1549
  with gr.TabItem("Overall"):
1550
  with gr.Row():
1551
  gr.components.Dataframe(
1552
- FLORES_ZHO2ENG_ZERO_SHOT,
1553
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns),
1554
  type="pandas",
1555
  )
1556
 
 
 
1557
  with gr.TabItem("five_shot"):
1558
  with gr.TabItem("Overall"):
1559
  with gr.Row():
1560
  gr.components.Dataframe(
1561
- FLORES_ZHO2ENG_FIVE_SHOT,
1562
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns),
1563
  type="pandas",
1564
  )
1565
 
1566
 
1567
-
1568
- # dataset 11:
1569
- with gr.TabItem("FLORES Malay to English Translation"):
1570
  with gr.Row():
1571
  gr.Markdown("""
1572
- **flores_zsm2eng Leaderboard** 🔮
1573
 
1574
- - **Metric:** BLEU Avg.
1575
  - **Languages:** English
1576
  """)
1577
 
@@ -1579,26 +2817,28 @@ with block:
1579
  with gr.TabItem("Overall"):
1580
  with gr.Row():
1581
  gr.components.Dataframe(
1582
- FLORES_ZSM2ENG_ZERO_SHOT,
1583
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns),
1584
  type="pandas",
1585
  )
1586
 
 
 
1587
  with gr.TabItem("five_shot"):
1588
  with gr.TabItem("Overall"):
1589
  with gr.Row():
1590
  gr.components.Dataframe(
1591
- FLORES_ZSM2ENG_FIVE_SHOT,
1592
- datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
1593
  type="pandas",
1594
  )
1595
 
1596
 
1597
- # dataset 12:
1598
- with gr.TabItem("MMLU"):
1599
  with gr.Row():
1600
  gr.Markdown("""
1601
- **MMLU Leaderboard** 🔮
1602
 
1603
  - **Metric:** Accuracy.
1604
  - **Languages:** English
@@ -1608,26 +2848,28 @@ with block:
1608
  with gr.TabItem("Overall"):
1609
  with gr.Row():
1610
  gr.components.Dataframe(
1611
- MMLU_ZERO_SHOT,
1612
- datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
1613
  type="pandas",
1614
  )
1615
 
 
 
1616
  with gr.TabItem("five_shot"):
1617
  with gr.TabItem("Overall"):
1618
  with gr.Row():
1619
  gr.components.Dataframe(
1620
- MMLU_FIVE_SHOT,
1621
- datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
1622
  type="pandas",
1623
  )
1624
 
1625
 
1626
- # dataset 13:
1627
- with gr.TabItem("MMLU Full"):
1628
  with gr.Row():
1629
  gr.Markdown("""
1630
- **MMLU Full Leaderboard** 🔮
1631
 
1632
  - **Metric:** Accuracy.
1633
  - **Languages:** English
@@ -1637,8 +2879,8 @@ with block:
1637
  with gr.TabItem("Overall"):
1638
  with gr.Row():
1639
  gr.components.Dataframe(
1640
- MMLU_FULL_ZERO_SHOT,
1641
- datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
1642
  type="pandas",
1643
  )
1644
 
@@ -1648,27 +2890,28 @@ with block:
1648
  with gr.TabItem("Overall"):
1649
  with gr.Row():
1650
  gr.components.Dataframe(
1651
- MMLU_FULL_FIVE_SHOT,
1652
- datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
1653
  type="pandas",
1654
  )
1655
 
1656
- # dataset 14:
1657
- with gr.TabItem("C_EVAL"):
 
1658
  with gr.Row():
1659
  gr.Markdown("""
1660
- **C_EVAL Leaderboard** 🔮
1661
 
1662
  - **Metric:** Accuracy.
1663
- - **Languages:** Chinese
1664
  """)
1665
 
1666
  with gr.TabItem("zero_shot"):
1667
  with gr.TabItem("Overall"):
1668
  with gr.Row():
1669
  gr.components.Dataframe(
1670
- C_EVAL_ZERO_SHOT,
1671
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
1672
  type="pandas",
1673
  )
1674
 
@@ -1678,28 +2921,28 @@ with block:
1678
  with gr.TabItem("Overall"):
1679
  with gr.Row():
1680
  gr.components.Dataframe(
1681
- C_EVAL_FIVE_SHOT,
1682
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
1683
  type="pandas",
1684
  )
1685
 
1686
 
1687
- # dataset 15:
1688
- with gr.TabItem("C_EVAL Full"):
1689
  with gr.Row():
1690
  gr.Markdown("""
1691
- **C_EVAL Full Leaderboard** 🔮
1692
 
1693
  - **Metric:** Accuracy.
1694
- - **Languages:** Chinese
1695
  """)
1696
 
1697
  with gr.TabItem("zero_shot"):
1698
  with gr.TabItem("Overall"):
1699
  with gr.Row():
1700
  gr.components.Dataframe(
1701
- C_EVAL_FULL_ZERO_SHOT,
1702
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
1703
  type="pandas",
1704
  )
1705
 
@@ -1709,27 +2952,28 @@ with block:
1709
  with gr.TabItem("Overall"):
1710
  with gr.Row():
1711
  gr.components.Dataframe(
1712
- C_EVAL_FULL_FIVE_SHOT,
1713
- datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
1714
  type="pandas",
1715
  )
1716
 
1717
- # dataset 16:
1718
- with gr.TabItem("CMMLU"):
 
1719
  with gr.Row():
1720
  gr.Markdown("""
1721
- **CMMLU Leaderboard** 🔮
1722
 
1723
  - **Metric:** Accuracy.
1724
- - **Languages:** Chinese
1725
  """)
1726
 
1727
  with gr.TabItem("zero_shot"):
1728
  with gr.TabItem("Overall"):
1729
  with gr.Row():
1730
  gr.components.Dataframe(
1731
- CMMLU_ZERO_SHOT,
1732
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
1733
  type="pandas",
1734
  )
1735
 
@@ -1739,27 +2983,28 @@ with block:
1739
  with gr.TabItem("Overall"):
1740
  with gr.Row():
1741
  gr.components.Dataframe(
1742
- CMMLU_FIVE_SHOT,
1743
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
1744
  type="pandas",
1745
  )
1746
 
1747
- # dataset 17:
1748
- with gr.TabItem("CMMLU Full"):
 
1749
  with gr.Row():
1750
  gr.Markdown("""
1751
- **CMMLU Full Leaderboard** 🔮
1752
 
1753
  - **Metric:** Accuracy.
1754
- - **Languages:** Chinese
1755
  """)
1756
 
1757
  with gr.TabItem("zero_shot"):
1758
  with gr.TabItem("Overall"):
1759
  with gr.Row():
1760
  gr.components.Dataframe(
1761
- CMMLU_FULL_ZERO_SHOT,
1762
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
1763
  type="pandas",
1764
  )
1765
 
@@ -1769,27 +3014,28 @@ with block:
1769
  with gr.TabItem("Overall"):
1770
  with gr.Row():
1771
  gr.components.Dataframe(
1772
- CMMLU_FULL_FIVE_SHOT,
1773
- datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
1774
  type="pandas",
1775
  )
1776
 
1777
- # dataset 18:
1778
- with gr.TabItem("ZBench"):
 
1779
  with gr.Row():
1780
  gr.Markdown("""
1781
- **ZBench Leaderboard** 🔮
1782
 
1783
  - **Metric:** Accuracy.
1784
- - **Languages:** Chinese
1785
  """)
1786
 
1787
  with gr.TabItem("zero_shot"):
1788
  with gr.TabItem("Overall"):
1789
  with gr.Row():
1790
  gr.components.Dataframe(
1791
- ZBENCH_ZERO_SHOT,
1792
- datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
1793
  type="pandas",
1794
  )
1795
 
@@ -1799,8 +3045,8 @@ with block:
1799
  with gr.TabItem("Overall"):
1800
  with gr.Row():
1801
  gr.components.Dataframe(
1802
- ZBENCH_FIVE_SHOT,
1803
- datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
1804
  type="pandas",
1805
  )
1806
 
@@ -1814,15 +3060,6 @@ with block:
1814
 
1815
 
1816
 
1817
-
1818
-
1819
-
1820
-
1821
-
1822
-
1823
-
1824
-
1825
-
1826
 
1827
  gr.Markdown(r"""
1828
 
 
1202
  ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot")
1203
  ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot")
1204
 
1205
+
1206
+
1207
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1208
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1209
+
1210
+
1211
def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``ind_emotion`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['ind_emotion']`` are collected and the
    median of their ``'accuracy'`` values becomes that model's score.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``
        (plus whatever ``add_rank`` contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'ind_emotion'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['ind_emotion']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort: an unusable result set is reported and scored -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1255
+
1256
+
1257
+ IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
1258
+ IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")
1259
+
1260
+
1261
+
1262
+
1263
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1264
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1265
+
1266
+
1267
def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``ocnli`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['ocnli']`` are collected and the median of
    their ``'accuracy'`` values becomes that model's score.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``
        (plus whatever ``add_rank`` contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'ocnli'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['ocnli']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort: an unusable result set is reported and scored -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1311
+
1312
+
1313
+ OCNLI_ZERO_SHOT = get_data_ocnli(eval_mode="zero_shot")
1314
+ OCNLI_FIVE_SHOT = get_data_ocnli(eval_mode="five_shot")
1315
+
1316
+
1317
+
1318
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1319
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1320
+
1321
+
1322
def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``c3`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['c3']`` are collected and the median of
    their ``'accuracy'`` values becomes that model's score.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``
        (plus whatever ``add_rank`` contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'c3'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['c3']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort: an unusable result set is reported and scored -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1366
+
1367
+
1368
+ C3_ZERO_SHOT = get_data_c3(eval_mode="zero_shot")
1369
+ C3_FIVE_SHOT = get_data_c3(eval_mode="five_shot")
1370
+
1371
+
1372
+
1373
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1374
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1375
+
1376
+
1377
def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``dream`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['dream']`` are collected and the median of
    their ``'accuracy'`` values becomes that model's score.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``
        (plus whatever ``add_rank`` contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'dream'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['dream']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort: an unusable result set is reported and scored -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1421
+
1422
+
1423
+ DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
1424
+ DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")
1425
+
1426
+
1427
+
1428
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1429
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1430
+
1431
+
1432
def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``samsum`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['samsum']`` are collected and the medians
    of their ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` values become that
    model's scores.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"``, ``"ROUGE-1"``,
        ``"ROUGE-2"`` and ``"ROUGE-L"`` (plus whatever ``add_rank``
        contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'samsum'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['samsum']
        results_list = [task_results[key] for key in task_results]

        try:
            rouge1 = median([results['rouge1'] for results in results_list])
            rouge2 = median([results['rouge2'] for results in results_list])
            rougeL = median([results['rougeL'] for results in results_list])
        except Exception:
            # Best-effort: if any of the three metrics cannot be computed, the
            # result set is reported and all three scores become -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            rouge1 = -1
            rouge2 = -1
            rougeL = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "ROUGE-1": rouge1,
            "ROUGE-2": rouge2,
            "ROUGE-L": rougeL,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1482
+
1483
+
1484
+ SAMSUM_ZERO_SHOT = get_data_samsum(eval_mode="zero_shot")
1485
+ SAMSUM_FIVE_SHOT = get_data_samsum(eval_mode="five_shot")
1486
+
1487
+
1488
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1489
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1490
+
1491
+
1492
def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``dialogsum`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['dialogsum']`` are collected and the
    medians of their ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` values become
    that model's scores.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"``, ``"ROUGE-1"``,
        ``"ROUGE-2"`` and ``"ROUGE-L"`` (plus whatever ``add_rank``
        contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'dialogsum'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['dialogsum']
        results_list = [task_results[key] for key in task_results]

        try:
            rouge1 = median([results['rouge1'] for results in results_list])
            rouge2 = median([results['rouge2'] for results in results_list])
            rougeL = median([results['rougeL'] for results in results_list])
        except Exception:
            # Best-effort: if any of the three metrics cannot be computed, the
            # result set is reported and all three scores become -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            rouge1 = -1
            rouge2 = -1
            rougeL = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "ROUGE-1": rouge1,
            "ROUGE-2": rouge2,
            "ROUGE-L": rougeL,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1542
+
1543
+
1544
+ DIALOGSUM_ZERO_SHOT = get_data_dialogsum(eval_mode="zero_shot")
1545
+ DIALOGSUM_FIVE_SHOT = get_data_dialogsum(eval_mode="five_shot")
1546
+
1547
+
1548
+
1549
+
1550
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1551
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1552
+
1553
+
1554
def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``sst2`` dataset.

    For every model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['sst2']`` are collected and the median of
    their ``'accuracy'`` values becomes that model's score.

    Args:
        eval_mode: Key selecting the evaluation setting inside
            ``ALL_RESULTS[model]`` (e.g. ``"zero_shot"`` or ``"five_shot"``).
        fillna: When true, NaN cells in the final frame are replaced by ``""``.
        rank: When true, the frame is passed through
            ``add_rank(df, compute_average=True)`` before being returned.

    Returns:
        A ``pandas.DataFrame`` with one row per distinct ``"Model"`` value and
        the columns ``"Model"``, ``"Model Size (Params)"`` and ``"Accuracy"``
        (plus whatever ``add_rank`` contributes when ``rank`` is true).

    Raises:
        KeyError: If a model has no ``eval_mode`` / ``'sst2'`` /
            ``"model_link"`` entry in ``ALL_RESULTS``; those lookups are made
            outside the ``try`` block.
    """
    rows = []

    for model in MODEL_LIST:
        # Look the per-task mapping up once instead of twice per model.
        task_results = ALL_RESULTS[model][eval_mode]['sst2']
        results_list = [task_results[key] for key in task_results]

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best-effort: an unusable result set is reported and scored -1.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Collapse rows that share the same "Model" value, keeping the first
    # non-null entry of every column for each group.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1598
+
1599
+
1600
# Build the sst2 tables for both evaluation modes (zero-shot first, then five-shot).
SST2_ZERO_SHOT, SST2_FIVE_SHOT = (
    get_data_sst2(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1602
+
1603
+
1604
+
1605
+
1606
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1607
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1608
+
1609
+
1610
def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``cola`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['cola']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['cola'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1654
+
1655
+
1656
# Build the cola tables for both evaluation modes (zero-shot first, then five-shot).
COLA_ZERO_SHOT, COLA_FIVE_SHOT = (
    get_data_cola(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1658
+
1659
+
1660
+
1661
+
1662
+
1663
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1664
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1665
+
1666
+
1667
def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``qqp`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['qqp']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['qqp'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1711
+
1712
+
1713
# Build the qqp tables for both evaluation modes (zero-shot first, then five-shot).
QQP_ZERO_SHOT, QQP_FIVE_SHOT = (
    get_data_qqp(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1715
+
1716
+
1717
+
1718
+
1719
+
1720
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1721
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1722
+
1723
+
1724
def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``mnli`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['mnli']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['mnli'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1768
+
1769
+
1770
# Build the mnli tables for both evaluation modes (zero-shot first, then five-shot).
MNLI_ZERO_SHOT, MNLI_FIVE_SHOT = (
    get_data_mnli(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1772
+
1773
+
1774
+
1775
+
1776
+
1777
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1778
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1779
+
1780
+
1781
def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``qnli`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['qnli']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['qnli'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1825
+
1826
+
1827
# Build the qnli tables for both evaluation modes (zero-shot first, then five-shot).
QNLI_ZERO_SHOT, QNLI_FIVE_SHOT = (
    get_data_qnli(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1829
+
1830
+
1831
+
1832
+
1833
+
1834
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1835
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1836
+
1837
+
1838
def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``wnli`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['wnli']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['wnli'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1882
+
1883
+
1884
# Build the wnli tables for both evaluation modes (zero-shot first, then five-shot).
WNLI_ZERO_SHOT, WNLI_FIVE_SHOT = (
    get_data_wnli(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1886
+
1887
+
1888
+
1889
+
1890
+
1891
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1892
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1893
+
1894
+
1895
def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``rte`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['rte']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['rte'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1939
+
1940
+
1941
# Build the rte tables for both evaluation modes (zero-shot first, then five-shot).
RTE_ZERO_SHOT, RTE_FIVE_SHOT = (
    get_data_rte(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
1943
+
1944
+
1945
+
1946
+
1947
+
1948
+
1949
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1950
+ # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
1951
+
1952
+
1953
def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the ``mrpc`` results.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['mrpc']`` are reduced with
    ``median`` to a single score.

    Args:
        eval_mode: Key selecting the evaluation setting inside ``ALL_RESULTS``
            (this module calls it with ``"zero_shot"`` and ``"five_shot"``).
        fillna: If true, missing values in the final table are replaced by ``""``.
        rank: If true, the table is passed through
            ``add_rank(df, compute_average=True)``.

    Returns:
        The assembled table with one row per distinct "Model" value and the
        "Model" column first (before ``add_rank`` is applied, if requested).
    """
    rows = []

    for model in MODEL_LIST:
        results_list = list(ALL_RESULTS[model][eval_mode]['mrpc'].values())

        try:
            accuracy = median([results['accuracy'] for results in results_list])
        except Exception:
            # Best effort: a model whose results cannot be aggregated is logged
            # and given the sentinel -1 instead of aborting the whole table.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(results_list)
            accuracy = -1

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows)
    # Merge rows that share the same "Model" value, keeping the first
    # non-null entry of every column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df
1997
+
1998
+
1999
# Build the mrpc tables for both evaluation modes (zero-shot first, then five-shot).
MRPC_ZERO_SHOT, MRPC_FIVE_SHOT = (
    get_data_mrpc(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)
2001
+
2002
+
2003
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2004
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
2005
 
2006
+ block = gr.Blocks()
2007
+ with block:
2008
+
2009
+ gr.Markdown(f"""
2010
+ SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>. Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
2011
+
2012
+ - **Total Datasets**: 31
2013
+ - **Total Languages**: 8
2014
+ - **Total Models**: {NUM_MODELS}
2015
+ - **Mode of Evaluation**: Zero-Shot, Five-Shot
2016
+
2017
+ The following table shows the performance of the models on the SeaEval benchmark.
2018
+
2019
+ """)
2020
+
2021
+ with gr.Tabs():
2022
+
2023
+
2024
+ # dataset 1: cross-mmlu
2025
+ with gr.TabItem("Cross-MMLU"):
2026
+ with gr.Row():
2027
+ gr.Markdown("""
2028
+ **Cross-MMLU Leaderboard** 🔮
2029
+
2030
+ - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2031
+ - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2032
+ """)
2033
+
2034
+ with gr.TabItem("zero_shot"):
2035
+
2036
+
2037
+ with gr.TabItem("Overall"):
2038
+
2039
+ with gr.Row():
2040
+ cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2041
+ CROSS_MMLU_ZERO_SHOT_OVERALL,
2042
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
2043
+ type="pandas",
2044
+ )
2045
+
2046
+
2047
+ with gr.TabItem("Language Performance"):
2048
+
2049
+ with gr.Row():
2050
+ cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2051
+ CROSS_MMLU_ZERO_SHOT_LANGUAGE,
2052
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
2053
+ type="pandas",
2054
+ )
2055
+
2056
+
2057
+ with gr.TabItem("five_shot"):
2058
+
2059
+
2060
+ with gr.TabItem("Overall"):
2061
+
2062
+ with gr.Row():
2063
+ cross_mmlu_zero_shot_overall = gr.components.Dataframe(
2064
+ CROSS_MMLU_FIVE_SHOT_OVERALL,
2065
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
2066
+ type="pandas",
2067
+ )
2068
+
2069
+
2070
+ with gr.TabItem("Language Performance"):
2071
+
2072
+ with gr.Row():
2073
+ gr.components.Dataframe(
2074
+ CROSS_MMLU_FIVE_SHOT_LANGUAGE,
2075
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
2076
+ type="pandas",
2077
+ )
2078
+
2079
+
2080
+
2081
+ # dataset 2: cross-logiqa
2082
+ with gr.TabItem("Cross-LogiQA"):
2083
+ with gr.Row():
2084
+ gr.Markdown("""
2085
+ **Cross-LogiQA Leaderboard** 🔮
2086
+
2087
+ - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
2088
+ - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
2089
+ """)
2090
+
2091
+ with gr.TabItem("zero_shot"):
2092
+
2093
+
2094
+ with gr.TabItem("Overall"):
2095
+
2096
+ with gr.Row():
2097
+ gr.components.Dataframe(
2098
+ CROSS_LOGIQA_ZERO_SHOT_OVERALL,
2099
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
2100
+ type="pandas",
2101
+ )
2102
+
2103
+
2104
+ with gr.TabItem("Language Performance"):
2105
+
2106
+ with gr.Row():
2107
+ gr.components.Dataframe(
2108
+ CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
2109
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
2110
+ type="pandas",
2111
+ )
2112
+
2113
+
2114
+ with gr.TabItem("five_shot"):
2115
+
2116
+
2117
+ with gr.TabItem("Overall"):
2118
+
2119
+ with gr.Row():
2120
+ gr.components.Dataframe(
2121
+ CROSS_LOGIQA_FIVE_SHOT_OVERALL,
2122
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
2123
+ type="pandas",
2124
+ )
2125
+
2126
+
2127
+ with gr.TabItem("Language Performance"):
2128
+
2129
+ with gr.Row():
2130
+ gr.components.Dataframe(
2131
+ CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
2132
+ datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
2133
+ type="pandas",
2134
+ )
2135
+
2136
+
2137
+ # dataset 3: SG_EVAL
2138
+ with gr.TabItem("SG_EVAL"):
2139
+ with gr.Row():
2140
+ gr.Markdown("""
2141
+ **SG_EVAL Leaderboard** 🔮
2142
+
2143
+ - **Metric:** Accuracy
2144
+ - **Languages:** English
2145
+ """)
2146
+
2147
+ with gr.TabItem("zero_shot"):
2148
+ with gr.TabItem("Overall"):
2149
+ with gr.Row():
2150
+ gr.components.Dataframe(
2151
+ SG_EVAL_ZERO_SHOT,
2152
+ datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
2153
+ type="pandas",
2154
+ )
2155
+
2156
+ with gr.TabItem("five_shot"):
2157
+ with gr.TabItem("Overall"):
2158
+ with gr.Row():
2159
+ gr.components.Dataframe(
2160
+ SG_EVAL_FIVE_SHOT,
2161
+ datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
2162
+ type="pandas",
2163
+ )
2164
+
2165
+
2166
+ # dataset 4:
2167
+ with gr.TabItem("US_EVAL"):
2168
+ with gr.Row():
2169
+ gr.Markdown("""
2170
+ **US_EVAL Leaderboard** 🔮
2171
+
2172
+ - **Metric:** Accuracy
2173
+ - **Languages:** English
2174
+ """)
2175
+
2176
+ with gr.TabItem("zero_shot"):
2177
+ with gr.TabItem("Overall"):
2178
+ with gr.Row():
2179
+ gr.components.Dataframe(
2180
+ US_EVAL_ZERO_SHOT,
2181
+ datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
2182
+ type="pandas",
2183
+ )
2184
+
2185
+ with gr.TabItem("five_shot"):
2186
+ with gr.TabItem("Overall"):
2187
+ with gr.Row():
2188
+ gr.components.Dataframe(
2189
+ US_EVAL_FIVE_SHOT,
2190
+ datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
2191
+ type="pandas",
2192
+ )
2193
+
2194
+
2195
+ # dataset 5:
2196
+ with gr.TabItem("CN_EVAL"):
2197
+ with gr.Row():
2198
+ gr.Markdown("""
2199
+ **CN_EVAL Leaderboard** 🔮
2200
+
2201
+ - **Metric:** Accuracy
2202
+ - **Languages:** Chinese
2203
+ """)
2204
+
2205
+ with gr.TabItem("zero_shot"):
2206
+ with gr.TabItem("Overall"):
2207
+ with gr.Row():
2208
+ gr.components.Dataframe(
2209
+ CN_EVAL_ZERO_SHOT,
2210
+ datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
2211
+ type="pandas",
2212
+ )
2213
+
2214
+ with gr.TabItem("five_shot"):
2215
+ with gr.TabItem("Overall"):
2216
+ with gr.Row():
2217
+ gr.components.Dataframe(
2218
+ CN_EVAL_FIVE_SHOT,
2219
+ datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
2220
+ type="pandas",
2221
+ )
2222
+
2223
 
2224
+
2225
+ # dataset 6:
2226
+ with gr.TabItem("PH_EVAL"):
2227
+ with gr.Row():
2228
+ gr.Markdown("""
2229
+ **PH_EVAL Leaderboard** 🔮
2230
+
2231
+ - **Metric:** Accuracy
2232
+ - **Languages:** English
2233
+ """)
2234
+
2235
+ with gr.TabItem("zero_shot"):
2236
+ with gr.TabItem("Overall"):
2237
+ with gr.Row():
2238
+ gr.components.Dataframe(
2239
+ PH_EVAL_ZERO_SHOT,
2240
+ datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
2241
+ type="pandas",
2242
+ )
2243
+
2244
+ with gr.TabItem("five_shot"):
2245
+ with gr.TabItem("Overall"):
2246
+ with gr.Row():
2247
+ gr.components.Dataframe(
2248
+ PH_EVAL_FIVE_SHOT,
2249
+ datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
2250
+ type="pandas",
2251
+ )
2252
+
2253
+
2254
+ # dataset 7:
2255
+ with gr.TabItem("Singlish to English Translation"):
2256
+ with gr.Row():
2257
+ gr.Markdown("""
2258
+ **SING2ENG Leaderboard** 🔮
2259
+
2260
+ - **Metric:** BLEU Avg.
2261
+ - **Languages:** English
2262
+ """)
2263
+
2264
+ with gr.TabItem("zero_shot"):
2265
+ with gr.TabItem("Overall"):
2266
+ with gr.Row():
2267
+ gr.components.Dataframe(
2268
+ SING2ENG_ZERO_SHOT,
2269
+ datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
2270
+ type="pandas",
2271
+ )
2272
+
2273
+ with gr.TabItem("five_shot"):
2274
+ with gr.TabItem("Overall"):
2275
+ with gr.Row():
2276
+ gr.components.Dataframe(
2277
+ SING2ENG_FIVE_SHOT,
2278
+ datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
2279
+ type="pandas",
2280
+ )
2281
+
2282
+
2283
+ gr.Markdown(f"""
2284
+ The following are datasets that are not originally collected by SeaEval, but are included in the leaderboard for completeness.
2285
  """)
2286
+
2287
  with gr.Tabs():
2288
 
2289
 
2290
+
2291
+
2292
+ # dataset 8:
2293
+ with gr.TabItem("FLORES Indonesian to English Translation"):
2294
  with gr.Row():
2295
  gr.Markdown("""
2296
+ **flores_ind2eng Leaderboard** 🔮
2297
 
2298
+ - **Metric:** BLEU Avg.
2299
+ - **Languages:** English
2300
  """)
2301
 
2302
  with gr.TabItem("zero_shot"):
2303
+ with gr.TabItem("Overall"):
2304
+ with gr.Row():
2305
+ gr.components.Dataframe(
2306
+ FLORES_IND2ENG_ZERO_SHOT,
2307
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns),
2308
+ type="pandas",
2309
+ )
2310
+
2311
+ with gr.TabItem("five_shot"):
2312
+ with gr.TabItem("Overall"):
2313
+ with gr.Row():
2314
+ gr.components.Dataframe(
2315
+ FLORES_IND2ENG_FIVE_SHOT,
2316
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns),
2317
+ type="pandas",
2318
+ )
2319
+
2320
 
2321
+ # dataset 9:
2322
+ with gr.TabItem("FLORES Vitenamese to English Translation"):
2323
+ with gr.Row():
2324
+ gr.Markdown("""
2325
+ **flores_vie2eng Leaderboard** 🔮
2326
+
2327
+ - **Metric:** BLEU Avg.
2328
+ - **Languages:** English
2329
+ """)
2330
 
2331
+ with gr.TabItem("zero_shot"):
2332
  with gr.TabItem("Overall"):
2333
+ with gr.Row():
2334
+ gr.components.Dataframe(
2335
+ FLORES_VIE2ENG_ZERO_SHOT,
2336
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns),
2337
+ type="pandas",
2338
+ )
2339
 
2340
+ with gr.TabItem("five_shot"):
2341
+ with gr.TabItem("Overall"):
2342
  with gr.Row():
2343
+ gr.components.Dataframe(
2344
+ FLORES_VIE2ENG_FIVE_SHOT,
2345
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns),
2346
  type="pandas",
2347
  )
2348
 
2349
 
 
2350
 
2351
+ # dataset 10:
2352
+ with gr.TabItem("FLORES Chinese to English Translation"):
2353
+ with gr.Row():
2354
+ gr.Markdown("""
2355
+ **flores_zho2eng Leaderboard** 🔮
2356
+
2357
+ - **Metric:** BLEU Avg.
2358
+ - **Languages:** English
2359
+ """)
2360
+
2361
+ with gr.TabItem("zero_shot"):
2362
+ with gr.TabItem("Overall"):
2363
  with gr.Row():
2364
+ gr.components.Dataframe(
2365
+ FLORES_ZHO2ENG_ZERO_SHOT,
2366
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns),
2367
+ type="pandas",
2368
+ )
2369
+
2370
+ with gr.TabItem("five_shot"):
2371
+ with gr.TabItem("Overall"):
2372
+ with gr.Row():
2373
+ gr.components.Dataframe(
2374
+ FLORES_ZHO2ENG_FIVE_SHOT,
2375
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns),
2376
+ type="pandas",
2377
+ )
2378
+
2379
+
2380
+
2381
+ # dataset 11:
2382
+ with gr.TabItem("FLORES Malay to English Translation"):
2383
+ with gr.Row():
2384
+ gr.Markdown("""
2385
+ **flores_zsm2eng Leaderboard** 🔮
2386
+
2387
+ - **Metric:** BLEU Avg.
2388
+ - **Languages:** English
2389
+ """)
2390
+
2391
+ with gr.TabItem("zero_shot"):
2392
+ with gr.TabItem("Overall"):
2393
+ with gr.Row():
2394
+ gr.components.Dataframe(
2395
+ FLORES_ZSM2ENG_ZERO_SHOT,
2396
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns),
2397
+ type="pandas",
2398
+ )
2399
+
2400
+ with gr.TabItem("five_shot"):
2401
+ with gr.TabItem("Overall"):
2402
+ with gr.Row():
2403
+ gr.components.Dataframe(
2404
+ FLORES_ZSM2ENG_FIVE_SHOT,
2405
+ datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
2406
  type="pandas",
2407
  )
2408
 
2409
 
2410
+ # dataset 12:
2411
+ with gr.TabItem("MMLU"):
2412
+ with gr.Row():
2413
+ gr.Markdown("""
2414
+ **MMLU Leaderboard** 🔮
2415
+
2416
+ - **Metric:** Accuracy.
2417
+ - **Languages:** English
2418
+ """)
2419
+
2420
+ with gr.TabItem("zero_shot"):
2421
+ with gr.TabItem("Overall"):
2422
+ with gr.Row():
2423
+ gr.components.Dataframe(
2424
+ MMLU_ZERO_SHOT,
2425
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_ZERO_SHOT.columns),
2426
+ type="pandas",
2427
+ )
2428
+
2429
  with gr.TabItem("five_shot"):
2430
+ with gr.TabItem("Overall"):
2431
+ with gr.Row():
2432
+ gr.components.Dataframe(
2433
+ MMLU_FIVE_SHOT,
2434
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FIVE_SHOT.columns),
2435
+ type="pandas",
2436
+ )
2437
+
2438
+
2439
+ # dataset 13:
2440
+ with gr.TabItem("MMLU Full"):
2441
+ with gr.Row():
2442
+ gr.Markdown("""
2443
+ **MMLU Full Leaderboard** 🔮
2444
+
2445
+ - **Metric:** Accuracy.
2446
+ - **Languages:** English
2447
+ """)
2448
+
2449
+ with gr.TabItem("zero_shot"):
2450
+ with gr.TabItem("Overall"):
2451
+ with gr.Row():
2452
+ gr.components.Dataframe(
2453
+ MMLU_FULL_ZERO_SHOT,
2454
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_ZERO_SHOT.columns),
2455
+ type="pandas",
2456
+ )
2457
 
2458
 
2459
+
2460
+ with gr.TabItem("five_shot"):
2461
  with gr.TabItem("Overall"):
2462
+ with gr.Row():
2463
+ gr.components.Dataframe(
2464
+ MMLU_FULL_FIVE_SHOT,
2465
+ datatype=["number", "markdown"] + ["number"] * len(MMLU_FULL_FIVE_SHOT.columns),
2466
+ type="pandas",
2467
+ )
2468
+
2469
+ # dataset 14:
2470
+ with gr.TabItem("C_EVAL"):
2471
+ with gr.Row():
2472
+ gr.Markdown("""
2473
+ **C_EVAL Leaderboard** 🔮
2474
+
2475
+ - **Metric:** Accuracy.
2476
+ - **Languages:** Chinese
2477
+ """)
2478
 
2479
+ with gr.TabItem("zero_shot"):
2480
+ with gr.TabItem("Overall"):
2481
  with gr.Row():
2482
+ gr.components.Dataframe(
2483
+ C_EVAL_ZERO_SHOT,
2484
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_ZERO_SHOT.columns),
2485
  type="pandas",
2486
  )
2487
 
2488
 
 
2489
 
2490
+ with gr.TabItem("five_shot"):
2491
+ with gr.TabItem("Overall"):
2492
  with gr.Row():
2493
  gr.components.Dataframe(
2494
+ C_EVAL_FIVE_SHOT,
2495
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FIVE_SHOT.columns),
2496
  type="pandas",
2497
  )
2498
 
2499
 
2500
+ # dataset 15:
2501
+ with gr.TabItem("C_EVAL Full"):
 
2502
  with gr.Row():
2503
  gr.Markdown("""
2504
+ **C_EVAL Full Leaderboard** 🔮
2505
 
2506
+ - **Metric:** Accuracy.
2507
+ - **Languages:** Chinese
2508
  """)
2509
 
2510
  with gr.TabItem("zero_shot"):
 
 
2511
  with gr.TabItem("Overall"):
 
2512
  with gr.Row():
2513
  gr.components.Dataframe(
2514
+ C_EVAL_FULL_ZERO_SHOT,
2515
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_ZERO_SHOT.columns),
2516
  type="pandas",
2517
  )
2518
 
2519
 
 
2520
 
2521
+ with gr.TabItem("five_shot"):
2522
+ with gr.TabItem("Overall"):
2523
  with gr.Row():
2524
  gr.components.Dataframe(
2525
+ C_EVAL_FULL_FIVE_SHOT,
2526
+ datatype=["number", "markdown"] + ["number"] * len(C_EVAL_FULL_FIVE_SHOT.columns),
2527
  type="pandas",
2528
  )
2529
 
2530
+ # dataset 16:
2531
+ with gr.TabItem("CMMLU"):
2532
+ with gr.Row():
2533
+ gr.Markdown("""
2534
+ **CMMLU Leaderboard** 🔮
2535
+
2536
+ - **Metric:** Accuracy.
2537
+ - **Languages:** Chinese
2538
+ """)
2539
 
2540
+ with gr.TabItem("zero_shot"):
 
 
2541
  with gr.TabItem("Overall"):
 
2542
  with gr.Row():
2543
  gr.components.Dataframe(
2544
+ CMMLU_ZERO_SHOT,
2545
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_ZERO_SHOT.columns),
2546
  type="pandas",
2547
  )
2548
 
2549
 
 
2550
 
2551
+ with gr.TabItem("five_shot"):
2552
+ with gr.TabItem("Overall"):
2553
  with gr.Row():
2554
  gr.components.Dataframe(
2555
+ CMMLU_FIVE_SHOT,
2556
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FIVE_SHOT.columns),
2557
  type="pandas",
2558
  )
2559
 
2560
+ # dataset 17:
2561
+ with gr.TabItem("CMMLU Full"):
 
2562
  with gr.Row():
2563
  gr.Markdown("""
2564
+ **CMMLU Full Leaderboard** 🔮
2565
 
2566
+ - **Metric:** Accuracy.
2567
+ - **Languages:** Chinese
2568
  """)
2569
 
2570
  with gr.TabItem("zero_shot"):
2571
  with gr.TabItem("Overall"):
2572
  with gr.Row():
2573
  gr.components.Dataframe(
2574
+ CMMLU_FULL_ZERO_SHOT,
2575
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_ZERO_SHOT.columns),
2576
  type="pandas",
2577
  )
2578
 
2579
+
2580
+
2581
  with gr.TabItem("five_shot"):
2582
  with gr.TabItem("Overall"):
2583
  with gr.Row():
2584
  gr.components.Dataframe(
2585
+ CMMLU_FULL_FIVE_SHOT,
2586
+ datatype=["number", "markdown"] + ["number"] * len(CMMLU_FULL_FIVE_SHOT.columns),
2587
  type="pandas",
2588
  )
2589
 
2590
+ # dataset 18:
2591
+ with gr.TabItem("ZBench"):
 
2592
  with gr.Row():
2593
  gr.Markdown("""
2594
+ **ZBench Leaderboard** 🔮
2595
 
2596
+ - **Metric:** Accuracy.
2597
+ - **Languages:** Chinese
2598
  """)
2599
 
2600
  with gr.TabItem("zero_shot"):
2601
  with gr.TabItem("Overall"):
2602
  with gr.Row():
2603
  gr.components.Dataframe(
2604
+ ZBENCH_ZERO_SHOT,
2605
+ datatype=["number", "markdown"] + ["number"] * len(ZBENCH_ZERO_SHOT.columns),
2606
  type="pandas",
2607
  )
2608
 
2609
+
2610
+
2611
  with gr.TabItem("five_shot"):
2612
  with gr.TabItem("Overall"):
2613
  with gr.Row():
2614
  gr.components.Dataframe(
2615
+ ZBENCH_FIVE_SHOT,
2616
+ datatype=["number", "markdown"] + ["number"] * len(ZBENCH_FIVE_SHOT.columns),
2617
  type="pandas",
2618
  )
2619
 
2620
 
2621
+ # dataset 19:
2622
+ with gr.TabItem("ind_emotion"):
2623
  with gr.Row():
2624
  gr.Markdown("""
2625
+ **ind_emotion Leaderboard** 🔮
2626
 
2627
+ - **Metric:** Accuracy.
2628
+ - **Languages:** Indonesian
2629
  """)
2630
 
2631
  with gr.TabItem("zero_shot"):
2632
  with gr.TabItem("Overall"):
2633
  with gr.Row():
2634
  gr.components.Dataframe(
2635
+ IND_EMOTION_ZERO_SHOT,
2636
+ datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_ZERO_SHOT.columns),
2637
  type="pandas",
2638
  )
2639
 
2640
+
2641
+
2642
  with gr.TabItem("five_shot"):
2643
  with gr.TabItem("Overall"):
2644
  with gr.Row():
2645
  gr.components.Dataframe(
2646
+ IND_EMOTION_ZERO_SHOT,
2647
+ datatype=["number", "markdown"] + ["number"] * len(IND_EMOTION_ZERO_SHOT.columns),
2648
  type="pandas",
2649
  )
2650
 
2651
 
2652
+ # dataset
2653
+ with gr.TabItem("OCNLI"):
 
2654
  with gr.Row():
2655
  gr.Markdown("""
2656
+ **OCNLI Leaderboard** 🔮
2657
 
2658
+ - **Metric:** Accuracy.
2659
+ - **Languages:** Chinese
2660
  """)
2661
 
2662
  with gr.TabItem("zero_shot"):
2663
  with gr.TabItem("Overall"):
2664
  with gr.Row():
2665
  gr.components.Dataframe(
2666
+ OCNLI_ZERO_SHOT,
2667
+ datatype=["number", "markdown"] + ["number"] * len(OCNLI_ZERO_SHOT.columns),
2668
  type="pandas",
2669
  )
2670
 
2671
+
2672
+
2673
  with gr.TabItem("five_shot"):
2674
  with gr.TabItem("Overall"):
2675
  with gr.Row():
2676
  gr.components.Dataframe(
2677
+ OCNLI_FIVE_SHOT,
2678
+ datatype=["number", "markdown"] + ["number"] * len(OCNLI_FIVE_SHOT.columns),
2679
  type="pandas",
2680
  )
2681
 
2682
+ # dataset
2683
+ with gr.TabItem("C3"):
 
2684
  with gr.Row():
2685
  gr.Markdown("""
2686
+ **C3 Leaderboard** 🔮
2687
 
2688
+ - **Metric:** Accuracy.
2689
+ - **Languages:** Chinese
2690
  """)
2691
 
2692
  with gr.TabItem("zero_shot"):
2693
  with gr.TabItem("Overall"):
2694
  with gr.Row():
2695
  gr.components.Dataframe(
2696
+ C3_ZERO_SHOT,
2697
+ datatype=["number", "markdown"] + ["number"] * len(C3_ZERO_SHOT.columns),
2698
  type="pandas",
2699
  )
2700
 
2701
+
2702
+
2703
  with gr.TabItem("five_shot"):
2704
  with gr.TabItem("Overall"):
2705
  with gr.Row():
2706
  gr.components.Dataframe(
2707
+ C3_FIVE_SHOT,
2708
+ datatype=["number", "markdown"] + ["number"] * len(C3_FIVE_SHOT.columns),
2709
  type="pandas",
2710
  )
2711
 
2712
 
2713
+ # dataset
2714
+ with gr.TabItem("DREAM"):
2715
  with gr.Row():
2716
  gr.Markdown("""
2717
+ **DREAM Leaderboard** 🔮
2718
 
2719
+ - **Metric:** Accuracy.
2720
  - **Languages:** English
2721
  """)
2722
 
 
2724
  with gr.TabItem("Overall"):
2725
  with gr.Row():
2726
  gr.components.Dataframe(
2727
+ DREAM_ZERO_SHOT,
2728
+ datatype=["number", "markdown"] + ["number"] * len(DREAM_ZERO_SHOT.columns),
2729
  type="pandas",
2730
  )
2731
 
2732
+
2733
+
2734
  with gr.TabItem("five_shot"):
2735
  with gr.TabItem("Overall"):
2736
  with gr.Row():
2737
  gr.components.Dataframe(
2738
+ DREAM_FIVE_SHOT,
2739
+ datatype=["number", "markdown"] + ["number"] * len(DREAM_FIVE_SHOT.columns),
2740
  type="pandas",
2741
  )
2742
 
2743
+ # dataset
2744
+ with gr.TabItem("SAMSum"):
 
2745
  with gr.Row():
2746
  gr.Markdown("""
2747
+ **SAMSum Leaderboard** 🔮
2748
 
2749
+ - **Metric:** ROUGE.
2750
  - **Languages:** English
2751
  """)
2752
 
 
2754
  with gr.TabItem("Overall"):
2755
  with gr.Row():
2756
  gr.components.Dataframe(
2757
+ SAMSUM_ZERO_SHOT,
2758
+ datatype=["number", "markdown"] + ["number"] * len(SAMSUM_ZERO_SHOT.columns),
2759
  type="pandas",
2760
  )
2761
 
2762
+
2763
+
2764
  with gr.TabItem("five_shot"):
2765
  with gr.TabItem("Overall"):
2766
  with gr.Row():
2767
  gr.components.Dataframe(
2768
+ SAMSUM_FIVE_SHOT,
2769
+ datatype=["number", "markdown"] + ["number"] * len(SAMSUM_FIVE_SHOT.columns),
2770
  type="pandas",
2771
  )
2772
 
2773
 
2774
 
2775
+ # dataset
2776
+ with gr.TabItem("DialogSum"):
2777
  with gr.Row():
2778
  gr.Markdown("""
2779
+ **DialogSum Leaderboard** 🔮
2780
 
2781
+ - **Metric:** ROUGE.
2782
  - **Languages:** English
2783
  """)
2784
 
 
2786
  with gr.TabItem("Overall"):
2787
  with gr.Row():
2788
  gr.components.Dataframe(
2789
+ DIALOGSUM_ZERO_SHOT,
2790
+ datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_ZERO_SHOT.columns),
2791
  type="pandas",
2792
  )
2793
 
2794
+
2795
+
2796
  with gr.TabItem("five_shot"):
2797
  with gr.TabItem("Overall"):
2798
  with gr.Row():
2799
  gr.components.Dataframe(
2800
+ DIALOGSUM_FIVE_SHOT,
2801
+ datatype=["number", "markdown"] + ["number"] * len(DIALOGSUM_FIVE_SHOT.columns),
2802
  type="pandas",
2803
  )
2804
 
2805
 
2806
+ # dataset
2807
+ with gr.TabItem("SST2"):
 
2808
  with gr.Row():
2809
  gr.Markdown("""
2810
+ **SST2 Leaderboard** 🔮
2811
 
2812
+ - **Metric:** Accuracy.
2813
  - **Languages:** English
2814
  """)
2815
 
 
2817
  with gr.TabItem("Overall"):
2818
  with gr.Row():
2819
  gr.components.Dataframe(
2820
+ SST2_ZERO_SHOT,
2821
+ datatype=["number", "markdown"] + ["number"] * len(SST2_ZERO_SHOT.columns),
2822
  type="pandas",
2823
  )
2824
 
2825
+
2826
+
2827
  with gr.TabItem("five_shot"):
2828
  with gr.TabItem("Overall"):
2829
  with gr.Row():
2830
  gr.components.Dataframe(
2831
+ SST2_FIVE_SHOT,
2832
+ datatype=["number", "markdown"] + ["number"] * len(SST2_FIVE_SHOT.columns),
2833
  type="pandas",
2834
  )
2835
 
2836
 
2837
+ # dataset
2838
+ with gr.TabItem("COLA"):
2839
  with gr.Row():
2840
  gr.Markdown("""
2841
+ **COLA Leaderboard** 🔮
2842
 
2843
  - **Metric:** Accuracy.
2844
  - **Languages:** English
 
2848
  with gr.TabItem("Overall"):
2849
  with gr.Row():
2850
  gr.components.Dataframe(
2851
+ COLA_ZERO_SHOT,
2852
+ datatype=["number", "markdown"] + ["number"] * len(COLA_ZERO_SHOT.columns),
2853
  type="pandas",
2854
  )
2855
 
2856
+
2857
+
2858
  with gr.TabItem("five_shot"):
2859
  with gr.TabItem("Overall"):
2860
  with gr.Row():
2861
  gr.components.Dataframe(
2862
+ COLA_FIVE_SHOT,
2863
+ datatype=["number", "markdown"] + ["number"] * len(COLA_FIVE_SHOT.columns),
2864
  type="pandas",
2865
  )
2866
 
2867
 
2868
+ # dataset
2869
+ with gr.TabItem("QQP"):
2870
  with gr.Row():
2871
  gr.Markdown("""
2872
+ **QQP Leaderboard** 🔮
2873
 
2874
  - **Metric:** Accuracy.
2875
  - **Languages:** English
 
2879
  with gr.TabItem("Overall"):
2880
  with gr.Row():
2881
  gr.components.Dataframe(
2882
+ QQP_ZERO_SHOT,
2883
+ datatype=["number", "markdown"] + ["number"] * len(QQP_ZERO_SHOT.columns),
2884
  type="pandas",
2885
  )
2886
 
 
2890
  with gr.TabItem("Overall"):
2891
  with gr.Row():
2892
  gr.components.Dataframe(
2893
+ QQP_FIVE_SHOT,
2894
+ datatype=["number", "markdown"] + ["number"] * len(QQP_FIVE_SHOT.columns),
2895
  type="pandas",
2896
  )
2897
 
2898
+
2899
+ # dataset
2900
+ with gr.TabItem("MNLI"):
2901
  with gr.Row():
2902
  gr.Markdown("""
2903
+ **MNLI Leaderboard** 🔮
2904
 
2905
  - **Metric:** Accuracy.
2906
+ - **Languages:** English
2907
  """)
2908
 
2909
  with gr.TabItem("zero_shot"):
2910
  with gr.TabItem("Overall"):
2911
  with gr.Row():
2912
  gr.components.Dataframe(
2913
+ MNLI_ZERO_SHOT,
2914
+ datatype=["number", "markdown"] + ["number"] * len(MNLI_ZERO_SHOT.columns),
2915
  type="pandas",
2916
  )
2917
 
 
2921
  with gr.TabItem("Overall"):
2922
  with gr.Row():
2923
  gr.components.Dataframe(
2924
+ MNLI_FIVE_SHOT,
2925
+ datatype=["number", "markdown"] + ["number"] * len(MNLI_FIVE_SHOT.columns),
2926
  type="pandas",
2927
  )
2928
 
2929
 
2930
+ # dataset
2931
+ with gr.TabItem("QNLI"):
2932
  with gr.Row():
2933
  gr.Markdown("""
2934
+ **QNLI Leaderboard** 🔮
2935
 
2936
  - **Metric:** Accuracy.
2937
+ - **Languages:** English
2938
  """)
2939
 
2940
  with gr.TabItem("zero_shot"):
2941
  with gr.TabItem("Overall"):
2942
  with gr.Row():
2943
  gr.components.Dataframe(
2944
+ QNLI_ZERO_SHOT,
2945
+ datatype=["number", "markdown"] + ["number"] * len(QNLI_ZERO_SHOT.columns),
2946
  type="pandas",
2947
  )
2948
 
 
2952
  with gr.TabItem("Overall"):
2953
  with gr.Row():
2954
  gr.components.Dataframe(
2955
+ QNLI_FIVE_SHOT,
2956
+ datatype=["number", "markdown"] + ["number"] * len(QNLI_FIVE_SHOT.columns),
2957
  type="pandas",
2958
  )
2959
 
2960
+
2961
+ # dataset
2962
+ with gr.TabItem("WNLI"):
2963
  with gr.Row():
2964
  gr.Markdown("""
2965
+ **WNLI Leaderboard** 🔮
2966
 
2967
  - **Metric:** Accuracy.
2968
+ - **Languages:** English
2969
  """)
2970
 
2971
  with gr.TabItem("zero_shot"):
2972
  with gr.TabItem("Overall"):
2973
  with gr.Row():
2974
  gr.components.Dataframe(
2975
+ WNLI_ZERO_SHOT,
2976
+ datatype=["number", "markdown"] + ["number"] * len(WNLI_ZERO_SHOT.columns),
2977
  type="pandas",
2978
  )
2979
 
 
2983
  with gr.TabItem("Overall"):
2984
  with gr.Row():
2985
  gr.components.Dataframe(
2986
+ WNLI_FIVE_SHOT,
2987
+ datatype=["number", "markdown"] + ["number"] * len(WNLI_FIVE_SHOT.columns),
2988
  type="pandas",
2989
  )
2990
 
2991
+
2992
+ # dataset
2993
+ with gr.TabItem("RTE"):
2994
  with gr.Row():
2995
  gr.Markdown("""
2996
+ **RTE Leaderboard** 🔮
2997
 
2998
  - **Metric:** Accuracy.
2999
+ - **Languages:** English
3000
  """)
3001
 
3002
  with gr.TabItem("zero_shot"):
3003
  with gr.TabItem("Overall"):
3004
  with gr.Row():
3005
  gr.components.Dataframe(
3006
+ RTE_ZERO_SHOT,
3007
+ datatype=["number", "markdown"] + ["number"] * len(RTE_ZERO_SHOT.columns),
3008
  type="pandas",
3009
  )
3010
 
 
3014
  with gr.TabItem("Overall"):
3015
  with gr.Row():
3016
  gr.components.Dataframe(
3017
+ RTE_FIVE_SHOT,
3018
+ datatype=["number", "markdown"] + ["number"] * len(RTE_FIVE_SHOT.columns),
3019
  type="pandas",
3020
  )
3021
 
3022
+
3023
+ # dataset
3024
+ with gr.TabItem("MRPC"):
3025
  with gr.Row():
3026
  gr.Markdown("""
3027
+ **MRPC Leaderboard** 🔮
3028
 
3029
  - **Metric:** Accuracy.
3030
+ - **Languages:** English
3031
  """)
3032
 
3033
  with gr.TabItem("zero_shot"):
3034
  with gr.TabItem("Overall"):
3035
  with gr.Row():
3036
  gr.components.Dataframe(
3037
+ MRPC_ZERO_SHOT,
3038
+ datatype=["number", "markdown"] + ["number"] * len(MRPC_ZERO_SHOT.columns),
3039
  type="pandas",
3040
  )
3041
 
 
3045
  with gr.TabItem("Overall"):
3046
  with gr.Row():
3047
  gr.components.Dataframe(
3048
+ MRPC_FIVE_SHOT,
3049
+ datatype=["number", "markdown"] + ["number"] * len(MRPC_FIVE_SHOT.columns),
3050
  type="pandas",
3051
  )
3052
 
 
3060
 
3061
 
3062
 
 
 
 
 
 
 
 
 
 
3063
 
3064
  gr.Markdown(r"""
3065