binwang committed on
Commit
5da889a
·
1 Parent(s): b6e5026

new results with random

Browse files
Files changed (2) hide show
  1. all_results.json +0 -0
  2. app.py +39 -42
all_results.json CHANGED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,9 +1,11 @@
1
- from functools import partial
2
  import json
3
 
4
  import gradio as gr
5
  import pandas as pd
6
 
 
 
7
 
8
  print("Loading datasets...")
9
 
@@ -67,13 +69,13 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
67
 
68
  try:
69
  overall_acc = [results['overall_acc'] for results in results_list]
70
- overall_acc = sum(overall_acc) / len(overall_acc)
71
 
72
  consistency_score_3 = [results['consistency_score_3'] for results in results_list]
73
- consistency_score_3 = sum(consistency_score_3) / len(consistency_score_3)
74
 
75
  AC3_3 = [results['AC3_3'] for results in results_list]
76
- AC3_3 = sum(AC3_3) / len(AC3_3)
77
 
78
  except:
79
  print(results_list)
@@ -126,21 +128,21 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
126
 
127
 
128
  try:
129
- English = [results['language_acc']['English'] for results in results_list]
130
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
131
- Chinese = [results['language_acc']['Chinese'] for results in results_list]
132
  Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
133
- Filipino = [results['language_acc']['Filipino'] for results in results_list]
134
- Spanish = [results['language_acc']['Spanish'] for results in results_list]
135
- Malay = [results['language_acc']['Malay'] for results in results_list]
136
 
137
- English = sum(English) / len(English)
138
- Vietnamese = sum(Vietnamese) / len(Vietnamese)
139
- Chinese = sum(Chinese) / len(Chinese)
140
- Indonesian = sum(Indonesian) / len(Indonesian)
141
- Filipino = sum(Filipino) / len(Filipino)
142
- Spanish = sum(Spanish) / len(Spanish)
143
- Malay = sum(Malay) / len(Malay)
144
 
145
 
146
  except:
@@ -208,13 +210,13 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
208
 
209
  try:
210
  overall_acc = [results['overall_acc'] for results in results_list]
211
- overall_acc = sum(overall_acc) / len(overall_acc)
212
 
213
  consistency_score_3 = [results['consistency_score_3'] for results in results_list]
214
- consistency_score_3 = sum(consistency_score_3) / len(consistency_score_3)
215
 
216
  AC3_3 = [results['AC3_3'] for results in results_list]
217
- AC3_3 = sum(AC3_3) / len(AC3_3)
218
 
219
  except:
220
  print(results_list)
@@ -267,21 +269,21 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
267
 
268
 
269
  try:
270
- English = [results['language_acc']['English'] for results in results_list]
271
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
272
- Chinese = [results['language_acc']['Chinese'] for results in results_list]
273
  Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
274
- Filipino = [results['language_acc']['Filipino'] for results in results_list]
275
- Spanish = [results['language_acc']['Spanish'] for results in results_list]
276
- Malay = [results['language_acc']['Malay'] for results in results_list]
277
 
278
- English = sum(English) / len(English)
279
- Vietnamese = sum(Vietnamese) / len(Vietnamese)
280
- Chinese = sum(Chinese) / len(Chinese)
281
- Indonesian = sum(Indonesian) / len(Indonesian)
282
- Filipino = sum(Filipino) / len(Filipino)
283
- Spanish = sum(Spanish) / len(Spanish)
284
- Malay = sum(Malay) / len(Malay)
285
 
286
 
287
  except:
@@ -346,14 +348,12 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
346
 
347
 
348
  try:
349
- accuracy = [results['accuracy'] for results in results_list]
350
- accuracy = sum(accuracy) / len(accuracy)
351
 
352
  except:
353
  print(results_list)
354
  accuracy = -1
355
 
356
-
357
  res = {
358
  "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
359
  "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
@@ -401,8 +401,7 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
401
 
402
 
403
  try:
404
- accuracy = [results['accuracy'] for results in results_list]
405
- accuracy = sum(accuracy) / len(accuracy)
406
 
407
  except:
408
  print(results_list)
@@ -456,8 +455,7 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
456
 
457
 
458
  try:
459
- accuracy = [results['accuracy'] for results in results_list]
460
- accuracy = sum(accuracy) / len(accuracy)
461
 
462
  except:
463
  print(results_list)
@@ -511,8 +509,7 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
511
 
512
 
513
  try:
514
- accuracy = [results['accuracy'] for results in results_list]
515
- accuracy = sum(accuracy) / len(accuracy)
516
 
517
  except:
518
  print(results_list)
@@ -789,8 +786,8 @@ with block:
789
  with gr.TabItem("Overall"):
790
  with gr.Row():
791
  gr.components.Dataframe(
792
- PH_EVAL_ZERO_SHOT,
793
- datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
794
  type="pandas",
795
  )
796
 
 
1
+
2
  import json
3
 
4
  import gradio as gr
5
  import pandas as pd
6
 
7
+ from statistics import median
8
+
9
 
10
  print("Loading datasets...")
11
 
 
69
 
70
  try:
71
  overall_acc = [results['overall_acc'] for results in results_list]
72
+ overall_acc = median(overall_acc)
73
 
74
  consistency_score_3 = [results['consistency_score_3'] for results in results_list]
75
+ consistency_score_3 = median(consistency_score_3)
76
 
77
  AC3_3 = [results['AC3_3'] for results in results_list]
78
+ AC3_3 = median(AC3_3)
79
 
80
  except:
81
  print(results_list)
 
128
 
129
 
130
  try:
131
+ English = [results['language_acc']['English'] for results in results_list]
132
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
133
+ Chinese = [results['language_acc']['Chinese'] for results in results_list]
134
  Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
135
+ Filipino = [results['language_acc']['Filipino'] for results in results_list]
136
+ Spanish = [results['language_acc']['Spanish'] for results in results_list]
137
+ Malay = [results['language_acc']['Malay'] for results in results_list]
138
 
139
+ English = median(English)
140
+ Vietnamese = median(Vietnamese)
141
+ Chinese = median(Chinese)
142
+ Indonesian = median(Indonesian)
143
+ Filipino = median(Filipino)
144
+ Spanish = median(Spanish)
145
+ Malay = median(Malay)
146
 
147
 
148
  except:
 
210
 
211
  try:
212
  overall_acc = [results['overall_acc'] for results in results_list]
213
+ overall_acc = median(overall_acc)
214
 
215
  consistency_score_3 = [results['consistency_score_3'] for results in results_list]
216
+ consistency_score_3 = median(consistency_score_3)
217
 
218
  AC3_3 = [results['AC3_3'] for results in results_list]
219
+ AC3_3 = median(AC3_3)
220
 
221
  except:
222
  print(results_list)
 
269
 
270
 
271
  try:
272
+ English = [results['language_acc']['English'] for results in results_list]
273
  Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
274
+ Chinese = [results['language_acc']['Chinese'] for results in results_list]
275
  Indonesian = [results['language_acc']['Indonesian'] for results in results_list]
276
+ Filipino = [results['language_acc']['Filipino'] for results in results_list]
277
+ Spanish = [results['language_acc']['Spanish'] for results in results_list]
278
+ Malay = [results['language_acc']['Malay'] for results in results_list]
279
 
280
+ English = median(English)
281
+ Vietnamese = median(Vietnamese)
282
+ Chinese = median(Chinese)
283
+ Indonesian = median(Indonesian)
284
+ Filipino = median(Filipino)
285
+ Spanish = median(Spanish)
286
+ Malay = median(Malay)
287
 
288
 
289
  except:
 
348
 
349
 
350
  try:
351
+ accuracy = median([results['accuracy'] for results in results_list])
 
352
 
353
  except:
354
  print(results_list)
355
  accuracy = -1
356
 
 
357
  res = {
358
  "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
359
  "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
 
401
 
402
 
403
  try:
404
+ accuracy = median([results['accuracy'] for results in results_list])
 
405
 
406
  except:
407
  print(results_list)
 
455
 
456
 
457
  try:
458
+ accuracy = median([results['accuracy'] for results in results_list])
 
459
 
460
  except:
461
  print(results_list)
 
509
 
510
 
511
  try:
512
+ accuracy = median([results['accuracy'] for results in results_list])
 
513
 
514
  except:
515
  print(results_list)
 
786
  with gr.TabItem("Overall"):
787
  with gr.Row():
788
  gr.components.Dataframe(
789
+ PH_EVAL_FIVE_SHOT,
790
+ datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
791
  type="pandas",
792
  )
793