Update README.md
Browse files
README.md
CHANGED
@@ -38,7 +38,44 @@ Merged using this [Colab notebook](https://colab.research.google.com/drive/1a76Y
|
|
38 |
|
39 |
#### FIN-bench scores:
|
40 |
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
### Framework versions
|
44 |
|
|
|
38 |
|
39 |
#### FIN-bench scores:
|
40 |
|
41 |
+
| Task |Version| Metric |Value | |Stderr|
|
42 |
+
|------------------------------------------------|------:|---------------------|-----:|---|-----:|
|
43 |
+
|bigbench_analogies | 0|multiple_choice_grade|0.6308|± |0.0425|
|
44 |
+
|bigbench_arithmetic_1_digit_addition | 0|multiple_choice_grade|0.6400|± |0.0482|
|
45 |
+
|bigbench_arithmetic_1_digit_division | 0|multiple_choice_grade|0.7391|± |0.0936|
|
46 |
+
|bigbench_arithmetic_1_digit_multiplication | 0|multiple_choice_grade|0.2800|± |0.0451|
|
47 |
+
|bigbench_arithmetic_1_digit_subtraction | 0|multiple_choice_grade|0.5000|± |0.0503|
|
48 |
+
|bigbench_arithmetic_2_digit_addition | 0|multiple_choice_grade|0.1800|± |0.0386|
|
49 |
+
|bigbench_arithmetic_2_digit_division | 0|multiple_choice_grade|0.4800|± |0.0502|
|
50 |
+
|bigbench_arithmetic_2_digit_multiplication | 0|multiple_choice_grade|0.0800|± |0.0273|
|
51 |
+
|bigbench_arithmetic_2_digit_subtraction | 0|multiple_choice_grade|0.2500|± |0.0435|
|
52 |
+
|bigbench_arithmetic_3_digit_addition | 0|multiple_choice_grade|0.1800|± |0.0386|
|
53 |
+
|bigbench_arithmetic_3_digit_division | 0|multiple_choice_grade|0.2500|± |0.0435|
|
54 |
+
|bigbench_arithmetic_3_digit_multiplication | 0|multiple_choice_grade|0.1700|± |0.0378|
|
55 |
+
|bigbench_arithmetic_3_digit_subtraction | 0|multiple_choice_grade|0.5000|± |0.0503|
|
56 |
+
|bigbench_arithmetic_4_digit_addition | 0|multiple_choice_grade|0.2600|± |0.0441|
|
57 |
+
|bigbench_arithmetic_4_digit_division | 0|multiple_choice_grade|0.2500|± |0.0435|
|
58 |
+
|bigbench_arithmetic_4_digit_multiplication | 0|multiple_choice_grade|0.2100|± |0.0409|
|
59 |
+
|bigbench_arithmetic_4_digit_subtraction | 0|multiple_choice_grade|0.5200|± |0.0502|
|
60 |
+
|bigbench_arithmetic_5_digit_addition | 0|multiple_choice_grade|0.3900|± |0.0490|
|
61 |
+
|bigbench_arithmetic_5_digit_division | 0|multiple_choice_grade|0.1600|± |0.0368|
|
62 |
+
|bigbench_arithmetic_5_digit_multiplication | 0|multiple_choice_grade|0.1000|± |0.0302|
|
63 |
+
|bigbench_arithmetic_5_digit_subtraction | 0|multiple_choice_grade|0.6100|± |0.0490|
|
64 |
+
|bigbench_cause_and_effect_one_sentence | 0|multiple_choice_grade|0.6471|± |0.0676|
|
65 |
+
|bigbench_cause_and_effect_one_sentence_no_prompt| 0|multiple_choice_grade|0.6863|± |0.0656|
|
66 |
+
|bigbench_cause_and_effect_two_sentences | 0|multiple_choice_grade|0.3922|± |0.0690|
|
67 |
+
|bigbench_emotions | 0|multiple_choice_grade|0.2812|± |0.0357|
|
68 |
+
|bigbench_empirical_judgments | 0|multiple_choice_grade|0.2828|± |0.0455|
|
69 |
+
|bigbench_general_knowledge | 0|multiple_choice_grade|0.4000|± |0.0590|
|
70 |
+
|bigbench_hhh_alignment_harmless | 0|multiple_choice_grade|0.3621|± |0.0637|
|
71 |
+
|bigbench_hhh_alignment_helpful | 0|multiple_choice_grade|0.3559|± |0.0629|
|
72 |
+
|bigbench_hhh_alignment_honest | 0|multiple_choice_grade|0.3729|± |0.0635|
|
73 |
+
|bigbench_hhh_alignment_other | 0|multiple_choice_grade|0.5581|± |0.0766|
|
74 |
+
|bigbench_intent_recognition | 0|multiple_choice_grade|0.1879|± |0.0149|
|
75 |
+
|bigbench_misconceptions | 0|multiple_choice_grade|0.5373|± |0.0432|
|
76 |
+
|bigbench_paraphrase | 0|multiple_choice_grade|0.5150|± |0.0354|
|
77 |
+
|bigbench_sentence_ambiguity | 0|multiple_choice_grade|0.5000|± |0.0651|
|
78 |
+
|bigbench_similarities_abstraction | 0|multiple_choice_grade|0.7368|± |0.0508|
|
79 |
|
80 |
### Framework versions
|
81 |
|