Update README.md
Browse files
README.md
CHANGED
@@ -79,8 +79,9 @@ Nous Benchmark:
|
|
79 |
|---------------------------------------------------|------:|------:|---------:|-------:|------:|
|
80 |
|[Master-Yi-9B](https://huggingface.co/qnguyen3/Master-Yi-9B)| 43.55| 71.48| 48.54| 41.43| 51.25|
|
81 |
|
82 |
-
|
83 |
### AGIEval
|
|
|
84 |
| Task |Version| Metric |Value| |Stderr|
|
85 |
|------------------------------|------:|--------|----:|---|-----:|
|
86 |
|agieval_aqua_rat | 0|acc |35.83|± | 3.01|
|
@@ -101,8 +102,10 @@ Nous Benchmark:
|
|
101 |
| | |acc_norm|47.73|± | 3.38|
|
102 |
|
103 |
Average: 43.55%
|
|
|
104 |
|
105 |
### GPT4All
|
|
|
106 |
| Task |Version| Metric |Value| |Stderr|
|
107 |
|-------------|------:|--------|----:|---|-----:|
|
108 |
|arc_challenge| 0|acc |54.95|± | 1.45|
|
@@ -119,16 +122,20 @@ Average: 43.55%
|
|
119 |
|winogrande | 0|acc |72.61|± | 1.25|
|
120 |
|
121 |
Average: 71.48%
|
|
|
122 |
|
123 |
### TruthfulQA
|
|
|
124 |
| Task |Version|Metric|Value| |Stderr|
|
125 |
|-------------|------:|------|----:|---|-----:|
|
126 |
|truthfulqa_mc| 1|mc1 |33.05|± | 1.65|
|
127 |
| | |mc2 |48.54|± | 1.54|
|
128 |
|
129 |
Average: 48.54%
|
|
|
130 |
|
131 |
### Bigbench
|
|
|
132 |
| Task |Version| Metric |Value| |Stderr|
|
133 |
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|
134 |
|bigbench_causal_judgement | 0|multiple_choice_grade|54.74|± | 3.62|
|
@@ -162,8 +169,8 @@ OpenLLM Benchmark:
|
|
162 |
|---------------------------------------------------|---:|--------:|----:|---------:|---------:|----:|------:|
|
163 |
|[Master-Yi-9B](https://huggingface.co/qnguyen3/Master-Yi-9B)|61.6| 79.89|69.95| 48.59| 77.35|67.48| 67.48|
|
164 |
|
165 |
-
```
|
166 |
### ARC
|
|
|
167 |
| Task |Version| Metric | Value | |Stderr|
|
168 |
|-------------|------:|--------------------|-------------|---|------|
|
169 |
|arc_challenge| 1|acc,none | 0.59| | |
|
@@ -173,8 +180,10 @@ OpenLLM Benchmark:
|
|
173 |
| | |alias |arc_challenge| | |
|
174 |
|
175 |
Average: 61.6%
|
|
|
176 |
|
177 |
### HellaSwag
|
|
|
178 |
| Task |Version| Metric | Value | |Stderr|
|
179 |
|---------|------:|--------------------|---------|---|------|
|
180 |
|hellaswag| 1|acc,none | 0.61| | |
|
@@ -184,8 +193,10 @@ Average: 61.6%
|
|
184 |
| | |alias |hellaswag| | |
|
185 |
|
186 |
Average: 79.89%
|
|
|
187 |
|
188 |
### MMLU
|
|
|
189 |
| Task |Version| Metric | Value | |Stderr|
|
190 |
|----------------------------------------|-------|---------------|---------------------------------------|---|------|
|
191 |
|mmlu |N/A |acc,none | 0.7| | |
|
@@ -376,8 +387,10 @@ Average: 79.89%
|
|
376 |
| | |acc_stderr,none|0.03 | | |
|
377 |
|
378 |
Average: 69.95%
|
|
|
379 |
|
380 |
### TruthfulQA
|
|
|
381 |
| Task |Version| Metric | Value | |Stderr|
|
382 |
|--------------|-------|-----------------------|-----------------|---|------|
|
383 |
|truthfulqa |N/A |bleu_acc,none | 0.45| | |
|
@@ -440,8 +453,10 @@ Average: 69.95%
|
|
440 |
| | |alias | - truthfulqa_mc2| | |
|
441 |
|
442 |
Average: 48.59%
|
|
|
443 |
|
444 |
### Winogrande
|
|
|
445 |
| Task |Version| Metric | Value | |Stderr|
|
446 |
|----------|------:|---------------|----------|---|------|
|
447 |
|winogrande| 1|acc,none | 0.77| | |
|
@@ -449,8 +464,10 @@ Average: 48.59%
|
|
449 |
| | |alias |winogrande| | |
|
450 |
|
451 |
Average: 77.35%
|
|
|
452 |
|
453 |
### GSM8K
|
|
|
454 |
|Task |Version| Metric |Value| |Stderr|
|
455 |
|-----|------:|-----------------------------------|-----|---|------|
|
456 |
|gsm8k| 3|exact_match,strict-match | 0.67| | |
|
|
|
79 |
|---------------------------------------------------|------:|------:|---------:|-------:|------:|
|
80 |
|[Master-Yi-9B](https://huggingface.co/qnguyen3/Master-Yi-9B)| 43.55| 71.48| 48.54| 41.43| 51.25|
|
81 |
|
82 |
+
|
83 |
### AGIEval
|
84 |
+
```
|
85 |
| Task |Version| Metric |Value| |Stderr|
|
86 |
|------------------------------|------:|--------|----:|---|-----:|
|
87 |
|agieval_aqua_rat | 0|acc |35.83|± | 3.01|
|
|
|
102 |
| | |acc_norm|47.73|± | 3.38|
|
103 |
|
104 |
Average: 43.55%
|
105 |
+
```
|
106 |
|
107 |
### GPT4All
|
108 |
+
```
|
109 |
| Task |Version| Metric |Value| |Stderr|
|
110 |
|-------------|------:|--------|----:|---|-----:|
|
111 |
|arc_challenge| 0|acc |54.95|± | 1.45|
|
|
|
122 |
|winogrande | 0|acc |72.61|± | 1.25|
|
123 |
|
124 |
Average: 71.48%
|
125 |
+
```
|
126 |
|
127 |
### TruthfulQA
|
128 |
+
```
|
129 |
| Task |Version|Metric|Value| |Stderr|
|
130 |
|-------------|------:|------|----:|---|-----:|
|
131 |
|truthfulqa_mc| 1|mc1 |33.05|± | 1.65|
|
132 |
| | |mc2 |48.54|± | 1.54|
|
133 |
|
134 |
Average: 48.54%
|
135 |
+
```
|
136 |
|
137 |
### Bigbench
|
138 |
+
```
|
139 |
| Task |Version| Metric |Value| |Stderr|
|
140 |
|------------------------------------------------|------:|---------------------|----:|---|-----:|
|
141 |
|bigbench_causal_judgement | 0|multiple_choice_grade|54.74|± | 3.62|
|
|
|
169 |
|---------------------------------------------------|---:|--------:|----:|---------:|---------:|----:|------:|
|
170 |
|[Master-Yi-9B](https://huggingface.co/qnguyen3/Master-Yi-9B)|61.6| 79.89|69.95| 48.59| 77.35|67.48| 67.48|
|
171 |
|
|
|
172 |
### ARC
|
173 |
+
```
|
174 |
| Task |Version| Metric | Value | |Stderr|
|
175 |
|-------------|------:|--------------------|-------------|---|------|
|
176 |
|arc_challenge| 1|acc,none | 0.59| | |
|
|
|
180 |
| | |alias |arc_challenge| | |
|
181 |
|
182 |
Average: 61.6%
|
183 |
+
```
|
184 |
|
185 |
### HellaSwag
|
186 |
+
```
|
187 |
| Task |Version| Metric | Value | |Stderr|
|
188 |
|---------|------:|--------------------|---------|---|------|
|
189 |
|hellaswag| 1|acc,none | 0.61| | |
|
|
|
193 |
| | |alias |hellaswag| | |
|
194 |
|
195 |
Average: 79.89%
|
196 |
+
```
|
197 |
|
198 |
### MMLU
|
199 |
+
```
|
200 |
| Task |Version| Metric | Value | |Stderr|
|
201 |
|----------------------------------------|-------|---------------|---------------------------------------|---|------|
|
202 |
|mmlu |N/A |acc,none | 0.7| | |
|
|
|
387 |
| | |acc_stderr,none|0.03 | | |
|
388 |
|
389 |
Average: 69.95%
|
390 |
+
```
|
391 |
|
392 |
### TruthfulQA
|
393 |
+
```
|
394 |
| Task |Version| Metric | Value | |Stderr|
|
395 |
|--------------|-------|-----------------------|-----------------|---|------|
|
396 |
|truthfulqa |N/A |bleu_acc,none | 0.45| | |
|
|
|
453 |
| | |alias | - truthfulqa_mc2| | |
|
454 |
|
455 |
Average: 48.59%
|
456 |
+
```
|
457 |
|
458 |
### Winogrande
|
459 |
+
```
|
460 |
| Task |Version| Metric | Value | |Stderr|
|
461 |
|----------|------:|---------------|----------|---|------|
|
462 |
|winogrande| 1|acc,none | 0.77| | |
|
|
|
464 |
| | |alias |winogrande| | |
|
465 |
|
466 |
Average: 77.35%
|
467 |
+
```
|
468 |
|
469 |
### GSM8K
|
470 |
+
```
|
471 |
|Task |Version| Metric |Value| |Stderr|
|
472 |
|-----|------:|-----------------------------------|-----|---|------|
|
473 |
|gsm8k| 3|exact_match,strict-match | 0.67| | |
|