alexmarques committed
Commit b8dfdce
Parent(s): 72eb322
Update README.md

README.md CHANGED
@@ -144,30 +144,32 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
 
 <table>
 <tr>
+<td><strong>Category</strong>
+</td>
 <td><strong>Benchmark</strong>
 </td>
 <td><strong>Meta-Llama-3.1-8B-Instruct </strong>
 </td>
-<td><strong>Meta-Llama-3.1-8B-Instruct-quantized.
+<td><strong>Meta-Llama-3.1-8B-Instruct-quantized.w8a8 (this model)</strong>
 </td>
 <td><strong>Recovery</strong>
 </td>
 </tr>
 <tr>
-<td><strong>
+<td rowspan="1" ><strong>LLM as a judge</strong>
+</td>
+<td>Arena Hard
 </td>
 <td>25.8 (25.1 / 26.5)
 </td>
-<td>
+<td>27.2 (27.6 / 26.7)
 </td>
-<td>
+<td>105.4%
 </td>
 </tr>
 <tr>
-<td><strong>OpenLLM v1</strong>
+<td rowspan="8" ><strong>OpenLLM v1</strong>
 </td>
-</tr>
-<tr>
 <td>MMLU (5-shot)
 </td>
 <td>68.3
@@ -248,10 +250,8 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
 </td>
 </tr>
 <tr>
-<td><strong>OpenLLM v2</strong>
+<td rowspan="7" ><strong>OpenLLM v2</strong>
 </td>
-</tr>
-<tr>
 <td>MMLU-Pro (5-shot)
 </td>
 <td>30.8
@@ -282,7 +282,7 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
 </td>
 </tr>
 <tr>
-<td>Math
+<td>Math-lvl-5 (4-shot)
 </td>
 <td>15.7
 </td>
@@ -322,10 +322,8 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
 </td>
 </tr>
 <tr>
-<td><strong>Coding</strong>
+<td rowspan="2" ><strong>Coding</strong>
 </td>
-</tr>
-<tr>
 <td>HumanEval pass@1
 </td>
 <td>67.3
@@ -345,8 +343,81 @@ Detailed model outputs are available as HuggingFace datasets for [Arena-Hard](ht
 <td>97.4%
 </td>
 </tr>
+<tr>
+<td rowspan="9" ><strong>Multilingual</strong>
+</td>
+<td>Portuguese MMLU (5-shot)
+</td>
+<td>59.96
+</td>
+<td>58.69
+</td>
+<td>97.9%
+</td>
+</tr>
+<tr>
+<td>Spanish MMLU (5-shot)
+</td>
+<td>60.25
+</td>
+<td>58.39
+</td>
+<td>96.9%
+</td>
+</tr>
+<tr>
+<td>Italian MMLU (5-shot)
+</td>
+<td>59.23
+</td>
+<td>57.82
+</td>
+<td>97.6%
+</td>
+</tr>
+<tr>
+<td>German MMLU (5-shot)
+</td>
+<td>58.63
+</td>
+<td>56.22
+</td>
+<td>95.9%
+</td>
+</tr>
+<tr>
+<td>French MMLU (5-shot)
+</td>
+<td>59.65
+</td>
+<td>57.58
+</td>
+<td>96.5%
+</td>
+</tr>
+<tr>
+<td>Hindi MMLU (5-shot)
+</td>
+<td>50.10
+</td>
+<td>47.14
+</td>
+<td>94.1%
+</td>
+</tr>
+<tr>
+<td>Thai MMLU (5-shot)
+</td>
+<td>49.12
+</td>
+<td>46.72
+</td>
+<td>95.1%
+</td>
+</tr>
 </table>
 
+
 ### Reproduction
 
 The results were obtained using the following commands:
@@ -438,6 +509,90 @@ lm_eval \
 --batch_size auto
 ```
 
+#### MMLU Portuguese
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_pt_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Spanish
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_es_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Italian
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_it_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU German
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_de_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU French
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_fr_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Hindi
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_hi_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
+#### MMLU Thai
+```
+lm_eval \
+--model vllm \
+--model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
+--tasks mmlu_th_llama_3.1_instruct \
+--fewshot_as_multiturn \
+--apply_chat_template \
+--num_fewshot 5 \
+--batch_size auto
+```
+
 #### HumanEval and HumanEval+
 ##### Generation
 ```
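
The seven per-language MMLU commands added in this commit differ only in the language code inside the task name, so they can equivalently be run as a single loop. A minimal sketch, assuming a bash shell and the same environment as the commands above:

```
# Runs the multilingual MMLU evaluations added in this commit; only the
# language code in --tasks changes between the seven invocations.
for lang in pt es it de fr hi th; do
  lm_eval \
    --model vllm \
    --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
    --tasks "mmlu_${lang}_llama_3.1_instruct" \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 5 \
    --batch_size auto
done
```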
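
The Recovery figures in the updated table are consistent with the quantized model's score expressed as a percentage of the unquantized baseline's score (e.g., Arena Hard: 27.2 / 25.8 ≈ 105.4%). A quick check of that assumed definition:

```
# Assumed definition: recovery = quantized score / baseline score, as a percent.
awk 'BEGIN { printf "Arena Hard:      %.1f%%\n", 27.2 / 25.8 * 100 }'    # 105.4%
awk 'BEGIN { printf "Portuguese MMLU: %.1f%%\n", 58.69 / 59.96 * 100 }'  # 97.9%
```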