djstrong committed on
Commit
acebd17
·
1 Parent(s): e333ea5
Files changed (3) hide show
  1. README.md +2 -2
  2. src/about.py +7 -46
  3. src/leaderboard/read_evals.py +1 -2
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Open PL LLM Leaderboard
3
- emoji: 🏆🇵🇱
4
  colorFrom: gray
5
  colorTo: red
6
  sdk: gradio
 
1
  ---
2
+ title: Polish Medical Leaderboard
3
+ emoji: 🇵🇱🩺🏆
4
  colorFrom: gray
5
  colorTo: red
6
  sdk: gradio
src/about.py CHANGED
@@ -129,12 +129,7 @@ TITLE = """
129
 
130
  # What does your leaderboard evaluate?
131
  INTRODUCTION_TEXT = f"""
132
- The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
133
- For now, models are tested without their templates.
134
-
135
- Almost every task has two versions: regex and multiple choice.
136
- * _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
137
- * _mc suffix means that a model is scored against every possible class (suitable also for base models)
138
 
139
  Average columns are normalized against scores by "Baseline (majority class)".
140
 
@@ -164,43 +159,13 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
164
 
165
  Tasks taken into account while calculating averages:
166
  * Average: {', '.join(all_tasks)}
167
- * Avg g: {', '.join(g_tasks)}
168
- * Avg mc: {', '.join(mc_tasks)}
169
- * Avg RAG: {', '.join(rag_tasks)}
170
-
171
- | Task | Dataset | Metric | Type |
172
- |---------------------------------|---------------------------------------|-----------|-----------------|
173
- | polemo2_in | allegro/klej-polemo2-in | accuracy | generate_until |
174
- | polemo2_in_mc | allegro/klej-polemo2-in | accuracy | multiple_choice |
175
- | polemo2_out | allegro/klej-polemo2-out | accuracy | generate_until |
176
- | polemo2_out_mc | allegro/klej-polemo2-out | accuracy | multiple_choice |
177
- | 8tags_mc | sdadas/8tags | accuracy | multiple_choice |
178
- | 8tags_g | sdadas/8tags | accuracy | generate_until |
179
- | belebele_mc | facebook/belebele | accuracy | multiple_choice |
180
- | belebele_g | facebook/belebele | accuracy | generate_until |
181
- | dyk_mc | allegro/klej-dyk | binary F1 | multiple_choice |
182
- | dyk_g | allegro/klej-dyk | binary F1 | generate_until |
183
- | ppc_mc | sdadas/ppc | accuracy | multiple_choice |
184
- | ppc_g | sdadas/ppc | accuracy | generate_until |
185
- | psc_mc | allegro/klej-psc | binary F1 | multiple_choice |
186
- | psc_g | allegro/klej-psc | binary F1 | generate_until |
187
- | cbd_mc | ptaszynski/PolishCyberbullyingDataset | macro F1 | multiple_choice |
188
- | cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
189
- | klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
190
- | klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
191
- | polqa_reranking_mc | ipipan/polqa | accuracy | multiple_choice |
192
- | polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
193
- | polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
194
- | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
195
- | polish_poquad_open_book | enelpol/poleval2018_task3_test_10k | levenshtein | generate_until |
196
- | polish_eq_bench_first_turn | speakleash/EQ-Bench-PL | eq_bench | generate_until |
197
- | polish_eq_bench | speakleash/EQ-Bench-PL | eq_bench | generate_until |
198
 
199
  ## Reproducibility
200
  To reproduce our results, you need to clone the repository:
201
 
202
  ```
203
- git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish3
204
  cd lm-evaluation-harness
205
  pip install -e .
206
  ```
@@ -208,18 +173,14 @@ pip install -e .
208
  and run the benchmark for 0-shot and 5-shot:
209
 
210
  ```
211
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate --num_fewshot 0 --output_path results/ --log_samples
212
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 0 --output_path results/ --log_samples
213
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --output_path results/ --log_samples
214
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --output_path results/ --log_samples
215
  ```
216
 
217
  With chat templates:
218
  ```
219
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate --num_fewshot 0 --output_path results/ --log_samples --apply_chat_template
220
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 0 --output_path results/ --log_samples --apply_chat_template
221
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --output_path results/ --log_samples --apply_chat_template
222
- lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --output_path results/ --log_samples --apply_chat_template
223
  ```
224
 
225
  ## List of Polish models
 
129
 
130
  # What does your leaderboard evaluate?
131
  INTRODUCTION_TEXT = f"""
132
+ The leaderboard evaluates language models on the Polish Board Certification Examinations (Państwowy Egzamin Specjalizacyjny) from the years 2018-2022.
 
 
 
 
 
133
 
134
  Average columns are normalized against scores by "Baseline (majority class)".
135
 
 
159
 
160
  Tasks taken into account while calculating averages:
161
  * Average: {', '.join(all_tasks)}
162
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  ## Reproducibility
165
  To reproduce our results, you need to clone the repository:
166
 
167
  ```
168
+ git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish4
169
  cd lm-evaluation-harness
170
  pip install -e .
171
  ```
 
173
  and run the benchmark for 0-shot and 5-shot:
174
 
175
  ```
176
+ lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 0 --output_path results/ --log_samples
177
+ lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 5 --output_path results/ --log_samples
 
 
178
  ```
179
 
180
  With chat templates:
181
  ```
182
+ lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 0 --output_path results/ --log_samples --apply_chat_template
183
+ lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_pes --num_fewshot 5 --output_path results/ --log_samples --apply_chat_template
 
 
184
  ```
185
 
186
  ## List of Polish models
src/leaderboard/read_evals.py CHANGED
@@ -387,6 +387,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
387
  model_result_filepaths = []
388
 
389
  for root, _, files in os.walk(results_path):
 
390
  # We should only have json files in model results
391
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
392
  continue
@@ -398,8 +399,6 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
398
  files = [files[-1]]
399
 
400
  for file in files:
401
- print(file)
402
- # if '_polish_pes_' not in file: continue
403
  model_result_filepaths.append(os.path.join(root, file))
404
 
405
  # print('PATHS:', model_result_filepaths)
 
387
  model_result_filepaths = []
388
 
389
  for root, _, files in os.walk(results_path):
390
+ if '_polish_pes_' not in root: continue
391
  # We should only have json files in model results
392
  if len(files) == 0 or any([not f.endswith(".json") for f in files]):
393
  continue
 
399
  files = [files[-1]]
400
 
401
  for file in files:
 
 
402
  model_result_filepaths.append(os.path.join(root, file))
403
 
404
  # print('PATHS:', model_result_filepaths)