Shane committed on
Commit
63c5ebf
1 Parent(s): ed3c3e8

edited some files

Files changed (4)
  1. app.py +2 -2
  2. src/md.py +3 -94
  3. src/md_old.py +105 -0
  4. src/utils.py +11 -6
app.py CHANGED
@@ -36,7 +36,7 @@ href_data_nongreedy = prep_df(load_all_data(local_result_dir, subdir="temperatur
 
 col_types_href = ["number"] + ["markdown"] + ["number"] * int((len(href_data_greedy.columns) - 1) / 2)
 col_types_href_hidden = ["number"] + ["markdown"] + ["number"] * (len(href_data_greedy.columns) - 1)
- categories = ['Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify', "Reasoning Over Numerical Data", "Multi-Document Synthesis", "Fact Checking or Attributed QA"]
+ categories = ['Average', 'Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify', "Reasoning Over Numerical Data", "Multi-Document Synthesis", "Fact Checking or Attributed QA"]
 # categories = ['Average', 'Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify']
 
 # for showing random samples
@@ -77,7 +77,7 @@ def regex_table(dataframe, regex, selected_category, style=True):
 
 if style:
 # Format for different columns
- format_dict = {col: "{:.1f}" for col in data.columns if col not in ['Average', 'Model', 'Rank']}
+ format_dict = {col: "{:.1f}" for col in data.columns if col not in ['Average', 'Model', 'Rank', '95% CI']}
 format_dict['Average'] = "{:.2f}"
 data = data.style.format(format_dict, na_rep='').set_properties(**{'text-align': 'right'})
 return data
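For reference, a minimal, self-contained sketch of the styling pattern touched above (column names and values are illustrative, not the app's real data): per-category columns get a one-decimal format, `Average` gets two decimals, and the new `95% CI` column is excluded because it is presumably a pre-formatted string rather than a number.

```python
import pandas as pd

# Illustrative data only; the real frame comes from prep_df()/regex_table().
data = pd.DataFrame({
    "Rank": [1, 2],
    "Model": ["model-a", "model-b"],
    "95% CI": ["+1.2 / -1.1", "+0.9 / -1.0"],  # string column: a numeric format would fail here
    "Average": [71.234, 69.871],
    "Brainstorm": [73.5, 70.1],
})

# One decimal for per-category scores, two for Average; skip non-numeric columns.
format_dict = {col: "{:.1f}" for col in data.columns if col not in ["Average", "Model", "Rank", "95% CI"]}
format_dict["Average"] = "{:.2f}"
styled = data.style.format(format_dict, na_rep="").set_properties(**{"text-align": "right"})
```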
src/md.py CHANGED
@@ -2,104 +2,13 @@ from datetime import datetime
 import pytz
 
 ABOUT_TEXT = """
- We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
- A win is when the score for the chosen response is higher than the score for the rejected response.
-
- Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
-
- ## Overview
-
- We average over 4 core sections (per prompt weighting):
- 1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
- 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
- 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
- 4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-
- For Reasoning, we increase the weight of the PRM-Math subset so code and math abilities are weighed equally in the final number, rather than increasing the relevance of code.
- We add a final column, **Prior Sets** -- includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))
- Prior sets is weighted 0.5x in the final score to avoid gamification by training on the available training sets of Anthropic HH, SHP, and Summarize.
-
- Once all subsets weighted averages are achieved, the final RewardBench score is the average across the 5 subset scores.
-
-
- We include multiple types of reward models in this evaluation:
- 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
- 2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
- 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
- 4. **Random**: Random choice baseline.
- 5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
-
- All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
- *Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
- * Click on a specific model in results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
- * All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
-
-
- ### Subset Details
-
- Total number of the prompts is: 2985, filtered from 5123.
-
- | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
- | :---------- | :-----: | :---------: |
- | alpacaeval-easy | 805, 100 | Great model vs poor model |
- | alpacaeval-length | 805, 95 | Good model vs low model, equal length |
- | alpacaeval-hard | 805, 95 | Great model vs baseline model |
- | mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
- | mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
- | mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
- | refusals-dangerous | 505, 100 | Dangerous response vs no response |
- | refusals-offensive | 704, 100 | Offensive response vs no response |
- | llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
- | llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
- | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
- | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
- | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
- | xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
- | xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
- | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
- | math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
- | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
- | hep-go | 164 | Go code |
- | hep-java | 164 | Java code |
- | hep-js | 164 | Javascript code |
- | hep-python | 164 | Python code |
- | hep-rust | 164 | Rust code |
-
- Lengths (mean, std. dev.) include the prompt
-
- | subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
- |-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
- | alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
- | alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
- | alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
- | donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
- | hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
- | hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
- | hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
- | hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
- | hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
- | hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
- | llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
- | llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
- | llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
- | llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
- | llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
- | mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
- | mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
- | mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
- | refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
- | refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
- | xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
- | xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
-
- For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
+ TODO
 """
 
 # Get Pacific time zone (handles PST/PDT automatically)
 pacific_tz = pytz.timezone('America/Los_Angeles')
 current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
 
- TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
- ### Evaluating the capabilities, safety, and pitfalls of reward models
- [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
+ TOP_TEXT = f"""# HREF: Human Reference Guided Evaluation for Instruction Following
+ [Code]() | [Eval. Dataset]() | [Prior Test Sets]() | [Results]() | [Paper]() | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
 """
src/md_old.py ADDED
@@ -0,0 +1,105 @@
+ from datetime import datetime
+ import pytz
+
+ ABOUT_TEXT = """
+ We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
+ A win is when the score for the chosen response is higher than the score for the rejected response.
+
+ Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
+
+ ## Overview
+
+ We average over 4 core sections (per prompt weighting):
+ 1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
+ 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
+ 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
+ 4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+
+ For Reasoning, we increase the weight of the PRM-Math subset so code and math abilities are weighed equally in the final number, rather than increasing the relevance of code.
+ We add a final column, **Prior Sets** -- includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))
+ Prior sets is weighted 0.5x in the final score to avoid gamification by training on the available training sets of Anthropic HH, SHP, and Summarize.
+
+ Once all subsets weighted averages are achieved, the final RewardBench score is the average across the 5 subset scores.
+
+
+ We include multiple types of reward models in this evaluation:
+ 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+ 2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+ 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
+ 4. **Random**: Random choice baseline.
+ 5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
+
+ All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
+ *Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
+ * Click on a specific model in results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
+ * All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
+
+
+ ### Subset Details
+
+ Total number of the prompts is: 2985, filtered from 5123.
+
+ | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
+ | :---------- | :-----: | :---------: |
+ | alpacaeval-easy | 805, 100 | Great model vs poor model |
+ | alpacaeval-length | 805, 95 | Good model vs low model, equal length |
+ | alpacaeval-hard | 805, 95 | Great model vs baseline model |
+ | mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
+ | mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
+ | mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
+ | refusals-dangerous | 505, 100 | Dangerous response vs no response |
+ | refusals-offensive | 704, 100 | Offensive response vs no response |
+ | llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+ | llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+ | llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
+ | llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+ | llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+ | xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+ | xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+ | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
+ | math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
+ | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
+ | hep-go | 164 | Go code |
+ | hep-java | 164 | Java code |
+ | hep-js | 164 | Javascript code |
+ | hep-python | 164 | Python code |
+ | hep-rust | 164 | Rust code |
+
+ Lengths (mean, std. dev.) include the prompt
+
+ | subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
+ |-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
+ | alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
+ | alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
+ | alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
+ | donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
+ | hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
+ | hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
+ | hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
+ | hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
+ | hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
+ | hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
+ | llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
+ | llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
+ | llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
+ | llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
+ | llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
+ | mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
+ | mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
+ | mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
+ | refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
+ | refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
+ | xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
+ | xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
+
+ For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
+ """
+
+ # Get Pacific time zone (handles PST/PDT automatically)
+ pacific_tz = pytz.timezone('America/Los_Angeles')
+ current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
+
+ TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
+ ### Evaluating the capabilities, safety, and pitfalls of reward models
+ [Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
+ """
src/utils.py CHANGED
@@ -72,6 +72,7 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
 
 
 def prep_df(df):
+
 # sort columns alphabetically
 df = df.reindex(sorted(df.columns), axis=1)
 
@@ -87,7 +88,7 @@ def prep_df(df):
 # select all columns except "model" and convert to score
 cols = df.columns.tolist()
 cols.remove("model")
- cols = [c for c in cols if "rank" not in c]
+ cols = [c for c in cols if "rank" not in c and "confi" not in c]
 df[cols] = (df[cols]*100)
 
 # move average column to the second
@@ -129,6 +130,7 @@ def prep_df(df):
 def sort_by_category(df, category):
 new_df = df.copy()
 col_rank = category.lower().replace(" ", "_") + "_rank"
+ col_confi = category.lower().replace(" ", "_") + "_confi"
 
 # sort
 new_df = new_df.sort_values(by=[col_rank, category], ascending=[True, False])
@@ -144,12 +146,15 @@ def sort_by_category(df, category):
 cols.insert(2, cols.pop(cols.index(category)))
 new_df = new_df.loc[:, cols]
 
- # # move selected column to the fourth
- # cols = list(new_df.columns)
- # cols.insert(3, cols.pop(cols.index("Average")))
- # new_df = new_df.loc[:, cols]
+ # move selected column to the fourth
+ cols = list(new_df.columns)
+ cols.insert(3, cols.pop(cols.index(col_confi)))
+ new_df = new_df.loc[:, cols]
+ new_df = new_df.rename(columns={col_confi: "95% CI"})
+
 
- # drop all ranking
+ # drop all ranking and confidence interval
 new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("rank")])
+ new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("confi")])
 
 return new_df
160
  return new_df