Shane committed • Commit 63c5ebf
1 Parent(s): ed3c3e8
edited some files

Files changed:
- app.py +2 -2
- src/md.py +3 -94
- src/md_old.py +105 -0
- src/utils.py +11 -6
app.py
CHANGED
@@ -36,7 +36,7 @@ href_data_nongreedy = prep_df(load_all_data(local_result_dir, subdir="temperatur
 
 col_types_href = ["number"] + ["markdown"] + ["number"] * int((len(href_data_greedy.columns) - 1) / 2)
 col_types_href_hidden = ["number"] + ["markdown"] + ["number"] * (len(href_data_greedy.columns) - 1)
-categories = ['Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify', "Reasoning Over Numerical Data", "Multi-Document Synthesis", "Fact Checking or Attributed QA"]
+categories = ['Average', 'Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify', "Reasoning Over Numerical Data", "Multi-Document Synthesis", "Fact Checking or Attributed QA"]
 # categories = ['Average', 'Brainstorm', 'Open QA', 'Closed QA', 'Extract', 'Generation', 'Rewrite', 'Summarize', 'Classify']
 
 # for showing random samples
@@ -77,7 +77,7 @@ def regex_table(dataframe, regex, selected_category, style=True):
 
     if style:
         # Format for different columns
-        format_dict = {col: "{:.1f}" for col in data.columns if col not in ['Average', 'Model', 'Rank']}
+        format_dict = {col: "{:.1f}" for col in data.columns if col not in ['Average', 'Model', 'Rank', '95% CI']}
         format_dict['Average'] = "{:.2f}"
         data = data.style.format(format_dict, na_rep='').set_properties(**{'text-align': 'right'})
         return data
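Review note: the second hunk works because `Styler.format` accepts a per-column format mapping, so the new textual `95% CI` column has to be excluded from the one-decimal rule (applying "{:.1f}" to a string raises a ValueError). A minimal sketch of the idea on invented data; the column values are illustrative, not from the leaderboard:

import pandas as pd

# Invented scores; layout mirrors the leaderboard frame.
data = pd.DataFrame({
    "Rank": [1, 2],
    "Model": ["model-a", "model-b"],
    "Average": [71.234, 65.987],
    "95% CI": ["(-1.2, +1.3)", "(-1.5, +1.4)"],
    "Brainstorm": [70.123, 64.551],
})

# One decimal for score columns; skip Model/Rank and the textual 95% CI.
format_dict = {col: "{:.1f}" for col in data.columns
               if col not in ["Average", "Model", "Rank", "95% CI"]}
format_dict["Average"] = "{:.2f}"  # two decimals for the headline score
styled = data.style.format(format_dict, na_rep="").set_properties(
    **{"text-align": "right"})
print(styled.to_html())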
src/md.py
CHANGED
@@ -2,104 +2,13 @@ from datetime import datetime
 import pytz
 
 ABOUT_TEXT = """
-We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
-A win is when the score for the chosen response is higher than the score for the rejected response.
-
-Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
-
-## Overview
-
-We average over 4 core sections (per prompt weighting):
-1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
-2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
-3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
-4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-
-For Reasoning, we increase the weight of the PRM-Math subset so code and math abilities are weighed equally in the final number, rather than increasing the relevance of code.
-We add a final column, **Prior Sets**, which includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback)).
-Prior Sets is weighted 0.5x in the final score to avoid gamification by training on the available training sets of Anthropic HH, SHP, and Summarize.
-
-Once all subset weighted averages are computed, the final RewardBench score is the average across the 5 subset scores.
-
-
-We include multiple types of reward models in this evaluation:
-1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
-2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
-3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
-4. **Random**: Random choice baseline.
-5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
-
-All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
-*Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
-* Click on a specific model in results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
-* All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
-
-
-### Subset Details
-
-Total number of prompts: 2985, filtered from 5123.
-
-| Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
-| :---------- | :-----: | :---------: |
-| alpacaeval-easy | 805, 100 | Great model vs poor model |
-| alpacaeval-length | 805, 95 | Good model vs low model, equal length |
-| alpacaeval-hard | 805, 95 | Great model vs baseline model |
-| mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
-| mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
-| mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
-| refusals-dangerous | 505, 100 | Dangerous response vs no response |
-| refusals-offensive | 704, 100 | Offensive response vs no response |
-| llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
-| llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
-| llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
-| llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
-| llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
-| xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
-| xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
-| do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
-| math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
-| hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
-| hep-go | 164 | Go code |
-| hep-java | 164 | Java code |
-| hep-js | 164 | Javascript code |
-| hep-python | 164 | Python code |
-| hep-rust | 164 | Rust code |
-
-Lengths (mean, std. dev.) include the prompt.
-
-| subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
-|-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
-| alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
-| alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
-| alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
-| donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
-| hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
-| hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
-| hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
-| hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
-| hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
-| hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
-| llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
-| llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
-| llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
-| llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
-| llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
-| mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
-| mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
-| mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
-| refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
-| refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
-| xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
-| xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
-
-For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
+TODO
 """
 
 # Get Pacific time zone (handles PST/PDT automatically)
 pacific_tz = pytz.timezone('America/Los_Angeles')
 current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
 
-TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
-### Evaluating the capabilities, safety, and pitfalls of reward models
-[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
+TOP_TEXT = f"""# HREF: Human Reference Guided Evaluation for Instruction Following
+[Code]() | [Eval. Dataset]() | [Prior Test Sets]() | [Results]() | [Paper]() | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
 """
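Review note on the retained TOP_TEXT code: because TOP_TEXT is an f-string, {current_time} is interpolated at import time, while the doubled braces in `Total models: {{}}` collapse to a literal `{}` that the app can fill later with `.format()`. A short sketch of the mechanics; `banner` is a trimmed stand-in for TOP_TEXT and `42` is just an example count:

from datetime import datetime

import pytz

# pytz picks PST or PDT automatically for America/Los_Angeles.
pacific_tz = pytz.timezone("America/Los_Angeles")
current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")

# {current_time} interpolates now; the doubled braces survive as "{}".
banner = f"""Total models: {{}} | Last restart (PST): {current_time}"""
print(banner.format(42))  # e.g. "Total models: 42 | Last restart (PST): ..."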
src/md_old.py
ADDED
@@ -0,0 +1,105 @@
+from datetime import datetime
+import pytz
+
+ABOUT_TEXT = """
+We compute the win percentage for a reward model on hand curated chosen-rejected pairs for each prompt.
+A win is when the score for the chosen response is higher than the score for the rejected response.
+
+Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
+
+## Overview
+
+We average over 4 core sections (per prompt weighting):
+1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
+2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
+3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
+4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+
+For Reasoning, we increase the weight of the PRM-Math subset so code and math abilities are weighed equally in the final number, rather than increasing the relevance of code.
+We add a final column, **Prior Sets**, which includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback)).
+Prior Sets is weighted 0.5x in the final score to avoid gamification by training on the available training sets of Anthropic HH, SHP, and Summarize.
+
+Once all subset weighted averages are computed, the final RewardBench score is the average across the 5 subset scores.
+
+
+We include multiple types of reward models in this evaluation:
+1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
+4. **Random**: Random choice baseline.
+5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
+
+All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
+*Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
+* Click on a specific model in results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
+* All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
+
+
+### Subset Details
+
+Total number of prompts: 2985, filtered from 5123.
+
+| Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
+| :---------- | :-----: | :---------: |
+| alpacaeval-easy | 805, 100 | Great model vs poor model |
+| alpacaeval-length | 805, 95 | Good model vs low model, equal length |
+| alpacaeval-hard | 805, 95 | Great model vs baseline model |
+| mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
+| mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
+| mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
+| refusals-dangerous | 505, 100 | Dangerous response vs no response |
+| refusals-offensive | 704, 100 | Offensive response vs no response |
+| llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+| llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+| llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
+| llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+| llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+| xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
+| math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
+| hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
+| hep-go | 164 | Go code |
+| hep-java | 164 | Java code |
+| hep-js | 164 | Javascript code |
+| hep-python | 164 | Python code |
+| hep-rust | 164 | Rust code |
+
+Lengths (mean, std. dev.) include the prompt.
+
+| subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
+|-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
+| alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
+| alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
+| alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
+| donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
+| hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
+| hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
+| hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
+| hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
+| hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
+| hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
+| llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
+| llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
+| llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
+| llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
+| llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
+| mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
+| mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
+| mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
+| refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
+| refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
+| xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
+| xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
+
+For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
+"""
+
+# Get Pacific time zone (handles PST/PDT automatically)
+pacific_tz = pytz.timezone('America/Los_Angeles')
+current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
+
+TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
+### Evaluating the capabilities, safety, and pitfalls of reward models
+[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
+"""
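Review note: the archived ABOUT_TEXT describes two computations worth pinning down. A "win" is chosen score > rejected score on a curated pair, and the final score averages the four core sections plus a 0.5x-weighted Prior Sets column. A hedged sketch under one plausible reading of that weighting; the function name and all numbers are illustrative, not the repo's API:

# A "win": the chosen response scores above the rejected one.
def win_percentage(chosen_scores, rejected_scores):
    wins = sum(c > r for c, r in zip(chosen_scores, rejected_scores))
    return 100 * wins / len(chosen_scores)

print(win_percentage([0.9, 0.4, 0.7], [0.2, 0.6, 0.1]))  # -> 66.66...: 2 of 3

# One plausible reading of the 0.5x Prior Sets weight: it counts half as
# much as a core section in the overall average. All scores invented.
sections = {"Chat": 95.1, "Chat Hard": 60.2, "Safety": 88.4, "Reasoning": 75.0}
prior_sets = 70.3
final_score = (sum(sections.values()) + 0.5 * prior_sets) / (len(sections) + 0.5)
print(round(final_score, 2))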
src/utils.py
CHANGED
@@ -72,6 +72,7 @@ def load_all_data(data_repo, subdir:str, subsubsets=False):  # use HF api to p
 
 
 def prep_df(df):
+
     # sort columns alphabetically
     df = df.reindex(sorted(df.columns), axis=1)
 
@@ -87,7 +88,7 @@ def prep_df(df):
     # select all columns except "model" and convert to score
     cols = df.columns.tolist()
    cols.remove("model")
-    cols = [c for c in cols if "rank" not in c]
+    cols = [c for c in cols if "rank" not in c and "confi" not in c]
     df[cols] = (df[cols]*100)
 
     # move average column to the second
@@ -129,6 +130,7 @@ def prep_df(df):
 def sort_by_category(df, category):
     new_df = df.copy()
     col_rank = category.lower().replace(" ", "_") + "_rank"
+    col_confi = category.lower().replace(" ", "_") + "_confi"
 
     # sort
     new_df = new_df.sort_values(by=[col_rank, category], ascending=[True, False])
@@ -144,12 +146,15 @@ def sort_by_category(df, category):
     cols.insert(2, cols.pop(cols.index(category)))
     new_df = new_df.loc[:, cols]
 
-    #
-
-
-
+    # move selected column to the fourth
+    cols = list(new_df.columns)
+    cols.insert(3, cols.pop(cols.index(col_confi)))
+    new_df = new_df.loc[:, cols]
+    new_df = new_df.rename(columns={col_confi: "95% CI"})
+
 
-    # drop all ranking
+    # drop all ranking and confidence interval
     new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("rank")])
+    new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("confi")])
 
     return new_df
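Review note: to sanity-check the new confidence-interval plumbing end to end, here is a minimal sketch of the steps added to sort_by_category, run on an invented frame (the *_rank / *_confi names follow the convention the helpers assume):

import pandas as pd

# Toy frame; real frames come from prep_df and carry one pair per category.
df = pd.DataFrame({
    "Rank": [2, 1],
    "model": ["model-b", "model-a"],
    "Average": [65.9, 71.2],
    "average_rank": [2, 1],
    "average_confi": ["(-1.5, +1.4)", "(-1.2, +1.3)"],
})

category = "Average"
col_rank = category.lower().replace(" ", "_") + "_rank"
col_confi = category.lower().replace(" ", "_") + "_confi"

new_df = df.sort_values(by=[col_rank, category], ascending=[True, False])

# Move the CI column to the fourth position and give it its display name.
cols = list(new_df.columns)
cols.insert(3, cols.pop(cols.index(col_confi)))
new_df = new_df.loc[:, cols].rename(columns={col_confi: "95% CI"})

# Drop helper columns; the renamed "95% CI" no longer ends with "confi".
new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("rank")])
new_df = new_df.drop(columns=[c for c in new_df.columns if c.endswith("confi")])
print(new_df)  # columns: Rank, model, Average, 95% CI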