Commit 4ccfada ("fix display")
Clémentine committed
Parent(s): e4ab31c

Changed files:
- README.md +1 -1
- app.py +1 -1
- src/display/about.py +10 -9
- src/leaderboard/read_evals.py +3 -0
README.md
CHANGED
@@ -17,7 +17,7 @@ space_ci:
   - H4_TOKEN
 tags:
 - leaderboard
-short_description:
+short_description: Track, rank and evaluate open LLMs and chatbots
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -340,7 +340,7 @@ with demo:
         with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit
+        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
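For context on what the fixed line does: gr.TabItem declares one tab inside a Blocks layout, and its label string is what appears on the tab button, so a truncated call here broke the page. Below is a minimal, self-contained sketch of that tab structure; the gr.Tabs wrapper and the placeholder constants are assumptions for illustration, and only the TabItem/Markdown lines mirror the diff.

```python
import gradio as gr

# Placeholder text standing in for the leaderboard's real markdown constants.
FAQ_TEXT = "Frequently asked questions would go here."
EVALUATION_QUEUE_TEXT = "Submission instructions would go here."

with gr.Blocks() as demo:
    # Assumed wrapper: the real app nests its TabItems inside a tab container.
    with gr.Tabs(elem_id="main-tabs"):
        with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")

        # The line repaired by this commit: a complete label plus elem_id/id.
        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```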
src/display/about.py
CHANGED
@@ -1,16 +1,8 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1
+TITLE = """<h1 style="text-align:left;float:left; id="space-title">🤗 Open LLM Leaderboard</h1> Track, rank and evaluate open LLMs and chatbots"""
 
 INTRODUCTION_TEXT = """
-📐 The 🤗 Open LLM Leaderboard aims to track, rank and evaluate open LLMs and chatbots.
-
-🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
-The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
-
-Other cool leaderboards:
-- [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
-- [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 """
 
 icons = f"""
@@ -24,6 +16,9 @@ LLM_BENCHMARKS_TEXT = f"""
 ## ABOUT
 With the plethora of large language models (LLMs) and chatbots being released week upon week, often with grandiose claims of their performance, it can be hard to filter out the genuine progress that is being made by the open-source community and which model is the current state of the art.
 
+🤗 Submit a model for automated evaluation on the 🤗 GPU cluster on the "Submit" page!
+The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details below!
+
 ### Tasks
 📈 We evaluate models on 6 key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
@@ -88,6 +83,12 @@ To get more information about quantization, see:
 ### Useful links
 - [Community resources](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/174)
 - [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
+
+### Other cool leaderboards:
+- [LLM safety](https://huggingface.co/spaces/AI-Secure/llm-trustworthy-leaderboard)
+- [LLM performance](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
+
+
 """
 
 FAQ_TEXT = """
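The display fix here works because the app renders these module-level constants directly. A minimal sketch of that pattern, assuming (it is not shown in this diff) that TITLE is passed to gr.HTML and the markdown constants to gr.Markdown; the HTML attribute quoting is tidied for the sketch:

```python
import gradio as gr

# Constants as defined after this commit.
TITLE = """<h1 style="text-align:left;float:left;" id="space-title">🤗 Open LLM Leaderboard</h1> Track, rank and evaluate open LLMs and chatbots"""
INTRODUCTION_TEXT = """
"""  # emptied by this commit; its content now lives in LLM_BENCHMARKS_TEXT

with gr.Blocks() as demo:
    gr.HTML(TITLE)  # heading plus tagline rendered once at the top of the Space
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

demo.launch()
```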
src/leaderboard/read_evals.py
CHANGED
@@ -204,6 +204,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
             eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
+        # Hardcoding because of gating problem
+        if "meta-llama" in eval_result.full_model:
+            eval_result.still_on_hub = True
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
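The added lines force still_on_hub to True for any meta-llama model. The likely reason, hinted at by the "gating problem" comment, is that an availability check against the Hub cannot see gated repositories without an accepted license or access token, so they look deleted. A hedged sketch of such a check using the public huggingface_hub API; the helper name and how the leaderboard calls it are assumptions, only still_on_hub and full_model come from the diff:

```python
from typing import Optional

from huggingface_hub import HfApi
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

api = HfApi()

def is_model_on_hub(repo_id: str, token: Optional[str] = None) -> bool:
    """Hypothetical availability check used to populate still_on_hub."""
    try:
        api.model_info(repo_id, token=token)
        return True
    except GatedRepoError:
        # The repo exists but is gated (e.g. meta-llama/*); without access it is
        # misreported as gone, which is the problem the hardcoded override papers over.
        return False
    except RepositoryNotFoundError:
        return False
```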