Adding Evaluation Results (#3)

Browse files

- Adding Evaluation Results (12841743f988b039ee65cab970f5c04835f88bbb)

Co-authored-by: Open LLM Leaderboard PR Bot <leaderboard-pr-bot@users.noreply.huggingface.co>

Files changed (1) hide show

README.md +162 -53

README.md CHANGED Viewed

@@ -12,59 +12,62 @@ datasets:
 - databricks/databricks-dolly-15k
 - THUDM/webglm-qa
 widget:
-  - messages:
-      - role: system
-        content: You are a helpful assistant, who answers with empathy.
-      - role: user
-        content: Got a question for you!
-      - role: assistant
-        content: "Sure! What's it?"
-      - role: user
-        content: Why do you love cats so much!? 🐈
-  - messages:
-      - role: system
-        content: "You are a helpful assistant who answers user's questions with empathy."
-      - role: user
-        content: Who is Mona Lisa?
-  - messages:
-      - role: system
-        content: You are a helpful assistant who provides concise responses.
-      - role: user
-        content: Heya!
-      - role: assistant
-        content: Hi! How may I help you today?
-      - role: user
-        content: I need to build a simple website. Where should I start learning about web development?
-  - messages:
-      - role: user
-        content: Invited some friends to come home today. Give me some ideas for games to play with them!
-  - messages:
-      - role: system
-        content: "You are a helpful assistant who answers user's questions with details and curiosity."
-      - role: user
-        content: What are some potential applications for quantum computing?
-  - messages:
-      - role: system
-        content: You are a helpful assistant who gives creative responses.
-      - role: user
-        content: Write the specs of a game about mages in a fantasy world.
-  - messages:
-      - role: system
-        content: "You are a helpful assistant who answers user's questions with details."
-      - role: user
-        content: Tell me about the pros and cons of social media.
-  - messages:
-      - role: system
-        content: "You are a helpful assistant who answers user's questions with confidence."
-      - role: user
-        content: What is a dog?
-      - role: assistant
-        content: 'A dog is a four-legged, domesticated animal that is a member of the class Mammalia,
-          which includes all mammals. Dogs are known for their loyalty, playfulness, and
-          ability to be trained for various tasks. They are also used for hunting, herding,
-          and as service animals.'
-      - role: user
-        content: What is the color of an apple?
 inference:
   parameters:
     max_new_tokens: 250
@@ -174,6 +177,98 @@ model-index:
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
       name: Open LLM Leaderboard
 ---
 # A Llama Chat Model of 160M Parameters
@@ -255,3 +350,17 @@ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-le
 |TruthfulQA (0-shot)              |44.16|
 |Winogrande (5-shot)              |51.30|
 |GSM8k (5-shot)                   | 0.00|

 - databricks/databricks-dolly-15k
 - THUDM/webglm-qa
 widget:
+- messages:
+  - role: system
+    content: You are a helpful assistant, who answers with empathy.
+  - role: user
+    content: Got a question for you!
+  - role: assistant
+    content: Sure! What's it?
+  - role: user
+    content: Why do you love cats so much!? 🐈
+- messages:
+  - role: system
+    content: You are a helpful assistant who answers user's questions with empathy.
+  - role: user
+    content: Who is Mona Lisa?
+- messages:
+  - role: system
+    content: You are a helpful assistant who provides concise responses.
+  - role: user
+    content: Heya!
+  - role: assistant
+    content: Hi! How may I help you today?
+  - role: user
+    content: I need to build a simple website. Where should I start learning about
+      web development?
+- messages:
+  - role: user
+    content: Invited some friends to come home today. Give me some ideas for games
+      to play with them!
+- messages:
+  - role: system
+    content: You are a helpful assistant who answers user's questions with details
+      and curiosity.
+  - role: user
+    content: What are some potential applications for quantum computing?
+- messages:
+  - role: system
+    content: You are a helpful assistant who gives creative responses.
+  - role: user
+    content: Write the specs of a game about mages in a fantasy world.
+- messages:
+  - role: system
+    content: You are a helpful assistant who answers user's questions with details.
+  - role: user
+    content: Tell me about the pros and cons of social media.
+- messages:
+  - role: system
+    content: You are a helpful assistant who answers user's questions with confidence.
+  - role: user
+    content: What is a dog?
+  - role: assistant
+    content: A dog is a four-legged, domesticated animal that is a member of the class
+      Mammalia, which includes all mammals. Dogs are known for their loyalty, playfulness,
+      and ability to be trained for various tasks. They are also used for hunting,
+      herding, and as service animals.
+  - role: user
+    content: What is the color of an apple?
 inference:
   parameters:
     max_new_tokens: 250
     source:
       url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
       name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: IFEval (0-Shot)
+      type: HuggingFaceH4/ifeval
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: inst_level_strict_acc and prompt_level_strict_acc
+      value: 15.75
+      name: strict accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: BBH (3-Shot)
+      type: BBH
+      args:
+        num_few_shot: 3
+    metrics:
+    - type: acc_norm
+      value: 3.17
+      name: normalized accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MATH Lvl 5 (4-Shot)
+      type: hendrycks/competition_math
+      args:
+        num_few_shot: 4
+    metrics:
+    - type: exact_match
+      value: 0.0
+      name: exact match
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: GPQA (0-shot)
+      type: Idavidrein/gpqa
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: acc_norm
+      value: 1.01
+      name: acc_norm
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MuSR (0-shot)
+      type: TAUR-Lab/MuSR
+      args:
+        num_few_shot: 0
+    metrics:
+    - type: acc_norm
+      value: 3.17
+      name: acc_norm
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
+      name: Open LLM Leaderboard
+  - task:
+      type: text-generation
+      name: Text Generation
+    dataset:
+      name: MMLU-PRO (5-shot)
+      type: TIGER-Lab/MMLU-Pro
+      config: main
+      split: test
+      args:
+        num_few_shot: 5
+    metrics:
+    - type: acc
+      value: 1.51
+      name: accuracy
+    source:
+      url: https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query=Felladrin/Llama-160M-Chat-v1
+      name: Open LLM Leaderboard
 ---
 # A Llama Chat Model of 160M Parameters
 |TruthfulQA (0-shot)              |44.16|
 |Winogrande (5-shot)              |51.30|
 |GSM8k (5-shot)                   | 0.00|
+
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_Felladrin__Llama-160M-Chat-v1)
+
+|      Metric       |Value|
+|-------------------|----:|
+|Avg.               | 4.10|
+|IFEval (0-Shot)    |15.75|
+|BBH (3-Shot)       | 3.17|
+|MATH Lvl 5 (4-Shot)| 0.00|
+|GPQA (0-shot)      | 1.01|
+|MuSR (0-shot)      | 3.17|
+|MMLU-PRO (5-shot)  | 1.51|