from enum import Enum
from dataclasses import dataclass


@dataclass
class TaskInfo:
    """Metadata for one MMMLU subject: dataset subset key, leaderboard column label, and metric name."""
    benchmark: str
    col_name: str
    metric: str


class Tasks(Enum):
    Professional_Law = TaskInfo(benchmark='professional_law', col_name='Professional Law', metric='accuracy')
    Moral_Scenarios = TaskInfo(benchmark='moral_scenarios', col_name='Moral Scenarios', metric='accuracy')
    Miscellaneous = TaskInfo(benchmark='miscellaneous', col_name='Miscellaneous', metric='accuracy')
    High_School_Psychology = TaskInfo(benchmark='high_school_psychology', col_name='High School Psychology', metric='accuracy')
    High_School_Macroeconomics = TaskInfo(benchmark='high_school_macroeconomics', col_name='High School Macroeconomics', metric='accuracy')
    Elementary_Mathematics = TaskInfo(benchmark='elementary_mathematics', col_name='Elementary Mathematics', metric='accuracy')
    Prehistory = TaskInfo(benchmark='prehistory', col_name='Prehistory', metric='accuracy')
    Philosophy = TaskInfo(benchmark='philosophy', col_name='Philosophy', metric='accuracy')
    High_School_Biology = TaskInfo(benchmark='high_school_biology', col_name='High School Biology', metric='accuracy')
    Nutrition = TaskInfo(benchmark='nutrition', col_name='Nutrition', metric='accuracy')
    Professional_Accounting = TaskInfo(benchmark='professional_accounting', col_name='Professional Accounting', metric='accuracy')
    Professional_Medicine = TaskInfo(benchmark='professional_medicine', col_name='Professional Medicine', metric='accuracy')
    High_School_Mathematics = TaskInfo(benchmark='high_school_mathematics', col_name='High School Mathematics', metric='accuracy')
    Clinical_Knowledge = TaskInfo(benchmark='clinical_knowledge', col_name='Clinical Knowledge', metric='accuracy')
    Security_Studies = TaskInfo(benchmark='security_studies', col_name='Security Studies', metric='accuracy')
    High_School_Microeconomics = TaskInfo(benchmark='high_school_microeconomics', col_name='High School Microeconomics', metric='accuracy')
    High_School_World_History = TaskInfo(benchmark='high_school_world_history', col_name='High School World History', metric='accuracy')
    Conceptual_Physics = TaskInfo(benchmark='conceptual_physics', col_name='Conceptual Physics', metric='accuracy')
    Marketing = TaskInfo(benchmark='marketing', col_name='Marketing', metric='accuracy')
    High_School_Statistics = TaskInfo(benchmark='high_school_statistics', col_name='High School Statistics', metric='accuracy')
    High_School_US_History = TaskInfo(benchmark='high_school_us_history', col_name='High School US History', metric='accuracy')
    High_School_Chemistry = TaskInfo(benchmark='high_school_chemistry', col_name='High School Chemistry', metric='accuracy')
    Sociology = TaskInfo(benchmark='sociology', col_name='Sociology', metric='accuracy')
    High_School_Geography = TaskInfo(benchmark='high_school_geography', col_name='High School Geography', metric='accuracy')
    High_School_Government_and_Politics = TaskInfo(benchmark='high_school_government_and_politics', col_name='High School Government and Politics', metric='accuracy')
    College_Medicine = TaskInfo(benchmark='college_medicine', col_name='College Medicine', metric='accuracy')
    Virology = TaskInfo(benchmark='virology', col_name='Virology', metric='accuracy')
    High_School_European_History = TaskInfo(benchmark='high_school_european_history', col_name='High School European History', metric='accuracy')
    Logical_Fallacies = TaskInfo(benchmark='logical_fallacies', col_name='Logical Fallacies', metric='accuracy')
    Astronomy = TaskInfo(benchmark='astronomy', col_name='Astronomy', metric='accuracy')
    High_School_Physics = TaskInfo(benchmark='high_school_physics', col_name='High School Physics', metric='accuracy')
    Electrical_Engineering = TaskInfo(benchmark='electrical_engineering', col_name='Electrical Engineering', metric='accuracy')
    College_Biology = TaskInfo(benchmark='college_biology', col_name='College Biology', metric='accuracy')
    Anatomy = TaskInfo(benchmark='anatomy', col_name='Anatomy', metric='accuracy')
    Formal_Logic = TaskInfo(benchmark='formal_logic', col_name='Formal Logic', metric='accuracy')
    International_Law = TaskInfo(benchmark='international_law', col_name='International Law', metric='accuracy')
    Econometrics = TaskInfo(benchmark='econometrics', col_name='Econometrics', metric='accuracy')
    Machine_Learning = TaskInfo(benchmark='machine_learning', col_name='Machine Learning', metric='accuracy')
    Management = TaskInfo(benchmark='management', col_name='Management', metric='accuracy')
    College_Physics = TaskInfo(benchmark='college_physics', col_name='College Physics', metric='accuracy')
    US_Foreign_Policy = TaskInfo(benchmark='us_foreign_policy', col_name='US Foreign Policy', metric='accuracy')
    Business_Ethics = TaskInfo(benchmark='business_ethics', col_name='Business Ethics', metric='accuracy')
    College_Mathematics = TaskInfo(benchmark='college_mathematics', col_name='College Mathematics', metric='accuracy')
    College_Chemistry = TaskInfo(benchmark='college_chemistry', col_name='College Chemistry', metric='accuracy')
    College_Computer_Science = TaskInfo(benchmark='college_computer_science', col_name='College Computer Science', metric='accuracy')
    High_School_Computer_Science = TaskInfo(benchmark='high_school_computer_science', col_name='High School Computer Science', metric='accuracy')
    Computer_Security = TaskInfo(benchmark='computer_security', col_name='Computer Security', metric='accuracy')
    Global_Facts = TaskInfo(benchmark='global_facts', col_name='Global Facts', metric='accuracy')
    Medical_Genetics = TaskInfo(benchmark='medical_genetics', col_name='Medical Genetics', metric='accuracy')
    Abstract_Algebra = TaskInfo(benchmark='abstract_algebra', col_name='Abstract Algebra', metric='accuracy')
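

# Illustrative sketch (not part of the original module): one plausible way the Tasks enum
# and the TaskInfo records above might be consumed when building the leaderboard table.
# The helper names below are assumptions for illustration, not an existing API.
def get_benchmark_keys() -> list[str]:
    """Dataset subset identifiers, e.g. 'professional_law', used to look up evaluation results."""
    return [task.value.benchmark for task in Tasks]


def get_column_names() -> list[str]:
    """Human-readable column labels, e.g. 'Professional Law', shown on the leaderboard."""
    return [task.value.col_name for task in Tasks]

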
TITLE = """
<h1 align="center">Arabic MMMLU Evaluation Leaderboard for LLMs</h1>
"""

INTRODUCTION_TEXT = """
Welcome to the Arabic MMMLU Evaluation Leaderboard for LLMs. This leaderboard displays the performance of various language models on the MMMLU dataset across different subjects.
"""

LLM_BENCHMARKS_TEXT = """
## About the MMMLU Benchmark

The Multilingual Massive Multitask Language Understanding (MMMLU) benchmark evaluates models on a wide range of subjects, with the MMLU test questions translated into multiple languages, including Arabic.

## How to Interpret the Leaderboard

- **Model**: The name of the evaluated model.
- **Average ⬆️**: The average accuracy across all subjects.
- **Subject Columns**: The accuracy (%) for each individual subject.

## How to Submit Your Model

Go to the **Submit here!** tab and provide your model details to have it evaluated and added to the leaderboard.
"""
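
# Illustrative sketch of how the "Average" column described above could be computed from the
# per-subject accuracies; the leaderboard app's actual aggregation code may differ, and
# `subject_scores` is a hypothetical input mapping column names to accuracy percentages.
def compute_average_accuracy(subject_scores: dict[str, float]) -> float:
    """Return the mean accuracy (%) across all subjects present in `subject_scores`."""
    if not subject_scores:
        return 0.0
    return sum(subject_scores.values()) / len(subject_scores)
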
EVALUATION_QUEUE_TEXT = """
Below are the lists of models that have been evaluated, are currently being evaluated, or are pending evaluation.
"""

CITATION_BUTTON_LABEL = "Citation"
CITATION_BUTTON_TEXT = """
If you use this leaderboard or the MMMLU dataset in your research, please cite:

@misc{AMMMLU,
  author = {Nacar, Omer},
  title = {Arabic MMMLU Evaluation for LLMs Leaderboard},
  year = {2024},
  publisher = {Omartificial-Intelligence-Space}
}
"""