|
from enum import Enum |
|
from dataclasses import dataclass |
|
|
|
|
|
@dataclass
class TaskInfo:
    """Metadata for one leaderboard evaluation task (an MMMLU subject)."""

    benchmark: str  # machine key of the benchmark subset, e.g. 'professional_law'
    col_name: str   # human-readable column header shown on the leaderboard, e.g. 'Professional Law'
    metric: str     # name of the reported metric (always 'accuracy' for the tasks below)
|
|
|
|
|
|
|
|
|
from enum import Enum |
|
from dataclasses import dataclass |
|
|
|
|
|
# NOTE(review): this is an exact duplicate of the TaskInfo definition earlier in
# this file (the later binding wins at import time) — consider removing one copy.
@dataclass
class TaskInfo:
    """Metadata for one leaderboard evaluation task (an MMMLU subject)."""

    benchmark: str  # machine key of the benchmark subset, e.g. 'professional_law'
    col_name: str   # human-readable column header shown on the leaderboard, e.g. 'Professional Law'
    metric: str     # name of the reported metric (always 'accuracy' for the tasks below)
|
|
|
|
|
def _accuracy_task(benchmark: str, col_name: str) -> TaskInfo:
    """Build a TaskInfo for an accuracy-scored MMMLU subject."""
    return TaskInfo(benchmark=benchmark, col_name=col_name, metric='accuracy')


class Tasks(Enum):
    """All MMMLU subjects tracked on the leaderboard.

    Each member's value is a TaskInfo carrying the benchmark key, the
    display column name, and the reported metric (accuracy for every
    subject).
    """

    Professional_Law = _accuracy_task('professional_law', 'Professional Law')
    Moral_Scenarios = _accuracy_task('moral_scenarios', 'Moral Scenarios')
    Miscellaneous = _accuracy_task('miscellaneous', 'Miscellaneous')
    High_School_Psychology = _accuracy_task('high_school_psychology', 'High School Psychology')
    High_School_Macroeconomics = _accuracy_task('high_school_macroeconomics', 'High School Macroeconomics')
    Elementary_Mathematics = _accuracy_task('elementary_mathematics', 'Elementary Mathematics')
    Prehistory = _accuracy_task('prehistory', 'Prehistory')
    Philosophy = _accuracy_task('philosophy', 'Philosophy')
    High_School_Biology = _accuracy_task('high_school_biology', 'High School Biology')
    Nutrition = _accuracy_task('nutrition', 'Nutrition')
    Professional_Accounting = _accuracy_task('professional_accounting', 'Professional Accounting')
    Professional_Medicine = _accuracy_task('professional_medicine', 'Professional Medicine')
    High_School_Mathematics = _accuracy_task('high_school_mathematics', 'High School Mathematics')
    Clinical_Knowledge = _accuracy_task('clinical_knowledge', 'Clinical Knowledge')
    Security_Studies = _accuracy_task('security_studies', 'Security Studies')
    High_School_Microeconomics = _accuracy_task('high_school_microeconomics', 'High School Microeconomics')
    High_School_World_History = _accuracy_task('high_school_world_history', 'High School World History')
    Conceptual_Physics = _accuracy_task('conceptual_physics', 'Conceptual Physics')
    Marketing = _accuracy_task('marketing', 'Marketing')
    High_School_Statistics = _accuracy_task('high_school_statistics', 'High School Statistics')
    High_School_US_History = _accuracy_task('high_school_us_history', 'High School US History')
    High_School_Chemistry = _accuracy_task('high_school_chemistry', 'High School Chemistry')
    Sociology = _accuracy_task('sociology', 'Sociology')
    High_School_Geography = _accuracy_task('high_school_geography', 'High School Geography')
    High_School_Government_and_Politics = _accuracy_task('high_school_government_and_politics', 'High School Government and Politics')
    College_Medicine = _accuracy_task('college_medicine', 'College Medicine')
    Virology = _accuracy_task('virology', 'Virology')
    High_School_European_History = _accuracy_task('high_school_european_history', 'High School European History')
    Logical_Fallacies = _accuracy_task('logical_fallacies', 'Logical Fallacies')
    Astronomy = _accuracy_task('astronomy', 'Astronomy')
    High_School_Physics = _accuracy_task('high_school_physics', 'High School Physics')
    Electrical_Engineering = _accuracy_task('electrical_engineering', 'Electrical Engineering')
    College_Biology = _accuracy_task('college_biology', 'College Biology')
    Anatomy = _accuracy_task('anatomy', 'Anatomy')
    Formal_Logic = _accuracy_task('formal_logic', 'Formal Logic')
    International_Law = _accuracy_task('international_law', 'International Law')
    Econometrics = _accuracy_task('econometrics', 'Econometrics')
    Machine_Learning = _accuracy_task('machine_learning', 'Machine Learning')
    Management = _accuracy_task('management', 'Management')
    College_Physics = _accuracy_task('college_physics', 'College Physics')
    US_Foreign_Policy = _accuracy_task('us_foreign_policy', 'US Foreign Policy')
    Business_Ethics = _accuracy_task('business_ethics', 'Business Ethics')
    College_Mathematics = _accuracy_task('college_mathematics', 'College Mathematics')
    College_Chemistry = _accuracy_task('college_chemistry', 'College Chemistry')
    College_Computer_Science = _accuracy_task('college_computer_science', 'College Computer Science')
    High_School_Computer_Science = _accuracy_task('high_school_computer_science', 'High School Computer Science')
    Computer_Security = _accuracy_task('computer_security', 'Computer Security')
    Global_Facts = _accuracy_task('global_facts', 'Global Facts')
    Medical_Genetics = _accuracy_task('medical_genetics', 'Medical Genetics')
    Abstract_Algebra = _accuracy_task('abstract_algebra', 'Abstract Algebra')
|
|
|
|
|
|
|
|
|
# HTML banner rendered at the top of the leaderboard page (logo hosted on imgbb).
TITLE = """

<div align="center">

<a href="https://imgbb.com/">

<img src="https://i.ibb.co/k1gQsTw/Blue-and-White-Modern-Technology-Company-Logo-2.png" alt="Blue-and-White-Modern-Technology-Company-Logo-2" border="0" width="500" height="auto">

</a>

</div>

"""
|
|
|
|
|
# HTML welcome card shown below the title, introducing the ILMAAM leaderboard.
INTRODUCTION_TEXT = """

<div style="background-color:#001f3f; padding: 20px; border-radius: 10px;">

<h1 style="color:#ffffff; font-family: Arial, sans-serif; text-align: center;">

Welcome to <span style="color:#f39c12;">ILMAAM</span>: Benchmark for Arabic System in Multitask Assessment

</h1>

<p style="color:#d4d4d4; font-family: 'Verdana', sans-serif; font-size: 18px; text-align: center;">

This leaderboard showcases the performance of various Arabic LLMs on the

<strong style="color:#d4d4d4;">newly released MMMLU OpenAI Benchmark</strong> across different subjects.

</p>

</div>

"""
|
|
|
|
|
# Markdown body for the "About" tab: what ILMAAM is and how to read/submit to the leaderboard.
LLM_BENCHMARKS_TEXT = """

## About ILMAAM



ILMAAM is based on The Massive Multitask Multilingual Language Understanding benchmark which is designed to evaluate Arabic models on a wide range of subjects.



## How to Interpret the Leaderboard



- **Model**: The name of the model evaluated.

- **Average ⬆️**: The average accuracy across all subjects.

- **Subject Columns**: The accuracy (%) for each individual subject.



## How to Submit Your Model



Go to the **Submit here!** tab and provide your model details to have it evaluated and appear on the leaderboard.

"""
|
|
|
# Caption shown above the finished / running / pending evaluation queue tables.
EVALUATION_QUEUE_TEXT = """

Below are the lists of models that have been evaluated, are currently being evaluated, or are pending evaluation.

"""
|
|
|
# Label and body for the citation widget at the bottom of the leaderboard page.
CITATION_BUTTON_LABEL = "Citation"
# Fixed: the BibTeX entry previously had a stray '"' after the publisher field,
# and the acknowledgment paragraph sat inside the @misc{...} braces, producing
# an invalid entry. The acknowledgment now follows the closed entry.
CITATION_BUTTON_TEXT = """
If you use this leaderboard or the MMMLU dataset in your research, please cite:

@misc{ILMAAM,
  author = {Nacar, Omer},
  title = {ILMAAM: Index for Language Models For Arabic Assessment on Multitasks},
  year = {2024},
  publisher = {Robotics and Internet-of-Things Lab, Prince Sultan University, Riyadh}
}

Acknowledgment:

Thanks to Prince Sultan University and RIOTU Lab for their support.
"""
|
|
|
|