Commit 96f572b
1 Parent(s): 42390ad

update submit

Files changed (1)
  1. src/submission/submit.py +178 -35
src/submission/submit.py CHANGED
@@ -2,8 +2,15 @@ import json
 import os
 from datetime import datetime, timezone
 
+import torch
+import pandas as pd
+import numpy as np
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from langchain.prompts import PromptTemplate
+
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -14,6 +21,130 @@ from src.submission.check_validity import (
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+def get_top_prediction(text, tokenizer, model):
+    inputs = tokenizer(text, return_tensors='pt')
+    if torch.cuda.is_available():
+        model = model.cuda()
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits[0, -1]
+
+    options = [' A', ' B', ' C', ' D']
+    option_logits = []
+    for option in options:
+        option_id = tokenizer(option).input_ids[-1]
+        option_logit = logits[option_id]
+        option_logits.append((option_logit.item(), option.strip()))
+
+    # Get the option with the highest logit
+    top_option = max(option_logits, key=lambda x: x[0])[1]
+    return top_option
+
+def evaluate_model_accuracy(model_name, num_examples):
+    try:
+        # Load the model and tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+        tokenizer.pad_token = tokenizer.eos_token
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+        if torch.cuda.is_available():
+            model = model.cuda()  # Move model to GPU if available
+
+        # Load your dataset
+        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
+        dataset = dataset['test']
+
+        # Convert the dataset to a pandas DataFrame for easier manipulation
+        df_dataset = dataset.to_pandas()
+
+        # Get list of unique subjects
+        subjects = df_dataset['Subject'].unique()
+
+        # Define prompt template
+        template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
+
+Question: {Question}
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+
+Answer:"""
+
+        prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
+
+        # Initialize counters and results
+        overall_correct_predictions = 0
+        overall_total_questions = 0
+        per_subject_results = []
+        detailed_results = []
+
+        for subject in subjects:
+            # Filter dataset for the current subject
+            subject_df = df_dataset[df_dataset['Subject'] == subject]
+
+            # Select up to num_examples questions
+            subject_df = subject_df.sample(n=min(num_examples, len(subject_df)), random_state=42)
+
+            # Initialize counters for this subject
+            correct_predictions = 0
+            total_questions = 0
+
+            for idx, data in subject_df.iterrows():
+                # Prepare text input
+                text = prompt_template.format(
+                    Question=data['Question'],
+                    A=data['A'],
+                    B=data['B'],
+                    C=data['C'],
+                    D=data['D']
+                )
+
+                # Get the top prediction
+                top_prediction = get_top_prediction(text, tokenizer, model)
+                is_correct = (top_prediction == data['Answer'])
+                correct_predictions += int(is_correct)
+                total_questions += 1
+                overall_correct_predictions += int(is_correct)
+                overall_total_questions += 1
+
+                detailed_results.append({
+                    'Subject': subject,
+                    'Question': data['Question'],
+                    'Answer': data['Answer'],
+                    'Prediction': top_prediction,
+                    'Correct': is_correct
+                })
+
+            # Compute accuracy for this subject
+            subject_accuracy = correct_predictions / total_questions if total_questions > 0 else 0
+
+            per_subject_results.append({
+                'Subject': subject,
+                'Total Score': correct_predictions,
+                'Total Questions': total_questions,
+                'Accuracy (%)': subject_accuracy * 100
+            })
+
+        # Compute overall accuracy
+        overall_accuracy = overall_correct_predictions / overall_total_questions if overall_total_questions > 0 else 0
+
+        # Convert per_subject_results to DataFrame
+        df_per_subject = pd.DataFrame(per_subject_results)
+
+        # Convert detailed_results to DataFrame
+        df_detailed_results = pd.DataFrame(detailed_results)
+
+        return overall_accuracy, df_per_subject, df_detailed_results
+
+    except Exception as e:
+        return f"Error: {str(e)}", pd.DataFrame(), pd.DataFrame()
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -21,6 +152,7 @@ def add_new_eval(
     precision: str,
     weight_type: str,
     model_type: str,
+    num_examples: int  # New parameter
 ):
     global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
@@ -72,48 +204,59 @@ def add_new_eval(
     if not modelcard_OK:
         return styled_error(error_msg)
 
-    # Seems good, creating the eval
-    print("Adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-
     # Check for duplicate submission
     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
 
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    # Now, perform the evaluation
+    try:
+        overall_accuracy, df_per_subject, df_detailed_results = evaluate_model_accuracy(model, int(num_examples))
+        if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
+            return styled_error(overall_accuracy)
+    except Exception as e:
+        return styled_error(f"An error occurred during evaluation: {str(e)}")
 
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
+    # Prepare results for storage
+    results_dict = {
+        "config": {
+            "model_name": model,
+            "model_sha": revision,
+            "model_dtype": precision,
+            "submitted_time": current_time,
+            "model_type": model_type,
+            "weight_type": weight_type,
+            "license": license,
+            "likes": model_info.likes,
+            "params": model_size,
+            "still_on_hub": True,
+            "precision": precision,
+        },
+        "results": {
+            "average": overall_accuracy * 100,
+        },
+    }
+
+    # Include per-subject accuracies
+    for idx, row in df_per_subject.iterrows():
+        subject_name = row['Subject']
+        accuracy = row['Accuracy (%)']
+        results_dict['results'][subject_name] = accuracy
 
-    print("Uploading eval file")
+    # Save results to a JSON file
+    results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
+    with open(results_file_path, "w") as f:
+        json.dump(results_dict, f)
+
+    # Upload the results file
     API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
+        path_or_fileobj=results_file_path,
+        path_in_repo=results_file_path.split(f"{EVAL_RESULTS_PATH}/")[1],
+        repo_id=RESULTS_REPO,
         repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
+        commit_message=f"Add results for {model}"
    )
 
-    # Remove the local file
-    os.remove(out_path)
+    # Remove the local results file
+    os.remove(results_file_path)
 
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )
+    return styled_message("Your model has been evaluated and the results are now on the leaderboard!")
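
A note on the scoring approach in the new get_top_prediction helper: instead of generating text, it reads the model's next-token logits at the final prompt position and picks whichever of the option tokens ' A', ' B', ' C', ' D' has the highest score. A minimal self-contained sketch of the same idea ('gpt2' and the toy question are only illustrative stand-ins, not what the Space actually evaluates):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Illustrative checkpoint; the Space scores the submitted model instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

prompt = (
    "Answer the following multiple choice question by giving the most appropriate "
    "response. Answer should be one among [A, B, C, D].\n\n"
    "Question: What is 2 + 2?\nA) 3\nB) 4\nC) 5\nD) 6\n\nAnswer:"
)

inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits[0, -1]  # logits for the next token

# Score each option by the logit of its leading-space token (' A', ' B', ...).
scores = {
    opt.strip(): logits[tokenizer(opt).input_ids[-1]].item()
    for opt in [" A", " B", " C", " D"]
}
print(max(scores, key=scores.get))  # predicted option letter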
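For reference, the results file that add_new_eval now writes under EVAL_RESULTS_PATH and uploads to RESULTS_REPO carries the submission config plus a "results" block with an "average" score and one entry per subject. A sketch of a plausible payload (all values and subject names below are invented for illustration):

import json

# Hypothetical payload mirroring results_dict in the diff above; every value here is made up.
example = {
    "config": {
        "model_name": "some-org/some-model",
        "model_sha": "main",
        "model_dtype": "float16",
        "model_type": "pretrained",
        "weight_type": "Original",
        "still_on_hub": True,
        "precision": "float16",
        # plus submitted_time, license, likes and params, as in the diff above
    },
    "results": {
        "average": 41.7,       # overall_accuracy * 100
        "Law": 38.0,           # one "Accuracy (%)" entry per Subject value
        "Philosophy": 45.5,
    },
}

# The file is named after the model id and uploaded with a path relative to EVAL_RESULTS_PATH:
model = "some-org/some-model"
print(f"{model.replace('/', '_')}_results.json")
print(json.dumps(example, indent=2))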
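Because add_new_eval now requires num_examples, any caller (in this Space, presumably the Gradio submit handler in app.py, which this commit does not touch) has to pass the extra argument. A hypothetical call, with every value invented for illustration:

from src.submission.submit import add_new_eval

# Hypothetical invocation; the real wiring lives in the Space's UI code.
message = add_new_eval(
    "some-org/some-model",  # model id on the Hub
    "",                     # base_model
    "main",                 # revision
    "float16",              # precision
    "Original",             # weight_type
    "pretrained",           # model_type
    num_examples=5,         # new: questions sampled per subject
)
print(message)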