Commit 4b7beb0
Parent: 7f8c86c

update sum

Files changed (1): src/submission/submit.py (+75, -92)
src/submission/submit.py CHANGED
@@ -1,7 +1,7 @@
 import json
 import os
 from datetime import datetime, timezone
-import traceback
+
 import torch
 import pandas as pd
 import numpy as np
@@ -21,54 +21,52 @@ from src.submission.check_validity import (
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
-def get_top_prediction(text, tokenizer, model):
-    try:
-        inputs = tokenizer(text, return_tensors='pt')
-        if torch.cuda.is_available():
-            model = model.cuda()
-            inputs = {k: v.cuda() for k, v in inputs.items()}
-        else:
-            model = model.cpu()
+# List of subjects to exclude from evaluation
+excluded_subjects = [
+    "human_sexuality",
+    "professional_psychology",
+    "moral_disputes",
+    "public_relations",
+    "jurisprudence",
+    "human_aging",
+    "world_religions"
+]
 
-        with torch.no_grad():
-            outputs = model(**inputs)
-        print(f"outputs.logits shape: {outputs.logits.shape}")
-        seq_len = outputs.logits.size(1)
-        if seq_len == 0:
-            print("No logits were produced by the model.")
-            return None
-        logits = outputs.logits[0, -1, :]  # Shape: [vocab_size]
-
-        options = ['A', 'B', 'C', 'D']
-        option_logits = []
-        for option in options:
-            # Encode the option without adding special tokens
-            option_ids = tokenizer.encode(option, add_special_tokens=False)
-            if not option_ids:
-                print(f"Option '{option}' could not be tokenized.")
-                continue
-            option_id = option_ids[0]
-            vocab_size = logits.size(0)
-            if option_id >= vocab_size:
-                print(f"Option ID {option_id} is out of bounds for vocabulary size {vocab_size}")
-                continue
+def get_top_prediction(text, tokenizer, model):
+    inputs = tokenizer(text, return_tensors='pt')
+    if torch.cuda.is_available():
+        model = model.cuda()
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+    else:
+        model = model.cpu()
+        inputs = {k: v.cpu() for k, v in inputs.items()}
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits = outputs.logits[0, -1]  # Get logits of the last token
+
+    options = [' A', ' B', ' C', ' D']
+    option_logits = []
+
+    # Iterate through each option
+    for option in options:
+        option_ids = tokenizer(option).input_ids
+        # Ensure option_ids are within range and not empty
+        if option_ids and option_ids[-1] < logits.size(0):
+            option_id = option_ids[-1]
             option_logit = logits[option_id]
-            option_logits.append((option_logit.item(), option))
-
-        if not option_logits:
-            print("No valid options found.")
-            return None
+            option_logits.append((option_logit.item(), option.strip()))
+        else:
+            print(f"Skipping option '{option}' due to index out of range.")
 
-        # Get the option with the highest logit
-        top_option = max(option_logits, key=lambda x: x[0])[1]
-        return top_option
-    except Exception as e:
-        tb = traceback.format_exc()
-        print(f"Error in get_top_prediction: {e}\n{tb}")
-        return None
+    if not option_logits:
+        return "No valid options"
 
+    # Get the option with the highest logit
+    top_option = max(option_logits, key=lambda x: x[0])[1]
+    return top_option
 
-def evaluate_model_accuracy(model_name, num_examples):
+def evaluate_model_accuracy_by_subject(model_name, num_examples):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -83,47 +81,43 @@ def evaluate_model_accuracy(model_name, num_examples):
         else:
             model = model.cpu()
 
-        # Load your dataset
+        # Load your custom MMMLU dataset
        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
        dataset = dataset['test']
 
-        # Convert the dataset to a pandas DataFrame for easier manipulation
-        df_dataset = dataset.to_pandas()
-
-        # Get list of unique subjects
-        subjects = df_dataset['Subject'].unique()
+        # Filter out excluded subjects
+        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
-
 Question: {Question}
 A) {A}
 B) {B}
 C) {C}
 D) {D}
-
 Answer:"""
 
         prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
 
-        # Initialize counters and results
+        # Initialize results storage
+        subject_results = {}
+
+        subjects = dataset.unique('Subject')
         overall_correct_predictions = 0
         overall_total_questions = 0
-        per_subject_results = []
-        detailed_results = []
 
         for subject in subjects:
-            # Filter dataset for the current subject
-            subject_df = df_dataset[df_dataset['Subject'] == subject]
+            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
 
-            # Select up to num_examples questions
-            subject_df = subject_df.sample(n=min(num_examples, len(subject_df)), random_state=42)
+            # Sample num_examples from each subject
+            if num_examples > 0:
+                subject_data = subject_data.shuffle().select(range(min(num_examples, len(subject_data))))
 
-            # Initialize counters for this subject
             correct_predictions = 0
             total_questions = 0
+            results = []
 
-            for idx, data in subject_df.iterrows():
+            for data in subject_data:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
@@ -135,48 +129,38 @@ Answer:"""
 
                 # Get the top prediction
                 top_prediction = get_top_prediction(text, tokenizer, model)
-                if top_prediction is None:
-                    print(f"Skipping question due to tokenization issues: {data['Question']}")
-                    continue  # Skip this question if no valid options are found
-
                 is_correct = (top_prediction == data['Answer'])
                 correct_predictions += int(is_correct)
                 total_questions += 1
                 overall_correct_predictions += int(is_correct)
                 overall_total_questions += 1
 
-                detailed_results.append({
-                    'Subject': subject,
+                results.append({
                     'Question': data['Question'],
                     'Answer': data['Answer'],
                     'Prediction': top_prediction,
                     'Correct': is_correct
                 })
 
-            # Compute accuracy for this subject
-            subject_accuracy = correct_predictions / total_questions if total_questions > 0 else 0
+            accuracy = correct_predictions / total_questions if total_questions > 0 else 0
 
-            per_subject_results.append({
-                'Subject': subject,
-                'Total Score': correct_predictions,
+            # Store results for this subject
+            subject_results[subject] = {
+                'Correct Predictions': correct_predictions,
                 'Total Questions': total_questions,
-                'Accuracy (%)': subject_accuracy * 100
-            })
-
-        # Compute overall accuracy
-        overall_accuracy = overall_correct_predictions / overall_total_questions if overall_total_questions > 0 else 0
-
-        # Convert per_subject_results to DataFrame
-        df_per_subject = pd.DataFrame(per_subject_results)
+                'Accuracy': accuracy * 100,
+                'Results DataFrame': pd.DataFrame(results)
+            }
 
-        # Convert detailed_results to DataFrame
-        df_detailed_results = pd.DataFrame(detailed_results)
+        overall_accuracy = (overall_correct_predictions / overall_total_questions) * 100 if overall_total_questions > 0 else 0
 
-        return overall_accuracy, df_per_subject, df_detailed_results
+        return overall_accuracy, subject_results
 
     except Exception as e:
-        return f"Error: {str(e)}", pd.DataFrame(), pd.DataFrame()
-
+        import traceback
+        tb = traceback.format_exc()
+        print(f"Error in evaluate_model_accuracy_by_subject: {e}\n{tb}")
+        return f"Error: {str(e)}", {}
 
 def add_new_eval(
     model: str,
@@ -185,7 +169,7 @@ def add_new_eval(
     precision: str,
     weight_type: str,
     model_type: str,
-    num_examples: int  # New parameter
+    num_examples: int
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -243,7 +227,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, df_per_subject, df_detailed_results = evaluate_model_accuracy(model, int(num_examples))
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, int(num_examples))
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
@@ -265,15 +249,14 @@ def add_new_eval(
             "precision": precision,
         },
         "results": {
-            "average": overall_accuracy * 100,
+            "average": overall_accuracy,
         },
     }
 
     # Include per-subject accuracies
-    for idx, row in df_per_subject.iterrows():
-        subject_name = row['Subject']
-        accuracy = row['Accuracy (%)']
-        results_dict['results'][subject_name] = accuracy
+    for subject, data in subject_results.items():
+        accuracy = data['Accuracy']
+        results_dict['results'][subject] = accuracy
 
     # Save results to a JSON file
     results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
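
For context, the core change in this commit is how an answer is picked: instead of encoding each bare letter and taking its first token id, the new get_top_prediction scores the leading-space variants ' A' through ' D' by the logit their last token receives at the final position of the prompt, then returns the highest-scoring letter. Below is a minimal, self-contained sketch of that idea; it is not the submission code itself, and the "gpt2" checkpoint and the pick_option name are placeholders used only for illustration.

# Sketch of the option-scoring idea behind the new get_top_prediction.
# Assumptions: any causal LM loadable via transformers; "gpt2" is a placeholder checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def pick_option(prompt, model, tokenizer, options=(' A', ' B', ' C', ' D')):
    inputs = tokenizer(prompt, return_tensors='pt')
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # next-token logits at the last position
    scores = {}
    for option in options:
        ids = tokenizer(option, add_special_tokens=False).input_ids
        if ids and ids[-1] < logits.size(0):  # same bounds check as the diff
            scores[option.strip()] = logits[ids[-1]].item()
    return max(scores, key=scores.get) if scores else "No valid options"

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    prompt = "Question: 2 + 2 = ?\nA) 3\nB) 4\nC) 5\nD) 6\nAnswer:"
    print(pick_option(prompt, model, tokenizer))  # prints one of A/B/C/D

One difference worth noting: the diff's version calls tokenizer(option) without add_special_tokens=False and then uses input_ids[-1], which behaves the same for tokenizers that add no trailing special token but can pick up a special-token id on tokenizers that append one.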