Commit 4b7beb0
Parent: 7f8c86c

update sum

Files changed (1): src/submission/submit.py (+75, -92)
src/submission/submit.py CHANGED
@@ -1,7 +1,7 @@
 import json
 import os
 from datetime import datetime, timezone
-import traceback
+
 import torch
 import pandas as pd
 import numpy as np
@@ -21,54 +21,52 @@ from src.submission.check_validity import (
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
-def get_top_prediction(text, tokenizer, model):
-    try:
-        inputs = tokenizer(text, return_tensors='pt')
-        if torch.cuda.is_available():
-            model = model.cuda()
-            inputs = {k: v.cuda() for k, v in inputs.items()}
-        else:
-            model = model.cpu()
+# List of subjects to exclude from evaluation
+excluded_subjects = [
+    "human_sexuality",
+    "professional_psychology",
+    "moral_disputes",
+    "public_relations",
+    "jurisprudence",
+    "human_aging",
+    "world_religions"
+]
 
-        with torch.no_grad():
-            outputs = model(**inputs)
-        print(f"outputs.logits shape: {outputs.logits.shape}")
-        seq_len = outputs.logits.size(1)
-        if seq_len == 0:
-            print("No logits were produced by the model.")
-            return None
-        logits = outputs.logits[0, -1, :]  # Shape: [vocab_size]
-
-        options = ['A', 'B', 'C', 'D']
-        option_logits = []
-        for option in options:
-            # Encode the option without adding special tokens
-            option_ids = tokenizer.encode(option, add_special_tokens=False)
-            if not option_ids:
-                print(f"Option '{option}' could not be tokenized.")
-                continue
-            option_id = option_ids[0]
-            vocab_size = logits.size(0)
-            if option_id >= vocab_size:
-                print(f"Option ID {option_id} is out of bounds for vocabulary size {vocab_size}")
-                continue
+def get_top_prediction(text, tokenizer, model):
+    inputs = tokenizer(text, return_tensors='pt')
+    if torch.cuda.is_available():
+        model = model.cuda()
+        inputs = {k: v.cuda() for k, v in inputs.items()}
+    else:
+        model = model.cpu()
+        inputs = {k: v.cpu() for k, v in inputs.items()}
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits = outputs.logits[0, -1]  # Get logits of the last token
+
+    options = [' A', ' B', ' C', ' D']
+    option_logits = []
+
+    # Iterate through each option
+    for option in options:
+        option_ids = tokenizer(option).input_ids
+        # Ensure option_ids are within range and not empty
+        if option_ids and option_ids[-1] < logits.size(0):
+            option_id = option_ids[-1]
             option_logit = logits[option_id]
-            option_logits.append((option_logit.item(), option))
-
-        if not option_logits:
-            print("No valid options found.")
-            return None
+            option_logits.append((option_logit.item(), option.strip()))
+        else:
+            print(f"Skipping option '{option}' due to index out of range.")
 
-        # Get the option with the highest logit
-        top_option = max(option_logits, key=lambda x: x[0])[1]
-        return top_option
-    except Exception as e:
-        tb = traceback.format_exc()
-        print(f"Error in get_top_prediction: {e}\n{tb}")
-        return None
+    if not option_logits:
+        return "No valid options"
 
+    # Get the option with the highest logit
+    top_option = max(option_logits, key=lambda x: x[0])[1]
+    return top_option
 
-def evaluate_model_accuracy(model_name, num_examples):
+def evaluate_model_accuracy_by_subject(model_name, num_examples):
     try:
         # Load the model and tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
@@ -83,47 +81,43 @@ def evaluate_model_accuracy(model_name, num_examples):
         else:
             model = model.cpu()
 
-        # Load your dataset
+        # Load your custom MMMLU dataset
        dataset = load_dataset("Omartificial-Intelligence-Space/Arabic_Openai_MMMLU")
        dataset = dataset['test']
 
-        # Convert the dataset to a pandas DataFrame for easier manipulation
-        df_dataset = dataset.to_pandas()
-
-        # Get list of unique subjects
-        subjects = df_dataset['Subject'].unique()
+        # Filter out excluded subjects
+        dataset = dataset.filter(lambda x: x['Subject'] not in excluded_subjects)
 
         # Define prompt template
         template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
-
 Question: {Question}
 A) {A}
 B) {B}
 C) {C}
 D) {D}
-
 Answer:"""
 
         prompt_template = PromptTemplate(template=template, input_variables=['Question', 'A', 'B', 'C', 'D'])
 
-        # Initialize counters and results
+        # Initialize results storage
+        subject_results = {}
+
+        subjects = dataset.unique('Subject')
         overall_correct_predictions = 0
         overall_total_questions = 0
-        per_subject_results = []
-        detailed_results = []
 
         for subject in subjects:
-            # Filter dataset for the current subject
-            subject_df = df_dataset[df_dataset['Subject'] == subject]
+            subject_data = dataset.filter(lambda x: x['Subject'] == subject)
 
-            # Select up to num_examples questions
-            subject_df = subject_df.sample(n=min(num_examples, len(subject_df)), random_state=42)
+            # Sample num_examples from each subject
+            if num_examples > 0:
+                subject_data = subject_data.shuffle().select(range(min(num_examples, len(subject_data))))
 
-            # Initialize counters for this subject
             correct_predictions = 0
             total_questions = 0
+            results = []
 
-            for idx, data in subject_df.iterrows():
+            for data in subject_data:
                 # Prepare text input
                 text = prompt_template.format(
                     Question=data['Question'],
@@ -135,48 +129,38 @@ Answer:"""
 
                 # Get the top prediction
                 top_prediction = get_top_prediction(text, tokenizer, model)
-                if top_prediction is None:
-                    print(f"Skipping question due to tokenization issues: {data['Question']}")
-                    continue  # Skip this question if no valid options are found
-
                 is_correct = (top_prediction == data['Answer'])
                 correct_predictions += int(is_correct)
                 total_questions += 1
                 overall_correct_predictions += int(is_correct)
                 overall_total_questions += 1
 
-                detailed_results.append({
-                    'Subject': subject,
+                results.append({
                     'Question': data['Question'],
                     'Answer': data['Answer'],
                     'Prediction': top_prediction,
                     'Correct': is_correct
                 })
 
-            # Compute accuracy for this subject
-            subject_accuracy = correct_predictions / total_questions if total_questions > 0 else 0
+            accuracy = correct_predictions / total_questions if total_questions > 0 else 0
 
-            per_subject_results.append({
-                'Subject': subject,
-                'Total Score': correct_predictions,
+            # Store results for this subject
+            subject_results[subject] = {
+                'Correct Predictions': correct_predictions,
                 'Total Questions': total_questions,
-                'Accuracy (%)': subject_accuracy * 100
-            })
-
-        # Compute overall accuracy
-        overall_accuracy = overall_correct_predictions / overall_total_questions if overall_total_questions > 0 else 0
-
-        # Convert per_subject_results to DataFrame
-        df_per_subject = pd.DataFrame(per_subject_results)
+                'Accuracy': accuracy * 100,
+                'Results DataFrame': pd.DataFrame(results)
+            }
 
-        # Convert detailed_results to DataFrame
-        df_detailed_results = pd.DataFrame(detailed_results)
+        overall_accuracy = (overall_correct_predictions / overall_total_questions) * 100 if overall_total_questions > 0 else 0
 
-        return overall_accuracy, df_per_subject, df_detailed_results
+        return overall_accuracy, subject_results
 
     except Exception as e:
-        return f"Error: {str(e)}", pd.DataFrame(), pd.DataFrame()
-
+        import traceback
+        tb = traceback.format_exc()
+        print(f"Error in evaluate_model_accuracy_by_subject: {e}\n{tb}")
+        return f"Error: {str(e)}", {}
 
 def add_new_eval(
     model: str,
@@ -185,7 +169,7 @@ def add_new_eval(
     precision: str,
     weight_type: str,
     model_type: str,
-    num_examples: int  # New parameter
+    num_examples: int
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -243,7 +227,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, df_per_subject, df_detailed_results = evaluate_model_accuracy(model, int(num_examples))
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, int(num_examples))
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
@@ -265,15 +249,14 @@ def add_new_eval(
             "precision": precision,
         },
         "results": {
-            "average": overall_accuracy * 100,
+            "average": overall_accuracy,
         },
     }
 
     # Include per-subject accuracies
-    for idx, row in df_per_subject.iterrows():
-        subject_name = row['Subject']
-        accuracy = row['Accuracy (%)']
-        results_dict['results'][subject_name] = accuracy
+    for subject, data in subject_results.items():
+        accuracy = data['Accuracy']
+        results_dict['results'][subject] = accuracy
 
     # Save results to a JSON file
     results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
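
For context, the core change in this commit is how an answer is picked: instead of encoding each bare letter and taking its first token id, the new get_top_prediction scores the leading-space variants ' A' through ' D' by the logit their last token receives at the final position of the prompt, then returns the highest-scoring letter. Below is a minimal, self-contained sketch of that idea; it is not the submission code itself, and the "gpt2" checkpoint and the pick_option name are placeholders used only for illustration.

# Sketch of the option-scoring idea behind the new get_top_prediction.
# Assumptions: any causal LM loadable via transformers; "gpt2" is a placeholder checkpoint.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def pick_option(prompt, model, tokenizer, options=(' A', ' B', ' C', ' D')):
    inputs = tokenizer(prompt, return_tensors='pt')
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # next-token logits at the last position
    scores = {}
    for option in options:
        ids = tokenizer(option, add_special_tokens=False).input_ids
        if ids and ids[-1] < logits.size(0):  # same bounds check as the diff
            scores[option.strip()] = logits[ids[-1]].item()
    return max(scores, key=scores.get) if scores else "No valid options"

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    prompt = "Question: 2 + 2 = ?\nA) 3\nB) 4\nC) 5\nD) 6\nAnswer:"
    print(pick_option(prompt, model, tokenizer))  # prints one of A/B/C/D

One difference worth noting: the diff's version calls tokenizer(option) without add_special_tokens=False and then uses input_ids[-1], which behaves the same for tokenizers that add no trailing special token but can pick up a special-token id on tokenizers that append one.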