gardarjuto committed on
Commit
8446c23
1 Parent(s): fcffb23

collect results data per question

Browse files
Files changed (1) hide show
  1. quiz.py +46 -14
quiz.py CHANGED
@@ -3,7 +3,7 @@ from dataclasses import dataclass
3
  from typing import Any, Dict, List, Optional
4
  import random
5
  import matplotlib.pyplot as plt
6
- from score import calculate_gpt4o_score, BENCHMARK_SCORES
7
 
8
 
9
  # Define benchmarks
@@ -48,7 +48,7 @@ DATASETS = {
48
  name=BENCHMARKS[dataset_name].get("config_name"),
49
  split=BENCHMARKS[dataset_name].get("split", "train"),
50
  )
51
- for dataset_name in BENCHMARKS
52
  }
53
 
54
 
@@ -139,6 +139,7 @@ class QuizState:
139
  user_answers: List[Optional[str]]
140
  correct_answers: List[str]
141
  quiz_completed: bool
 
142
 
143
 
144
  @dataclass
@@ -166,6 +167,7 @@ class BenchmarkQuiz:
166
  user_answers=[None] * len(samples),
167
  correct_answers=correct_answers,
168
  quiz_completed=False,
 
169
  )
170
  return self.state
171
 
@@ -232,37 +234,43 @@ class BenchmarkQuiz:
232
  return {"completed": False, "question_data": self.update_question()}
233
  else:
234
  self.state.quiz_completed = True
235
- user_score = self.calculate_score()
236
- plot = self.plot_score(user_score)
237
- return {"completed": True, "plot": plot}
 
 
 
 
 
238
 
239
  def previous_question(self) -> QuestionData:
240
  if self.state.current_question > 0:
241
  self.state.current_question -= 1
242
  return self.update_question()
243
 
244
- def calculate_score(self) -> float:
245
  if self.state.benchmark_name == "icelandic-wiki-qa":
246
  queries = [sample["question"] for sample in self.state.samples]
247
- return calculate_gpt4o_score(
248
  queries, self.state.user_answers, self.state.correct_answers
249
  )
250
 
251
- score = sum(
252
- user_answer == correct_answer
253
  for user_answer, correct_answer in zip(
254
  self.state.user_answers, self.state.correct_answers
255
  )
256
- )
257
- return score / len(self.state.correct_answers)
258
 
259
- def plot_score(self, user_score: float):
 
260
  scores = {**BENCHMARK_SCORES[self.state.benchmark_name], "Þú": 100 * user_score}
261
  # Sort by score
262
  scores = dict(sorted(scores.items(), key=lambda item: item[1]))
263
 
264
  # Define colors for user vs models
265
- colors = {name: "tab:blue" for name in scores.keys()}
266
  colors["Þú"] = "tab:green"
267
 
268
  fig, ax = plt.subplots(figsize=(10, 6), dpi=250)
@@ -276,8 +284,32 @@ class BenchmarkQuiz:
276
  )
277
  ax.set_axisbelow(True)
278
  ax.xaxis.grid(True, linestyle="--", alpha=0.6)
279
- ax.set_title(f"{BENCHMARKS[self.state.benchmark_name]['name']}: Svona stóðstu þig miðað við mállíkönin", pad=20)
 
 
 
280
  ax.set_xlabel("Stig (%)")
281
  ax.set_xlim(0, 100)
282
  plt.tight_layout()
283
  return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from typing import Any, Dict, List, Optional
4
  import random
5
  import matplotlib.pyplot as plt
6
+ from score import calculate_gpt4o_scores, BENCHMARK_SCORES
7
 
8
 
9
  # Define benchmarks
 
48
  name=BENCHMARKS[dataset_name].get("config_name"),
49
  split=BENCHMARKS[dataset_name].get("split", "train"),
50
  )
51
+ for dataset_name in BENCHMARKS
52
  }
53
 
54
 
 
139
  user_answers: List[Optional[str]]
140
  correct_answers: List[str]
141
  quiz_completed: bool
142
+ user_scores: List[Optional[float]]
143
 
144
 
145
  @dataclass
 
167
  user_answers=[None] * len(samples),
168
  correct_answers=correct_answers,
169
  quiz_completed=False,
170
+ user_scores=[None] * len(samples),
171
  )
172
  return self.state
173
 
 
234
  return {"completed": False, "question_data": self.update_question()}
235
  else:
236
  self.state.quiz_completed = True
237
+ user_scores = self.calculate_scores()
238
+ self.state.user_scores = user_scores
239
+ plot = self.plot_score(user_scores)
240
+ return {
241
+ "completed": True,
242
+ "plot": plot,
243
+ "results_data": self.get_results_data(),
244
+ }
245
 
246
  # Step back one question: the current-question index is decremented only
  # while it is above 0, and the value produced by self.update_question() is
  # returned.
  # NOTE(review): indentation is lost in this view, so whether the return
  # sits inside or after the guard cannot be confirmed here - verify in quiz.py.
  def previous_question(self) -> QuestionData:
247
  if self.state.current_question > 0:
248
  self.state.current_question -= 1
249
  return self.update_question()
250
 
251
def calculate_scores(self) -> List[float]:
    """Return one score per quiz question, in question order.

    For the "icelandic-wiki-qa" benchmark the question texts, the user's
    answers and the reference answers are passed to
    ``calculate_gpt4o_scores`` and its result is returned unchanged.

    Every other benchmark is graded by exact match: 1.0 when the user's
    answer equals the reference answer, otherwise 0.0. An answer that is
    still ``None`` never equals a string reference answer, so it scores 0.0.

    Returns:
        A list with one entry per (user answer, reference answer) pair.
    """
    # The annotation uses typing.List, like the other annotations in this
    # file, so the definition also loads on interpreters that lack builtin
    # generics such as ``list[float]``.
    state = self.state

    if state.benchmark_name == "icelandic-wiki-qa":
        queries = [sample["question"] for sample in state.samples]
        return calculate_gpt4o_scores(
            queries, state.user_answers, state.correct_answers
        )

    # float(True) == 1.0 and float(False) == 0.0, giving per-question points.
    return [
        float(user_answer == correct_answer)
        for user_answer, correct_answer in zip(
            state.user_answers, state.correct_answers
        )
    ]
265
 
266
+ def plot_score(self, user_scores: List[float]):
267
+ user_score = sum(user_scores) / len(user_scores)
268
  scores = {**BENCHMARK_SCORES[self.state.benchmark_name], "Þú": 100 * user_score}
269
  # Sort by score
270
  scores = dict(sorted(scores.items(), key=lambda item: item[1]))
271
 
272
  # Define colors for user vs models
273
+ colors = {name: "tab:blue" for name in scores.keys()}
274
  colors["Þú"] = "tab:green"
275
 
276
  fig, ax = plt.subplots(figsize=(10, 6), dpi=250)
 
284
  )
285
  ax.set_axisbelow(True)
286
  ax.xaxis.grid(True, linestyle="--", alpha=0.6)
287
+ ax.set_title(
288
+ f"{BENCHMARKS[self.state.benchmark_name]['name']}: Svona stóðstu þig miðað við mállíkönin",
289
+ pad=20,
290
+ )
291
  ax.set_xlabel("Stig (%)")
292
  ax.set_xlim(0, 100)
293
  plt.tight_layout()
294
  return fig
295
+
296
def get_results_data(self) -> List[Dict[str, Any]]:
    """Return one result record per question, in question order.

    Each record combines a sample with the user's answer, the reference
    answer and the stored score for the same position. The keys are
    "question_num" (1-based), "question", "user_answer", "correct_answer",
    "options" (``None`` when the sample has none), "instruction" (empty
    string when the sample has none) and "points".
    """
    state = self.state
    rows = zip(
        state.samples,
        state.user_answers,
        state.correct_answers,
        state.user_scores,
    )

    results: List[Dict[str, Any]] = []
    # Numbering starts at 1 so the records carry human-facing question numbers.
    for number, (sample, given, expected, points) in enumerate(rows, start=1):
        record = {
            "question_num": number,
            "question": sample["question"],
            "user_answer": given,
            "correct_answer": expected,
            "options": sample.get("options"),
            "instruction": sample.get("instruction", ""),
            "points": points,
        }
        results.append(record)
    return results