Spaces:

mideind
/

maeliprof

Sleeping

App Files Files Community

gardarjuto committed on Sep 25, 2024

Commit

8446c23

1 Parent(s): fcffb23

collect results data per question

Browse files

Files changed (1) hide show

quiz.py +46 -14

quiz.py CHANGED Viewed

@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 import random
 import matplotlib.pyplot as plt
-from score import calculate_gpt4o_score, BENCHMARK_SCORES
 # Define benchmarks
@@ -48,7 +48,7 @@ DATASETS = {
         name=BENCHMARKS[dataset_name].get("config_name"),
         split=BENCHMARKS[dataset_name].get("split", "train"),
     )
-    for dataset_name in BENCHMARKS
 }
@@ -139,6 +139,7 @@ class QuizState:
     user_answers: List[Optional[str]]
     correct_answers: List[str]
     quiz_completed: bool
 @dataclass
@@ -166,6 +167,7 @@ class BenchmarkQuiz:
             user_answers=[None] * len(samples),
             correct_answers=correct_answers,
             quiz_completed=False,
         )
         return self.state
@@ -232,37 +234,43 @@ class BenchmarkQuiz:
             return {"completed": False, "question_data": self.update_question()}
         else:
             self.state.quiz_completed = True
-            user_score = self.calculate_score()
-            plot = self.plot_score(user_score)
-            return {"completed": True, "plot": plot}
     def previous_question(self) -> QuestionData:
         if self.state.current_question > 0:
             self.state.current_question -= 1
         return self.update_question()
-    def calculate_score(self) -> float:
         if self.state.benchmark_name == "icelandic-wiki-qa":
             queries = [sample["question"] for sample in self.state.samples]
-            return calculate_gpt4o_score(
                 queries, self.state.user_answers, self.state.correct_answers
             )
-        score = sum(
-            user_answer == correct_answer
             for user_answer, correct_answer in zip(
                 self.state.user_answers, self.state.correct_answers
             )
-        )
-        return score / len(self.state.correct_answers)
-    def plot_score(self, user_score: float):
         scores = {**BENCHMARK_SCORES[self.state.benchmark_name], "Þú": 100 * user_score}
         # Sort by score
         scores = dict(sorted(scores.items(), key=lambda item: item[1]))
         # Define colors for user vs models
-        colors = {name: "tab:blue" for name in scores.keys()}
         colors["Þú"] = "tab:green"
         fig, ax = plt.subplots(figsize=(10, 6), dpi=250)
@@ -276,8 +284,32 @@ class BenchmarkQuiz:
         )
         ax.set_axisbelow(True)
         ax.xaxis.grid(True, linestyle="--", alpha=0.6)
-        ax.set_title(f"{BENCHMARKS[self.state.benchmark_name]['name']}: Svona stóðstu þig miðað við mállíkönin", pad=20)
         ax.set_xlabel("Stig (%)")
         ax.set_xlim(0, 100)
         plt.tight_layout()
         return fig

 from typing import Any, Dict, List, Optional
 import random
 import matplotlib.pyplot as plt
+from score import calculate_gpt4o_scores, BENCHMARK_SCORES
 # Define benchmarks
         name=BENCHMARKS[dataset_name].get("config_name"),
         split=BENCHMARKS[dataset_name].get("split", "train"),
     )
+    for dataset_name in BENCHMARKS
 }
     user_answers: List[Optional[str]]
     correct_answers: List[str]
     quiz_completed: bool
+    user_scores: List[Optional[float]]
 @dataclass
             user_answers=[None] * len(samples),
             correct_answers=correct_answers,
             quiz_completed=False,
+            user_scores=[None] * len(samples),
         )
         return self.state
             return {"completed": False, "question_data": self.update_question()}
         else:
             self.state.quiz_completed = True
+            user_scores = self.calculate_scores()
+            self.state.user_scores = user_scores
+            plot = self.plot_score(user_scores)
+            return {
+                "completed": True,
+                "plot": plot,
+                "results_data": self.get_results_data(),
+            }
     def previous_question(self) -> QuestionData:
         """Step back one question (if possible) and return the refreshed view.

         The question index stored on ``self.state`` is decremented only when
         it is greater than zero; in every case the result of
         ``self.update_question()`` is returned.
         """
         state = self.state
         position = state.current_question
         # Never move before the first question.
         if position > 0:
             state.current_question = position - 1
         return self.update_question()
+    def calculate_scores(self) -> list[float]:
         if self.state.benchmark_name == "icelandic-wiki-qa":
             queries = [sample["question"] for sample in self.state.samples]
+            return calculate_gpt4o_scores(
                 queries, self.state.user_answers, self.state.correct_answers
             )
+        scores = [
+            float(user_answer == correct_answer)
             for user_answer, correct_answer in zip(
                 self.state.user_answers, self.state.correct_answers
             )
+        ]
+        return scores
+    def plot_score(self, user_scores: List[float]):
+        user_score = sum(user_scores) / len(user_scores)
         scores = {**BENCHMARK_SCORES[self.state.benchmark_name], "Þú": 100 * user_score}
         # Sort by score
         scores = dict(sorted(scores.items(), key=lambda item: item[1]))
         # Define colors for user vs models
+        colors = {name: "tab:blue" for name in scores.keys()}
         colors["Þú"] = "tab:green"
         fig, ax = plt.subplots(figsize=(10, 6), dpi=250)
         )
         ax.set_axisbelow(True)
         ax.xaxis.grid(True, linestyle="--", alpha=0.6)
+        ax.set_title(
+            f"{BENCHMARKS[self.state.benchmark_name]['name']}: Svona stóðstu þig miðað við mállíkönin",
+            pad=20,
+        )
         ax.set_xlabel("Stig (%)")
         ax.set_xlim(0, 100)
         plt.tight_layout()
         return fig
+    def get_results_data(self) -> List[Dict[str, Any]]:
+        return [
+            {
+                "question_num": i + 1,
+                "question": sample["question"],
+                "user_answer": user_answer,
+                "correct_answer": correct_answer,
+                "options": sample.get("options"),
+                "instruction": sample.get("instruction", ""),
+                "points": score,
+            }
+            for i, (sample, user_answer, correct_answer, score) in enumerate(
+                zip(
+                    self.state.samples,
+                    self.state.user_answers,
+                    self.state.correct_answers,
+                    self.state.user_scores,
+                )
+            )
+        ]