facat committed

Commit 845a45a
1 Parent(s): 33a6f85

update mt_bench

Files changed (3):
  1. README.md +4 -10
  2. tasks.py +33 -4
  3. tlem.py +17 -14
README.md CHANGED
@@ -1,11 +1,5 @@
- ---
- title: Tlem
- emoji: 🏆
- colorFrom: yellow
- colorTo: yellow
- sdk: static
- pinned: false
- license: apache-2.0
- ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Transparent LLMs Evaluation Metrics
+
+ > LLMs belong to *tout le monde*
tasks.py CHANGED
@@ -14,16 +14,42 @@ from evaluate import load
 from collections import defaultdict
 import sys
 
+
 # if sys.version_info >= (3, 9):
 #     from functools import cache
 # else:
 #     from functools import lru_cache as cache
 
+
 disable_progress_bar()
 
 
- def fake_pipeline(prompts: Iterable[str]) -> list[str]:
-     return [prompt for prompt in tqdm(prompts)]
+ def mt_bench_prompt(example):
+     judge_prompt = "You are ChatGPT, a large language model trained by OpenAI. Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response."
+     judge_prompt = "You are ChatGPT, a large language model trained by OpenAI. Your task is to act as an impartial judge and evaluate the quality of the responses provided by an 'assistant' role in the displayed conversation. Your evaluation should focus on the helpfulness, relevance, accuracy, depth, creativity, language fluency, clarity, and level of detail in the assistant's responses. Please note that the evaluation should not consider the user's questions or the overall conversation, but solely the quality of the assistant's replies."
+     multi_prompt = "Your evaluation should focus on the assistant's answer to the second user question."
+     ref_prompt = "In the conversation, you will encounter system messages labeled 'Reference Answer' followed by the assistant's response. Your task is to evaluate the quality of the assistant's response by comparing it with the reference answer."
+     json_prompt = 'You must rate the response on a scale of 1 to 10 in JSON format, for example: {"rating": 5}.'
+     prompt_list = [judge_prompt]
+     conversations = example["conversation"]
+     if example["turn"] == 2:
+         prompt_list.append(multi_prompt)
+
+     if example["reference"] is not None:
+         conversations = []
+         questions = filter(lambda e: e["role"] == "user", example["conversation"])
+         answers = filter(lambda e: e["role"] == "assistant", example["conversation"])
+         for q, a, r in zip(questions, answers, example["reference"]):
+             conversations.append(q)
+             conversations.append(
+                 {"role": "system", "content": "Reference Answer: " + r}
+             )
+             conversations.append(a)
+         prompt_list.append(ref_prompt)
+     prompt_list.append(json_prompt)
+
+     messages = [{"role": "system", "content": " ".join(prompt_list)}] + conversations
+     return messages
 
 
 @dataclass

@@ -33,7 +59,7 @@ class Task:
     # metrics: list[str] = field(default_factory=list)
     metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
     input_column: str = "question"
-     label_column: str = "answer"
+     label_column: str = ""
     prompt: Optional[Callable | str] = None
     few_shot: int = 0
     few_shot_from: Optional[str] = None

@@ -54,6 +80,7 @@ class Task:
                 input_column=example[self.input_column]
             )
         }
+         self.label_column = self.label_column or self.input_column
 
     @cached_property
     def samples(self):

@@ -78,6 +105,7 @@ class Task:
                 self.few_shot_from = name
                 break
 
+         assert self.few_shot_from != self.split
         shots = ds[self.few_shot_from].select(range(self.few_shot))
         if self.prompt is not None:
             shots = shots.map(self.prompt)

@@ -126,6 +154,8 @@ class Task:
             result = self.metric.compute(
                 responses=outputs, references=self.dataset[self.label_column]
             )
+         finally:
+             result = outputs
         # if log:
         #     name = name or pipeline.__name__
         #     self.results[name] = result

@@ -431,7 +461,6 @@ class MMLU:
             "psychology",
         ],
         "other": ["other", "business", "health"],
-         "Test": ["culture"],
     }
 
     @classmethod
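
To make the new judge-prompt layout easier to follow, here is a small sketch that runs `mt_bench_prompt` on a hand-made two-turn example with reference answers. It is an illustration only: the `tasks` import path and the sample row are assumptions, with field names mirroring the `conversation`, `turn`, and `reference` keys the function reads.

```python
# Sketch: preview the judge messages built by mt_bench_prompt.
# Assumption: tasks.py from this repo is importable as `tasks`.
from tasks import mt_bench_prompt

example = {  # hand-made row shaped like a mt_bench_judge record
    "turn": 2,
    "reference": ["Paris.", "Roughly 2.1 million people."],
    "conversation": [
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "The capital of France is Paris."},
        {"role": "user", "content": "And how many people live there?"},
        {"role": "assistant", "content": "About 2.1 million in the city proper."},
    ],
}

messages = mt_bench_prompt(example)
# Expected shape: one combined system prompt, then each user turn followed by a
# "Reference Answer: ..." system message and the assistant's reply (7 messages here).
for m in messages:
    print(f'{m["role"]:>9}: {m["content"][:60]}')
```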
tlem.py CHANGED
@@ -69,15 +69,9 @@ class ReasoningMetric(evaluate.Metric):
         return results
 
 
- gsm8k = Task(
-     dataset_name=("gsm8k", "main"),
-     metric_name=("sustech/tlem", "gsm8k"),
-     input_column="question",
-     label_column="answer",
- )
-
-
 class Suite(EvaluationSuite):
+     task_class = Task
+
     def run(
         self,
         model_or_pipeline: Any,

@@ -115,12 +109,21 @@ class Suite(EvaluationSuite):
            case _ if name.startswith("cmmlu"):
                suite = CMMLU.suite(chat=chat)
            case "gsm8k":
-                 suite = [gsm8k]
-         match name:
-             case _ if "test" in name:
-                 suite = suite["Test"]
-
-         self.suite = suite
+                 suite = Task(
+                     dataset_name=("gsm8k", "main"),
+                     metric_name=("sustech/tlem", "gsm8k"),
+                     input_column="question",
+                     label_column="answer",
+                 )
+             case "mt_bench":
+                 suite = Task(
+                     dataset_name="SUSTech/mt_bench_judge",
+                     split="train",
+                     prompt=mt_bench_prompt,
+                     # metric_name=("sustech/tlem", "gsm8k"),
+                 )
+
+         self.suite = [suite] if isinstance(suite, Task) else suite
 
     def __init__(self, name="tlem"):
        super().__init__(name)
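
Since the `mt_bench` arm leaves `metric_name` commented out and `Task` now falls back to returning the raw outputs (`finally: result = outputs`), the judge replies still need a post-processing step to recover the numeric scores that `json_prompt` asks for. A minimal sketch of that step follows; `extract_rating` is a hypothetical helper, not part of this commit, and it only assumes replies contain the requested `{"rating": n}` object (with a loose regex fallback).

```python
import json
import re


def extract_rating(reply: str) -> int | None:
    """Hypothetical helper: pull the {"rating": n} score out of a judge reply."""
    # Preferred path: the reply contains a well-formed JSON object with a "rating" key.
    match = re.search(r'\{[^{}]*"rating"[^{}]*\}', reply)
    if match:
        try:
            return int(json.loads(match.group(0))["rating"])
        except (ValueError, KeyError):
            pass
    # Fallback: grab the first integer that follows the word "rating".
    match = re.search(r'rating"?\s*[:=]\s*(\d+)', reply, flags=re.IGNORECASE)
    return int(match.group(1)) if match else None


outputs = ['{"rating": 7}', 'Sure, here is my verdict: {"rating": 9}', "rating = 8"]
ratings = [extract_rating(o) for o in outputs]
print(ratings)                                   # [7, 9, 8]
print(sum(r for r in ratings if r is not None))  # 24 -> divide by count for a mean score
```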