playing_with_the_source_code

#2
by XinGuan2000 - opened
pages/1_Single_Evaluation.py CHANGED
@@ -57,7 +57,7 @@ if not st.session_state.get('password_correct', False):
57
  check_password()
58
  else:
59
  st.sidebar.success("Password Verified. Proceed with the demo.")
60
- model_name = st.selectbox('Select a model:', ['gpt35-1106'])
61
 
62
  # User choice between predefined examples or their own input
63
  input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
 
57
  check_password()
58
  else:
59
  st.sidebar.success("Password Verified. Proceed with the demo.")
60
+ model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
61
 
62
  # User choice between predefined examples or their own input
63
  input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
pages/{3_Benchmark_Data.py → 2_Benchmark_Data.py} RENAMED
File without changes
pages/{4_Explanation_Generation.py → 3_Explanation_Generation.py} RENAMED
File without changes
pages/{5_Batch_Evaluation.py → 4_Batch_Evaluation.py} RENAMED
File without changes
pages/{2_Conversation_Evaluation.py → 5_Conversation_Evaluation.py} RENAMED
@@ -66,7 +66,7 @@ if not st.session_state.get('password_correct', False):
66
  check_password()
67
  else:
68
  st.sidebar.success("Password Verified. Proceed with the demo.")
69
- model_name = st.selectbox('Select a model:', ['gpt35-1106'])
70
 
71
  # User choice between predefined examples or their own input
72
  input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
 
66
  check_password()
67
  else:
68
  st.sidebar.success("Password Verified. Proceed with the demo.")
69
+ model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
70
 
71
  # User choice between predefined examples or their own input
72
  input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
util/evaluator.py CHANGED
@@ -9,34 +9,20 @@ class evaluator:
9
 
10
  def validate_scores(self, scores):
11
  required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
12
-
13
  for key in required_keys:
14
- if key not in scores:
15
- return {k: {"Score": -1, "Justification": "Invalid input"} for k in required_keys}
16
-
17
- score_data = scores[key]
18
-
19
- if not isinstance(score_data, dict):
20
- return {k: {"Score": -1, "Justification": "Invalid input format"} for k in required_keys}
21
-
22
- if "Score" not in score_data or not isinstance(score_data["Score"], (int, float)) or not (
23
- 0 <= score_data["Score"] <= 10):
24
- return {k: {"Score": -1, "Justification": "Invalid score value"} for k in required_keys}
25
-
26
- if "Justification" not in score_data or not isinstance(score_data["Justification"], str) or not score_data[
27
- "Justification"].strip():
28
- return {k: {"Score": -1, "Justification": "Invalid or missing justification"} for k in required_keys}
29
 
30
  return scores
31
 
32
  def evaluate_single(self, question,explanation):
33
 
34
- evaluation_prompt = f"""You are provided with a user's query and the corresponding explanation generated by
35
- an Chatbot. Your task is to evaluate the explanation based on the following five principles. Each principle
36
- should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all,
37
- and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten words explanation for each score to justify your rating.
38
 
39
- Query:
40
  {question}
41
 
42
  Provided Explanation:
@@ -46,55 +32,35 @@ class evaluator:
46
 
47
  Factually Correct:
48
  Definition: The explanation must be accurate and relevant to the question and the subject matter.
49
- Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
50
 
51
  Useful:
52
  Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
53
- Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?
54
 
55
  Context Specific:
56
  Definition: The explanation should be relevant to the specific context or scenario implied by the question.
57
- Score: (0-10) How well does the explanation address the specific context or scenario of the question?
58
 
59
  User Specific:
60
  Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
61
- Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?
62
 
63
  Provides Pluralism:
64
  Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
65
- Score: (0-10) How well does the explanation provide or support multiple perspectives?
66
 
67
- After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
68
 
69
  Example JSON format:
70
- {{
71
- "Factually Correct": {{
72
- "Justification": "xxx",
73
- "Score": 9
74
- }},
75
- "Useful": {{
76
- "Justification": "xxx",
77
- "Score": 8.5
78
- }},
79
- "Context Specific": {{
80
- "Justification": "xxx",
81
- "Score": 8
82
- }},
83
- "User Specific": {{
84
- "Justification": "xxx",
85
- "Score": 7.5
86
- }},
87
- "Provides Pluralism": {{
88
- "Justification": "xxx",
89
- "Score": 7
90
- }}
91
- }}
92
-
93
- Answer:
94
- """
95
-
96
- response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=500).strip()
97
 
 
 
98
  print(response)
99
  try:
100
  scores = json.loads(response)
@@ -119,70 +85,48 @@ class evaluator:
119
  def evaluate_conversation(self, conversation, context):
120
  formatted_conversation = self.format_conversation(conversation)
121
  evaluation_prompt = f"""
122
- You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the explanation based on the following five principles. Each principle
123
- should be scored on a scale from 0 to 10, where 0 indicates that the principle is not met at all,
124
- and 10 indicates that the principle is fully satisfied. Additionally, provide a brief ten words explanation for each score to justify your rating.
125
-
126
- Conversation:
127
- {formatted_conversation}
128
-
129
- Context:
130
- {context}
131
-
132
- Evaluation Criteria:
133
-
134
- Factually Correct:
135
- Definition: The explanation must be accurate and relevant to the question and the subject matter.
136
- Score: (0-10) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
137
-
138
- Useful:
139
- Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
140
- Score: (0-10) How useful is the explanation in helping the user understand the answer and make informed decisions?
141
-
142
- Context Specific:
143
- Definition: The explanation should be relevant to the specific context or scenario implied by the question.
144
- Score: (0-10) How well does the explanation address the specific context or scenario of the question?
145
-
146
- User Specific:
147
- Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
148
- Score: (0-10) How well does the explanation cater to the needs and knowledge level of the intended user?
149
-
150
- Provides Pluralism:
151
- Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
152
- Score: (0-10) How well does the explanation provide or support multiple perspectives?
153
-
154
- After evaluating the provided question and explanation based on the five principles, please format your scores and justifications in a JSON dictionary. Directly provide me with the JSON without any additional text.
155
-
156
- Example JSON format:
157
- {{
158
- "Factually Correct": {{
159
- "Justification": "xxx",
160
- "Score": 9
161
- }},
162
- "Useful": {{
163
- "Justification": "xxx",
164
- "Score": 8.5
165
- }},
166
- "Context Specific": {{
167
- "Justification": "xxx",
168
- "Score": 8
169
- }},
170
- "User Specific": {{
171
- "Justification": "xxx",
172
- "Score": 7.5
173
- }},
174
- "Provides Pluralism": {{
175
- "Justification": "xxx",
176
- "Score": 7
177
- }}
178
- }}
179
-
180
  Answer:
181
  """
182
 
183
  print(evaluation_prompt)
184
 
185
- response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=1000).strip()
186
  try:
187
  scores = json.loads(response)
188
  except json.JSONDecodeError:
@@ -195,19 +139,12 @@ class evaluator:
195
 
196
  return self.validate_scores(scores)
197
 
198
-
199
  def write_evaluation_commentary(scores):
200
  evaluation_details = []
201
-
202
- for principle, details in scores.items():
203
- print(details)
204
- score = details.get('Score', -1)
205
- justification = details.get('Justification', '')
206
 
207
  if score == -1:
208
- evaluation_details.append(
209
- {'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.',
210
- 'Justification': justification})
211
  continue
212
 
213
  if principle == "Factually Correct":
@@ -246,56 +183,8 @@ def write_evaluation_commentary(scores):
246
  else:
247
  comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
248
 
249
- evaluation_details.append(
250
- {'Principle': principle, 'Score': score, 'Justification': justification,'Commentary': comment})
251
-
252
  return evaluation_details
253
- # def write_evaluation_commentary(scores):
254
- # evaluation_details = []
255
- # for principle, score in scores.items():
256
- #
257
- # if score == -1:
258
- # evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
259
- # continue
260
- #
261
- # if principle == "Factually Correct":
262
- # if score >= 0.8:
263
- # comment = "Excellent accuracy! The information is precise and directly relevant to the question."
264
- # elif score >= 0.5:
265
- # comment = "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant."
266
- # else:
267
- # comment = "The explanation contains significant inaccuracies or irrelevant information."
268
- # elif principle == "Useful":
269
- # if score >= 0.8:
270
- # comment = "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making."
271
- # elif score >= 0.5:
272
- # comment = "Somewhat useful, though it could be more insightful or practical in aiding understanding."
273
- # else:
274
- # comment = "The explanation does little to help understand or apply the information provided."
275
- # elif principle == "Context Specific":
276
- # if score >= 0.8:
277
- # comment = "Perfectly tailored to the context of the question, addressing the specific scenario effectively."
278
- # elif score >= 0.5:
279
- # comment = "Generally addresses the context, but may miss specific details or nuances relevant to the question."
280
- # else:
281
- # comment = "Fails to address the context of the question, lacking relevance or specificity."
282
- # elif principle == "User Specific":
283
- # if score >= 0.8:
284
- # comment = "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness."
285
- # elif score >= 0.5:
286
- # comment = "Moderately considerate of the user's knowledge level, but could be more tailored."
287
- # else:
288
- # comment = "Does not consider the user's background or interests, potentially leading to confusion or disinterest."
289
- # elif principle == "Provides Pluralism":
290
- # if score >= 0.8:
291
- # comment = "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding."
292
- # elif score >= 0.5:
293
- # comment = "Offers some alternative perspectives, but more could be provided to enrich understanding."
294
- # else:
295
- # comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
296
- #
297
- # evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
298
- # return evaluation_details
299
 
300
  if __name__ == '__main__':
301
 
 
9
 
10
  def validate_scores(self, scores):
11
  required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
 
12
  for key in required_keys:
13
+ if key not in scores or not isinstance(scores[key], (int, float)) or not (-1 <= scores[key] <= 1):
14
+ return {"Factually Correct": -1,"Useful": -1,"Context Specific": -1,"User Specific":-1,"Provides Pluralism":-1}
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  return scores
17
 
18
  def evaluate_single(self, question,explanation):
19
 
20
+ evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
21
+ an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
22
+ should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
23
+ and 1 indicates that the principle is fully satisfied.
24
 
25
+ Question:
26
  {question}
27
 
28
  Provided Explanation:
 
32
 
33
  Factually Correct:
34
  Definition: The explanation must be accurate and relevant to the question and the subject matter.
35
+ Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
36
 
37
  Useful:
38
  Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
39
+ Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
40
 
41
  Context Specific:
42
  Definition: The explanation should be relevant to the specific context or scenario implied by the question.
43
+ Score: (0-1) How well does the explanation address the specific context or scenario of the question?
44
 
45
  User Specific:
46
  Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
47
+ Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
48
 
49
  Provides Pluralism:
50
  Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
51
+ Score: (0-1) How well does the explanation provide or support multiple perspectives?
52
 
53
+ After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
54
 
55
  Example JSON format:
56
+
57
+ Answer:{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
58
+
59
+ Answer:
60
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ response = self.model.invoke(evaluation_prompt,temperature=0, max_tokens=500).strip()
63
+ #response = """{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}"""
64
  print(response)
65
  try:
66
  scores = json.loads(response)
 
85
  def evaluate_conversation(self, conversation, context):
86
  formatted_conversation = self.format_conversation(conversation)
87
  evaluation_prompt = f"""
88
+ You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.
89
+
90
+ Conversation:
91
+ {formatted_conversation}
92
+
93
+ Context:
94
+ {context}
95
+
96
+ Evaluation Criteria:
97
+
98
+ Factually Correct:
99
+ Definition: The explanation must be accurate and relevant to the question and the subject matter.
100
+ Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
101
+
102
+ Useful:
103
+ Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
104
+ Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
105
+
106
+ Context Specific:
107
+ Definition: The explanation should be relevant to the specific context or scenario implied by the question.
108
+ Score: (0-1) How well does the explanation address the specific context or scenario of the question?
109
+
110
+ User Specific:
111
+ Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
112
+ Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
113
+
114
+ Provides Pluralism:
115
+ Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
116
+ Score: (0-1) How well does the explanation provide or support multiple perspectives?
117
+
118
+ After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
119
+
120
+ Example JSON format:
121
+
122
+ Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}
123
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  Answer:
125
  """
126
 
127
  print(evaluation_prompt)
128
 
129
+ response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
130
  try:
131
  scores = json.loads(response)
132
  except json.JSONDecodeError:
 
139
 
140
  return self.validate_scores(scores)
141
 
 
142
  def write_evaluation_commentary(scores):
143
  evaluation_details = []
144
+ for principle, score in scores.items():
 
 
 
 
145
 
146
  if score == -1:
147
+ evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': 'Failed to evaluate the explanation.'})
 
 
148
  continue
149
 
150
  if principle == "Factually Correct":
 
183
  else:
184
  comment = "Lacks diversity in viewpoints, limiting the depth of exploration into the topic."
185
 
186
+ evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
 
 
187
  return evaluation_details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  if __name__ == '__main__':
190