Presidentlin committed
Commit eebf495 · 1 Parent(s): fb39607
Files changed (3):
  1. __pycache__/main.cpython-310.pyc +0 -0
  2. app.py +10 -4
  3. main.py +89 -40
__pycache__/main.cpython-310.pyc CHANGED
Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ
 
app.py CHANGED
@@ -41,6 +41,7 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
             models.sort(key=lambda model: model["id"])
 
             model_names = [model["id"] for model in models]
+            judge_models = [model["id"] for model in models if "gpt" in model["id"]]  # Example criteria
         except requests.exceptions.RequestException as e:
             st.error(f"Error fetching models from OpenRouter API: {e}")
             model_names = []  # Provide an empty list if API call fails
@@ -52,6 +53,13 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         st.error("No models available. Please check your API connection.")
         st.stop()  # Stop execution if no models are available
 
+    # Judge Model Selection
+    if judge_models:
+        judge_model_name = st.selectbox("Select a Judge Model", judge_models)
+    else:
+        st.error("No judge models available. Please check your API connection.")
+        st.stop()  # Stop execution if no judge models are available
+
     # Initialize session state for user_questions and predefined_questions
     if "user_questions" not in st.session_state:
         st.session_state.user_questions = []
@@ -107,8 +115,6 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
-            # Initialize progress bar
-            progress_bar = st.progress(0)
            num_questions = len(selected_questions)
            results = []
 
@@ -117,9 +123,9 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
 
                # Benchmarking logic using the chosen execution mode
                if execution_mode == "Sequential":
-                    question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key)
+                    question_results = benchmark_model_sequential(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, judge_model_name)
                else:  # Multithreaded
-                    question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads)
+                    question_results = benchmark_model_multithreaded(model_name, selected_questions, st.session_state.open_router_key, st.session_state.openai_api_key, max_threads, judge_model_name)
 
                results.extend(question_results)
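For context, the judge-model flow this commit adds to app.py can be exercised on its own. Below is a minimal sketch assuming a hard-coded models list shaped like the OpenRouter model listing the app has already fetched; run_benchmark is a hypothetical stand-in for the benchmark_model_sequential / benchmark_model_multithreaded calls that receive the selected judge_model_name.

import streamlit as st

# Hypothetical stand-in data: the real app builds `models` from the OpenRouter
# models endpoint it has already queried.
models = [
    {"id": "meta-llama/llama-3-70b-instruct"},
    {"id": "openai/gpt-4o"},
    {"id": "openai/gpt-4o-mini"},
]

model_names = [model["id"] for model in models]
judge_models = [model["id"] for model in models if "gpt" in model["id"]]  # same substring filter as the commit

model_name = st.selectbox("Select a Model to Benchmark", model_names)

if judge_models:
    judge_model_name = st.selectbox("Select a Judge Model", judge_models)
else:
    st.error("No judge models available. Please check your API connection.")
    st.stop()


def run_benchmark(model_name: str, judge_model_name: str) -> None:
    # Hypothetical wrapper: the real app passes judge_model_name through to
    # benchmark_model_sequential / benchmark_model_multithreaded in main.py.
    st.write(f"Benchmarking {model_name} with judge {judge_model_name}")


if st.button("Run benchmark"):
    run_benchmark(model_name, judge_model_name)

The filter uses the same "gpt" substring criterion as the commit, and the app stops early when no candidate judge is available rather than benchmarking without one.
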
main.py CHANGED
@@ -7,37 +7,37 @@ import threading
 import streamlit as st  # Import Streamlit
 import queue
 
+
 def generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key):
     """Generates an answer to a question using the specified language model."""
     gen_prompt = create_gen_prompt(question, previous_answers)
     try:
         new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
-                                     openai_api_key=openai_api_key)
+                                     openai_api_key=openai_api_key)
         return new_answer
     except Exception as e:
         st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>",
-                 unsafe_allow_html=True)
+                 unsafe_allow_html=True)
         return None
 
 
-def evaluate_answer(question, new_answer, open_router_key, openai_api_key):
+def evaluate_answer(question, new_answer, open_router_key, openai_api_key, judge_model_name):
     """Evaluates the coherence and novelty of an answer."""
     judge_prompt = create_judge_prompt(question, new_answer)
-    judge = "openai/gpt-4o-mini"
+    judge = judge_model_name  # Use the judge_model_name passed to the function
     try:
         judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key,
-                                         openai_api_key=openai_api_key)
+                                         openai_api_key=openai_api_key)
         coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
         return coherence_score
     except Exception as e:
         st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>",
-                 unsafe_allow_html=True)
+                 unsafe_allow_html=True)
         return None
 
 
-def process_question(question, model_name, open_router_key, openai_api_key, result_queue):
+def process_question(question, model_name, open_router_key, openai_api_key, result_queue, judge_model_name):
     start_time = time.time()
-    # st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True)
     previous_answers = []
     question_novelty = 0
 
@@ -47,20 +47,20 @@ def process_question(question, model_name, open_router_key, openai_api_key, resu
             if new_answer is None:
                 break
 
-            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key)
+            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key, judge_model_name)
             if coherence_score is None:
                 break
 
-            if coherence_score <= 6:
+            if coherence_score <= 3:
                 break
 
            novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
 
-            if novelty_score < 0.3:
+            if novelty_score < 0.1:
                 break
 
-            # Append results to the queue instead of using st.write
-            result_queue.put({
+
+            result_dict = {
                 "type": "answer",
                 "question": question,
                 "answer": new_answer,
@@ -69,26 +69,34 @@
                 "results": [
                     {
                         "question": question,
-                        "answers": previous_answers.copy() + [new_answer],  # Include the new answer
+                        "answers": previous_answers.copy() + [new_answer],
                         "coherence_score": coherence_score,
-                        "novelty_score": question_novelty + novelty_score  # Accumulate novelty score
+                        "novelty_score": question_novelty + novelty_score
                     }
                 ]
-            })
+            }
+
+            if result_queue is not None:  # Check if result_queue is provided
+                result_queue.put(result_dict)
+
+            yield result_dict  # Use yield to return the result immediately
 
            previous_answers.append(new_answer)
            question_novelty += novelty_score
 
     except Exception as e:
-        result_queue.put({"type": "error", "message": str(e)})
+        if result_queue is not None:  # Check if result_queue is provided
+            result_queue.put({"type": "error", "message": str(e)})
 
     time_taken = time.time() - start_time
-    result_queue.put({
-        "type": "summary",
-        "question": question,
-        "total_novelty": question_novelty,
-        "time_taken": time_taken
-    })
+
+    if result_queue is not None:  # Check if result_queue is provided
+        result_queue.put({
+            "type": "summary",
+            "question": question,
+            "total_novelty": question_novelty,
+            "time_taken": time_taken
+        })
 
     return question_novelty, [
         {
@@ -121,7 +129,7 @@ def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
     return novelty
 
 
-def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None):
+def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None, judge_model_name=None):
     novelty_score = 0
     results = []
     result_queue = queue.Queue()  # Create a queue for communication
@@ -135,14 +143,13 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         # Submit tasks to the thread pool
         future_to_question = {
-            executor.submit(process_question, question, model_name, open_router_key, openai_api_key, result_queue): question
+            executor.submit(process_question, question, model_name, open_router_key, openai_api_key, result_queue, judge_model_name): question
            for question in questions
        }
 
-        # Process results from the queue in the main thread
-        while True:
-            try:
-                result = result_queue.get_nowait()
+        # Collect results as they become available from futures and the queue
+        for future in as_completed(future_to_question):
+            for result in future.result():  # Iterate over yielded results from process_question
                 if result["type"] == "answer":
                     st.write(f"**Question:** {result['question']}")
                     st.write(f"**New Answer:**\n{result['answer']}")
@@ -150,6 +157,11 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
                              unsafe_allow_html=True)
                     st.write(f"**Novelty Score:** {result['novelty_score']}")
                     results.extend(result["results"])  # Add results here
+                    novelty_score += result["novelty_score"]  # Update novelty score
+                    st.write(
+                        f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>",
+                        unsafe_allow_html=True)
+
                 elif result["type"] == "summary":
                     st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
                              unsafe_allow_html=True)
@@ -158,27 +170,64 @@ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai
                 elif result["type"] == "error":
                     st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
                              unsafe_allow_html=True)
-            except queue.Empty:
-                if not any(future.running() for future in future_to_question.keys()):
-                    break  # All tasks are done
+
+        # Process remaining results in the queue (if any)
+        while not result_queue.empty():
+            result = result_queue.get()
+            if result["type"] == "answer":
+                st.write(f"**Question:** {result['question']}")
+                st.write(f"**New Answer:**\n{result['answer']}")
+                st.write(f"<span style='color:green'>Coherence Score: {result['coherence_score']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"**Novelty Score:** {result['novelty_score']}")
+                results.extend(result["results"])  # Add results here
+                novelty_score += result["novelty_score"]  # Update novelty score
+                st.write(
+                    f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>",
+                    unsafe_allow_html=True)
+
+            elif result["type"] == "summary":
+                st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"<span style='color:blue'>Time taken: {result['time_taken']} seconds</span>",
+                         unsafe_allow_html=True)
+            elif result["type"] == "error":
+                st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
+                         unsafe_allow_html=True)
+
 
     st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
              unsafe_allow_html=True)
     return results
 
 
-def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, progress=0, progress_lock=None):
+def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, judge_model_name):
     novelty_score = 0
     results = []
 
     for i, question in enumerate(questions):
-        question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key,
-                                                              progress_lock, i, len(questions), progress)
-        novelty_score += question_novelty
-        results.extend(question_results)
-        st.write(
-            f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>",
-            unsafe_allow_html=True)  # Display progress after each question
+        for result in process_question(question, model_name, open_router_key, openai_api_key, None, judge_model_name):
+            if result["type"] == "answer":
+                st.write(f"**Question:** {result['question']}")
+                st.write(f"**New Answer:**\n{result['answer']}")
+                st.write(f"<span style='color:green'>Coherence Score: {result['coherence_score']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"**Novelty Score:** {result['novelty_score']}")
+                results.extend(result["results"])
+                novelty_score += result["novelty_score"]  # Add to novelty score
+                st.write(
+                    f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>",
+                    unsafe_allow_html=True)
+
+            elif result["type"] == "summary":
+                st.write(f"<span style='color:blue'>Total novelty score for question '{result['question']}': {result['total_novelty']}</span>",
+                         unsafe_allow_html=True)
+                st.write(f"<span style='color:blue'>Time taken: {result['time_taken']} seconds</span>",
+                         unsafe_allow_html=True)
+
+            elif result["type"] == "error":
+                st.write(f"<span style='color:red'>Error in thread: {result['message']}</span>",
+                         unsafe_allow_html=True)
 
     st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>",
              unsafe_allow_html=True)
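
For reference, a self-contained sketch of the result-streaming pattern main.py now follows: a worker generator yields typed result dicts and optionally mirrors them into a queue; the sequential path iterates the generator directly, while the multithreaded path submits one worker per question, reads results via ThreadPoolExecutor and as_completed, and then drains whatever is left in the queue. All names here (fake_worker, consume_sequential, consume_multithreaded) are illustrative stand-ins, not the module's real API.

import queue
from concurrent.futures import ThreadPoolExecutor, as_completed


def fake_worker(question, result_queue=None):
    """Yield typed result dicts; mirror them into result_queue when one is given."""
    total_novelty = 0.0
    for step in range(2):
        result = {"type": "answer", "question": question, "answer": f"answer {step}", "novelty_score": 0.5}
        total_novelty += result["novelty_score"]
        if result_queue is not None:
            result_queue.put(result)
        yield result
    summary = {"type": "summary", "question": question, "total_novelty": total_novelty}
    if result_queue is not None:
        result_queue.put(summary)


def consume_sequential(questions):
    # Sequential path: iterate each worker generator directly; no queue involved.
    for question in questions:
        for result in fake_worker(question, None):
            print("sequential:", result)


def consume_multithreaded(questions, max_workers=2):
    # Multithreaded path: submit one worker per question, read yielded results
    # via future.result() as futures complete, then drain whatever reached the
    # queue. Note: submit() on a generator function only creates the generator;
    # the yielded work runs when future.result() is iterated below.
    result_queue = queue.Queue()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fake_worker, q, result_queue): q for q in questions}
        for future in as_completed(futures):
            for result in future.result():
                print("threaded:", result)
    while not result_queue.empty():
        print("queued:", result_queue.get())


if __name__ == "__main__":
    consume_sequential(["What is entropy?"])
    consume_multithreaded(["What is entropy?", "What is time?"])

Keeping the queue optional lets the same worker serve both paths: the sequential caller passes None and consumes yielded results directly, while the multithreaded caller still receives the error and summary records that are only ever put on the queue.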