Presidentlin committed on
Commit
c9e00de
·
1 Parent(s): 0e9562f
__pycache__/main.cpython-310.pyc CHANGED
Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ
 
__pycache__/prompts.cpython-310.pyc CHANGED
Binary files a/__pycache__/prompts.cpython-310.pyc and b/__pycache__/prompts.cpython-310.pyc differ
 
app.py CHANGED
@@ -1,10 +1,7 @@
1
  import streamlit as st
2
- from main import get_novelty_score
3
- from models import chat_with_model, embed
4
- from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
5
  import requests
6
- import numpy as np
7
- import os
8
 
9
  # Set the title in the browser tab
10
  st.set_page_config(page_title="Aidan Bench - Generator")
@@ -95,86 +92,51 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
95
  # Display selected questions
96
  st.write("Selected Questions:", selected_questions)
97
 
 
 
 
98
  # Benchmark Execution
99
  if st.button("Start Benchmark"):
100
  if not selected_questions:
101
  st.warning("Please select at least one question.")
102
  else:
103
- # Initialize progress bar
104
  progress_bar = st.progress(0)
105
  num_questions = len(selected_questions)
106
- results = [] # List to store results
 
 
 
107
 
108
- # Iterate through selected questions
109
  for i, question in enumerate(selected_questions):
110
  # Display current question
111
  st.write(f"Processing question {i+1}/{num_questions}: {question}")
112
 
113
- previous_answers = []
114
- question_novelty = 0
115
-
116
- try:
117
- while True:
118
- gen_prompt = create_gen_prompt(question, previous_answers)
119
-
120
- try:
121
- new_answer = chat_with_model(
122
- prompt=gen_prompt,
123
- model=model_name,
124
- open_router_key=st.session_state.open_router_key,
125
- openai_api_key=st.session_state.openai_api_key
126
- )
127
- except requests.exceptions.RequestException as e:
128
- st.error(f"API Error: {e}")
129
- break
130
-
131
- judge_prompt = create_judge_prompt(question, new_answer)
132
- judge = "openai/gpt-4o-mini"
133
-
134
- try:
135
- judge_response = chat_with_model(
136
- prompt=judge_prompt,
137
- model=judge,
138
- open_router_key=st.session_state.open_router_key,
139
- openai_api_key=st.session_state.openai_api_key
140
- )
141
- except requests.exceptions.RequestException as e:
142
- st.error(f"API Error (Judge): {e}")
143
- break
144
-
145
- coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
146
-
147
- if coherence_score <= 3:
148
- st.warning("Output is incoherent. Moving to next question.")
149
- break
150
-
151
- novelty_score = get_novelty_score(new_answer, previous_answers, st.session_state.openai_api_key)
152
-
153
- if novelty_score < 0.1:
154
- st.warning("Output is redundant. Moving to next question.")
155
- break
156
-
157
- st.write(f"New Answer:\n{new_answer}")
158
- st.write(f"Coherence Score: {coherence_score}")
159
- st.write(f"Novelty Score: {novelty_score}")
160
-
161
- previous_answers.append(new_answer)
162
- question_novelty += novelty_score
163
-
164
- except Exception as e:
165
- st.error(f"Error processing question: {e}")
166
-
167
- results.append({
168
- "question": question,
169
- "answers": previous_answers,
170
- "coherence_score": coherence_score,
171
- "novelty_score": novelty_score
172
- })
173
 
174
  # Update progress bar
175
  progress_bar.progress((i + 1) / num_questions)
176
 
177
- st.success("Benchmark completed!")
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  # Display results in a table
180
  st.write("Results:")
 
1
  import streamlit as st
2
+ from main import benchmark_model_multithreaded, benchmark_model_sequential
3
+ from prompts import questions as predefined_questions
 
4
  import requests
 
 
5
 
6
  # Set the title in the browser tab
7
  st.set_page_config(page_title="Aidan Bench - Generator")
 
92
  # Display selected questions
93
  st.write("Selected Questions:", selected_questions)
94
 
95
+ # Choose execution mode
96
+ execution_mode = st.radio("Execution Mode:", ["Sequential", "Multithreaded"])
97
+
98
  # Benchmark Execution
99
  if st.button("Start Benchmark"):
100
  if not selected_questions:
101
  st.warning("Please select at least one question.")
102
  else:
103
+ # Initialize progress bar
104
  progress_bar = st.progress(0)
105
  num_questions = len(selected_questions)
106
+ results = []
107
+
108
+ # Stop button
109
+ stop_button = st.button("Stop Benchmark")
110
 
111
+ # Benchmarking loop
112
  for i, question in enumerate(selected_questions):
113
  # Display current question
114
  st.write(f"Processing question {i+1}/{num_questions}: {question}")
115
 
116
+ # ... (benchmarking logic using the chosen execution mode)
117
+ if execution_mode == "Sequential":
118
+ question_results = benchmark_model_sequential(model_name, [question], st.session_state.open_router_key, st.session_state.openai_api_key)
119
+ else: # Multithreaded
120
+ question_results = benchmark_model_multithreaded(model_name, [question], st.session_state.open_router_key, st.session_state.openai_api_key)
121
+
122
+ results.extend(question_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  # Update progress bar
125
  progress_bar.progress((i + 1) / num_questions)
126
 
127
+ # Check if stop button is clicked
128
+ if stop_button:
129
+ st.warning("Benchmark stopped!")
130
+ break # Exit the loop
131
+
132
+ # Display results (even if interrupted)
133
+ st.write("Results:")
134
+ # ... (table generation logic - Same as before)
135
+
136
+ if stop_button:
137
+ st.warning("Partial results displayed due to interruption.")
138
+ else:
139
+ st.success("Benchmark completed!")
140
 
141
  # Display results in a table
142
  st.write("Results:")
main.py CHANGED
@@ -1,30 +1,14 @@
1
  import numpy as np
2
  from models import chat_with_model, embed
3
- from prompts import questions, create_gen_prompt, create_judge_prompt
4
- from colorama import Fore, Style
5
  import time
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
  import threading
8
- import argparse
9
 
10
-
11
- def parse_arguments():
12
- parser = argparse.ArgumentParser(description="Benchmark a language model.")
13
- parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
14
- parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
15
- return parser.parse_args()
16
-
17
-
18
- def benchmark_model(model_name, multithreaded=False):
19
- if multithreaded:
20
- return benchmark_model_multithreaded(model_name)
21
- else:
22
- return benchmark_model_sequential(model_name)
23
-
24
-
25
- def process_question(question, model_name):
26
  start_time = time.time()
27
- print(Fore.RED + question + Style.RESET_ALL)
28
  previous_answers = []
29
  question_novelty = 0
30
 
@@ -32,110 +16,112 @@ def process_question(question, model_name):
32
  while True:
33
  gen_prompt = create_gen_prompt(question, previous_answers)
34
  try:
35
- new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
36
  except Exception as e:
37
- print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
38
  break
39
 
40
  judge_prompt = create_judge_prompt(question, new_answer)
41
  judge = "openai/gpt-4o-mini"
42
  try:
43
- judge_response = chat_with_model(prompt=judge_prompt, model=judge)
44
  except Exception as e:
45
- print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
46
  break
47
 
48
- coherence_score = int(judge_response.split("<coherence_score>")[
49
- 1].split("</coherence_score>")[0])
50
 
51
  if coherence_score <= 3:
52
- print(
53
- Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
54
  break
55
 
56
- novelty_score = get_novelty_score(new_answer, previous_answers)
57
 
58
  if novelty_score < 0.1:
59
- print(
60
- Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
61
  break
62
 
63
- print(f"New Answer:\n{new_answer}")
64
- print(Fore.GREEN + f"Coherence Score: {coherence_score}")
65
- print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)
66
 
67
  previous_answers.append(new_answer)
68
  question_novelty += novelty_score
69
 
70
  except Exception as e:
71
- print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)
72
 
73
  time_taken = time.time() - start_time
74
- print(Fore.BLUE)
75
- print(f"Total novelty score for this question: {question_novelty}")
76
- print(f"Time taken: {time_taken} seconds")
77
- print(Style.RESET_ALL)
78
 
79
- return question_novelty
 
 
 
 
 
 
 
80
 
81
 
82
- def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key=None):
83
- new_embedding = embed(new_answer, openai_api_key)
84
 
85
- # If there are no previous answers, return maximum novelty
86
- if not previous_answers:
87
- return 1.0
88
 
89
- previous_embeddings = [embed(answer, openai_api_key) for answer in previous_answers]
90
 
91
- similarities = [
92
- np.dot(new_embedding, prev_embedding) /
93
- (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
94
- for prev_embedding in previous_embeddings
95
- ]
96
 
97
- max_similarity = max(similarities)
98
- novelty = 1 - max_similarity
99
 
100
- return novelty
101
 
102
 
103
- def benchmark_model_multithreaded(model_name):
104
  novelty_score = 0
105
- print_lock = threading.Lock()
 
106
 
107
  with ThreadPoolExecutor(max_workers=len(questions)) as executor:
108
  future_to_question = {executor.submit(
109
- process_question, question, model_name): question for question in questions}
110
 
111
  for future in as_completed(future_to_question):
112
  question = future_to_question[future]
113
 
114
- question_novelty = future.result()
115
- with print_lock:
116
- novelty_score += question_novelty
117
-
118
- print(Fore.YELLOW)
119
- print(f"Total novelty score across all questions: {novelty_score}")
120
- print(Style.RESET_ALL)
 
 
121
 
122
- return novelty_score
 
123
 
124
 
125
- def benchmark_model_sequential(model_name):
126
  novelty_score = 0
 
127
 
128
- for question in questions:
129
- question_novelty = process_question(question, model_name)
130
  novelty_score += question_novelty
 
 
131
 
132
- print(Fore.YELLOW)
133
- print(f"Total novelty score across all questions: {novelty_score}")
134
- print(Style.RESET_ALL)
135
-
136
- return novelty_score
137
-
138
 
139
- if __name__ == "__main__":
140
- args = parse_arguments()
141
- benchmark_model(args.model_name, multithreaded=not args.single_threaded)
 
1
  import numpy as np
2
  from models import chat_with_model, embed
3
+ from prompts import create_gen_prompt, create_judge_prompt
 
4
  import time
5
  from concurrent.futures import ThreadPoolExecutor, as_completed
6
  import threading
7
+ import streamlit as st # Import Streamlit
8
 
9
+ def process_question(question, model_name, open_router_key, openai_api_key):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  start_time = time.time()
11
+ st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True) # Display question in red
12
  previous_answers = []
13
  question_novelty = 0
14
 
 
16
  while True:
17
  gen_prompt = create_gen_prompt(question, previous_answers)
18
  try:
19
+ new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key, openai_api_key=openai_api_key)
20
  except Exception as e:
21
+ st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>", unsafe_allow_html=True) # Display error in red
22
  break
23
 
24
  judge_prompt = create_judge_prompt(question, new_answer)
25
  judge = "openai/gpt-4o-mini"
26
  try:
27
+ judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key, openai_api_key=openai_api_key)
28
  except Exception as e:
29
+ st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>", unsafe_allow_html=True) # Display error in red
30
  break
31
 
32
+ coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
 
33
 
34
  if coherence_score <= 3:
35
+ st.write("<span style='color:yellow'>Output is incoherent. Moving to next question.</span>", unsafe_allow_html=True) # Display warning in yellow
 
36
  break
37
 
38
+ novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
39
 
40
  if novelty_score < 0.1:
41
+ st.write("<span style='color:yellow'>Output is redundant. Moving to next question.</span>", unsafe_allow_html=True) # Display warning in yellow
 
42
  break
43
 
44
+ st.write(f"**New Answer:**\n{new_answer}")
45
+ st.write(f"<span style='color:green'>Coherence Score: {coherence_score}</span>", unsafe_allow_html=True) # Display coherence score in green
46
+ st.write(f"**Novelty Score:** {novelty_score}")
47
 
48
  previous_answers.append(new_answer)
49
  question_novelty += novelty_score
50
 
51
  except Exception as e:
52
+ st.write(f"<span style='color:red'>Unexpected error processing question: {str(e)}</span>", unsafe_allow_html=True) # Display error in red
53
 
54
  time_taken = time.time() - start_time
55
+ st.write(f"<span style='color:blue'>Total novelty score for this question: {question_novelty}</span>", unsafe_allow_html=True) # Display novelty score in blue
56
+ st.write(f"<span style='color:blue'>Time taken: {time_taken} seconds</span>", unsafe_allow_html=True) # Display time taken in blue
 
 
57
 
58
+ return question_novelty, [
59
+ {
60
+ "question": question,
61
+ "answers": previous_answers,
62
+ "coherence_score": coherence_score,
63
+ "novelty_score": question_novelty
64
+ }
65
+ ]
66
 
67
 
68
+ def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
69
+ new_embedding = embed(new_answer, openai_api_key)
70
 
71
+ # If there are no previous answers, return maximum novelty
72
+ if not previous_answers:
73
+ return 1.0
74
 
75
+ previous_embeddings = [embed(answer, openai_api_key) for answer in previous_answers]
76
 
77
+ similarities = [
78
+ np.dot(new_embedding, prev_embedding) /
79
+ (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
80
+ for prev_embedding in previous_embeddings
81
+ ]
82
 
83
+ max_similarity = max(similarities)
84
+ novelty = 1 - max_similarity
85
 
86
+ return novelty
87
 
88
 
89
+ def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key):
90
  novelty_score = 0
91
+ print_lock = threading.Lock() # Lock for thread-safe printing
92
+ results = []
93
 
94
  with ThreadPoolExecutor(max_workers=len(questions)) as executor:
95
  future_to_question = {executor.submit(
96
+ process_question, question, model_name, open_router_key, openai_api_key): question for question in questions}
97
 
98
  for future in as_completed(future_to_question):
99
  question = future_to_question[future]
100
 
101
+ try:
102
+ question_novelty, question_results = future.result()
103
+ with print_lock:
104
+ novelty_score += question_novelty
105
+ results.extend(question_results)
106
+ st.write(f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>", unsafe_allow_html=True)
107
+ except Exception as e:
108
+ with print_lock:
109
+ st.write(f"<span style='color:red'>Error in thread: {str(e)}</span>", unsafe_allow_html=True)
110
 
111
+ st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
112
+ return results
113
 
114
 
115
+ def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key):
116
  novelty_score = 0
117
+ results = []
118
 
119
+ for i, question in enumerate(questions):
120
+ question_novelty, question_results = process_question(question, model_name, open_router_key, openai_api_key)
121
  novelty_score += question_novelty
122
+ results.extend(question_results)
123
+ st.write(f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>", unsafe_allow_html=True) # Display progress after each question
124
 
125
+ st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
 
 
 
 
 
126
 
127
+ return results