Presidentlin committed
Commit 15bbe10 · 1 Parent(s): f0250b1
.gitattributes CHANGED
@@ -1,4 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
+st*.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
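Each of these patterns routes matching files through Git LFS (filter, diff, and merge handled by lfs; -text disables Git's text normalization). For instance, running git lfs track "*.pt" would append an analogous line:

*.pt filter=lfs diff=lfs merge=lfs -text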
__pycache__/main.cpython-310.pyc ADDED
Binary file (4.25 kB)

__pycache__/models.cpython-310.pyc ADDED
Binary file (1.01 kB)

__pycache__/prompts.cpython-310.pyc ADDED
Binary file (2.79 kB)
app.py CHANGED
@@ -1,4 +1,169 @@
 import streamlit as st
+from main import get_novelty_score
+from models import chat_with_model, embed
+from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
+import requests
+import numpy as np
+import os  # used to hand the API keys to models.py via environment variables
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+st.title("Aiden Bench - Generator")
+
+# API key inputs (masked for security)
+st.warning("Please keep your API keys secure and confidential.")
+open_router_key = st.text_input("Enter your Open Router API Key:", type="password")
+openai_api_key = st.text_input("Enter your OpenAI API Key:", type="password")
+
+# Set environment variables (for this session only)
+os.environ["OPEN_ROUTER_KEY"] = open_router_key
+os.environ["OPENAI_API_KEY"] = openai_api_key
+
+# Fetch the list of available models from the OpenRouter API
+try:
+    response = requests.get("https://openrouter.ai/api/v1/models")
+    response.raise_for_status()  # Raise an exception for bad status codes
+    models = response.json()["data"]
+
+    # Sort models alphabetically by their ID
+    models.sort(key=lambda model: model["id"])
+
+    model_names = [model["id"] for model in models]
+except requests.exceptions.RequestException as e:
+    st.error(f"Error fetching models from OpenRouter API: {e}")
+    model_names = []  # Fall back to an empty list if the API call fails
+
+# Model selection
+if model_names:
+    model_name = st.selectbox("Select a Language Model", model_names)
+else:
+    st.error("No models available. Please check your API connection.")
+    st.stop()  # Stop execution if no models are available
+
+# Initialize session state for user-defined questions
+if "user_questions" not in st.session_state:
+    st.session_state.user_questions = []
+
+# Workflow selection
+workflow = st.radio("Select Workflow:", ["Use Predefined Questions", "Use User-Defined Questions"])
+
+# Handle predefined questions
+if workflow == "Use Predefined Questions":
+    st.header("Question Selection")
+    # Multiselect for predefined questions
+    selected_questions = st.multiselect(
+        "Select questions to benchmark:",
+        predefined_questions,
+        predefined_questions  # Select all by default
+    )
+
+# Handle user-defined questions
+elif workflow == "Use User-Defined Questions":
+    st.header("Question Input")
+
+    # Input for adding a new question
+    new_question = st.text_input("Enter a new question:")
+    if st.button("Add Question") and new_question:
+        new_question = new_question.strip()  # Remove leading/trailing whitespace
+        if new_question and new_question not in st.session_state.user_questions:
+            st.session_state.user_questions.append(new_question)
+            st.success(f"Question '{new_question}' added successfully.")
+        else:
+            st.warning("Question already exists or is empty!")
+
+    # Display multiselect with the updated user questions
+    selected_questions = st.multiselect(
+        "Select your custom questions:",
+        options=st.session_state.user_questions,
+        default=st.session_state.user_questions
+    )
+
+# Display selected questions
+st.write("Selected Questions:", selected_questions)
+
+# Benchmark execution
+if st.button("Start Benchmark"):
+    if not selected_questions:
+        st.warning("Please select at least one question.")
+    elif not open_router_key or not openai_api_key:  # Check that both API keys are provided
+        st.warning("Please enter both API keys.")
+    else:
+        # Initialize the progress bar
+        progress_bar = st.progress(0)
+        num_questions = len(selected_questions)
+        results = []  # List to store results
+
+        # Iterate through the selected questions
+        for i, question in enumerate(selected_questions):
+            # Display the current question
+            st.write(f"Processing question {i+1}/{num_questions}: {question}")
+
+            previous_answers = []
+            question_novelty = 0
+            coherence_score = novelty_score = None  # Defined up front so results can be recorded even if the first API call fails
+
+            try:
+                while True:
+                    gen_prompt = create_gen_prompt(question, previous_answers)
+
+                    # Handle potential API errors for chat_with_model
+                    try:
+                        new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
+                    except requests.exceptions.RequestException as e:
+                        st.error(f"API Error: {e}")
+                        break  # Exit the loop if an API error occurs
+
+                    judge_prompt = create_judge_prompt(question, new_answer)
+                    judge = "openai/gpt-4o-mini"
+
+                    # Handle potential API errors for chat_with_model (judge)
+                    try:
+                        judge_response = chat_with_model(prompt=judge_prompt, model=judge)
+                    except requests.exceptions.RequestException as e:
+                        st.error(f"API Error (Judge): {e}")
+                        break  # Exit the loop if an API error occurs
+
+                    coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
+
+                    if coherence_score <= 3:
+                        st.warning("Output is incoherent. Moving to next question.")
+                        break
+
+                    novelty_score = get_novelty_score(new_answer, previous_answers)
+
+                    if novelty_score < 0.1:
+                        st.warning("Output is redundant. Moving to next question.")
+                        break
+
+                    st.write(f"New Answer:\n{new_answer}")
+                    st.write(f"Coherence Score: {coherence_score}")
+                    st.write(f"Novelty Score: {novelty_score}")
+
+                    previous_answers.append(new_answer)
+                    question_novelty += novelty_score
+
+            except Exception as e:
+                st.error(f"Error processing question: {e}")
+
+            results.append({
+                "question": question,
+                "answers": previous_answers,
+                "coherence_score": coherence_score,  # Last scores seen for this question
+                "novelty_score": novelty_score
+            })
+
+            # Update the progress bar
+            progress_bar.progress((i + 1) / num_questions)
+
+        st.success("Benchmark completed!")
+
+        # Display results in a table
+        st.write("Results:")
+        results_table = []
+        for result in results:
+            for answer in result["answers"]:
+                results_table.append({
+                    "Question": result["question"],
+                    "Answer": answer,
+                    "Coherence Score": result["coherence_score"],
+                    "Novelty Score": result["novelty_score"]
+                })
+        st.table(results_table)
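Both app.py and main.py recover the judge's score with chained str.split calls, which raise IndexError or ValueError whenever the judge omits or malforms the <coherence_score> tag. A minimal defensive sketch (parse_coherence_score is a hypothetical helper, not part of this commit):

import re

def parse_coherence_score(judge_response: str):
    """Return the integer inside <coherence_score>...</coherence_score>, or None if missing/malformed."""
    match = re.search(r"<coherence_score>\s*(\d+)\s*</coherence_score>", judge_response)
    if match is None:
        return None  # judge ignored the required output format
    score = int(match.group(1))
    return score if 1 <= score <= 10 else None

Callers could then treat None the same way as a score of 3 or below and move on to the next question.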
main.py ADDED
@@ -0,0 +1,141 @@
+import numpy as np
+from models import chat_with_model, embed
+from prompts import questions, create_gen_prompt, create_judge_prompt
+from colorama import Fore, Style
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import argparse
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Benchmark a language model.")
+    parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
+    parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
+    return parser.parse_args()
+
+
+def benchmark_model(model_name, multithreaded=False):
+    if multithreaded:
+        return benchmark_model_multithreaded(model_name)
+    else:
+        return benchmark_model_sequential(model_name)
+
+
+def process_question(question, model_name):
+    start_time = time.time()
+    print(Fore.RED + question + Style.RESET_ALL)
+    previous_answers = []
+    question_novelty = 0
+
+    try:
+        while True:
+            gen_prompt = create_gen_prompt(question, previous_answers)
+            try:
+                new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
+            except Exception as e:
+                print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
+                break
+
+            judge_prompt = create_judge_prompt(question, new_answer)
+            judge = "openai/gpt-4o-mini"
+            try:
+                judge_response = chat_with_model(prompt=judge_prompt, model=judge)
+            except Exception as e:
+                print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
+                break
+
+            coherence_score = int(judge_response.split(
+                "<coherence_score>")[1].split("</coherence_score>")[0])
+
+            if coherence_score <= 3:
+                print(Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
+                break
+
+            novelty_score = get_novelty_score(new_answer, previous_answers)
+
+            if novelty_score < 0.1:
+                print(Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
+                break
+
+            print(f"New Answer:\n{new_answer}")
+            print(Fore.GREEN + f"Coherence Score: {coherence_score}")
+            print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)
+
+            previous_answers.append(new_answer)
+            question_novelty += novelty_score
+
+    except Exception as e:
+        print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)
+
+    time_taken = time.time() - start_time
+    print(Fore.BLUE)
+    print(f"Total novelty score for this question: {question_novelty}")
+    print(f"Time taken: {time_taken} seconds")
+    print(Style.RESET_ALL)
+
+    return question_novelty
+
+
+def get_novelty_score(new_answer: str, previous_answers: list):
+    new_embedding = embed(new_answer)
+
+    # If there are no previous answers, return maximum novelty
+    if not previous_answers:
+        return 1.0
+
+    previous_embeddings = [embed(answer) for answer in previous_answers]
+
+    similarities = [
+        np.dot(new_embedding, prev_embedding) /
+        (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
+        for prev_embedding in previous_embeddings
+    ]
+
+    max_similarity = max(similarities)
+    novelty = 1 - max_similarity
+
+    return novelty
+
+
+def benchmark_model_multithreaded(model_name):
+    novelty_score = 0
+    print_lock = threading.Lock()  # guards the shared novelty accumulator
+
+    with ThreadPoolExecutor(max_workers=len(questions)) as executor:
+        future_to_question = {
+            executor.submit(process_question, question, model_name): question
+            for question in questions
+        }
+
+        for future in as_completed(future_to_question):
+            question = future_to_question[future]
+            question_novelty = future.result()
+            with print_lock:
+                novelty_score += question_novelty
+
+    print(Fore.YELLOW)
+    print(f"Total novelty score across all questions: {novelty_score}")
+    print(Style.RESET_ALL)
+
+    return novelty_score
+
+
+def benchmark_model_sequential(model_name):
+    novelty_score = 0
+
+    for question in questions:
+        question_novelty = process_question(question, model_name)
+        novelty_score += question_novelty
+
+    print(Fore.YELLOW)
+    print(f"Total novelty score across all questions: {novelty_score}")
+    print(Style.RESET_ALL)
+
+    return novelty_score
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    benchmark_model(args.model_name, multithreaded=not args.single_threaded)
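For intuition, get_novelty_score above is 1 minus the maximum cosine similarity between the new answer's embedding and the embeddings of all previous answers. A toy check with hand-made vectors (NumPy only, no API calls):

import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

new = np.array([1.0, 0.0])
previous = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
similarities = [cosine(new, p) for p in previous]
print(1 - max(similarities))  # 0.0: an identical previous answer makes the new one fully redundant

Per the argparse setup, the script benchmarks multithreaded by default; python main.py "openai/gpt-4o-mini" --single-threaded opts into sequential mode.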
main.py:Zone.Identifier ADDED
File without changes
models.py ADDED
@@ -0,0 +1,34 @@
+from openai import OpenAI
+import os
+from functools import lru_cache
+from retry import retry
+
+
+@retry(tries=3)
+def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
+    client = OpenAI(
+        api_key=os.getenv("OPEN_ROUTER_KEY"),
+        base_url="https://openrouter.ai/api/v1"
+    )
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ],
+        max_tokens=max_tokens,
+        temperature=temperature
+    )
+    return response.choices[0].message.content
+
+
+@lru_cache(maxsize=10000)
+@retry(tries=3)
+def embed(text):
+    client = OpenAI()  # reads OPENAI_API_KEY from the environment
+
+    response = client.embeddings.create(
+        model="text-embedding-3-large", input=[text])
+    return response.data[0].embedding
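Assuming OPEN_ROUTER_KEY and OPENAI_API_KEY are already set (app.py sets them from its text inputs), the two helpers can be exercised directly; the keys and model ID below are placeholders for illustration:

import os
os.environ.setdefault("OPEN_ROUTER_KEY", "sk-or-...")  # placeholder
os.environ.setdefault("OPENAI_API_KEY", "sk-...")      # placeholder

from models import chat_with_model, embed

reply = chat_with_model("Name one cause of World War 1.", model="openai/gpt-4o-mini")
vector = embed(reply)  # cached via lru_cache; text-embedding-3-large should return 3072 floats
print(reply, len(vector))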
models.py:Zone.Identifier ADDED
File without changes
prompts.py ADDED
@@ -0,0 +1,66 @@
+# Questions should be open-ended but demand concrete answers.
+questions = [
+    "Provide an explanation for Japan's Lost Decades.",
+    "What is a cause of World War 1?",
+
+]
+
+
+def create_gen_prompt(question: str, previous_answers: list) -> str:
+    prompt = (
+        "Answer the following question:\n"
+        f"<question>{question}</question>\n"
+        "Your response should be a single brief sentence.\n"
+    )
+
+    if len(previous_answers) > 0:
+
+        previous_answers_str = "\n".join(
+            [f"{i+1}. {answer}" for i, answer in enumerate(previous_answers)]
+        )
+
+        prompt += (
+            "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
+            "Your previous answers are inside of <previous_answers></previous_answers> XML tags.\n"
+            f"<previous_answers>\n{previous_answers_str}\n</previous_answers>"
+        )
+
+    return prompt
+
+
+def create_judge_prompt(question: str, answer: str):
+    prompt = f"""Your task is to evaluate the coherence and plausibility of an answer to a given question. This involves assessing whether the answer makes sense and isn't nonsensical or implausible.
+
+Question: <question>{question}</question>
+Answer: <answer>{answer}</answer>
+
+Evaluation process:
+1. Understand the question: Analyze what the question is asking.
+2. Assess the answer: Determine if the answer is coherent and plausible.
+3. Check for nonsensical elements: Identify any aspects that are completely unrelated or absurd.
+
+Please think through each step carefully and show your reasoning:
+
+1. Question analysis:
+[Your brief analysis of the question here]
+
+2. Answer assessment:
+[Evaluate if the answer is coherent and plausible]
+
+3. Nonsensical check:
+[Identify any completely unrelated or absurd elements]
+
+Based on your analysis, provide a final Coherence and Plausibility Score on a scale of 1 - 10, where:
+1-3: Incoherent, implausible, or nonsensical
+4-6: Partially coherent and plausible, but with some issues
+7-8: Mostly coherent and plausible with minor issues
+9-10: Highly coherent and plausible
+
+Ensure that nonsensical or completely implausible answers receive very low scores (1-3).
+
+IMPORTANT: After your reasoning, you must provide your final Coherence and Plausibility Score as a single integer between 1 and 10, enclosed in <coherence_score></coherence_score> XML tags. For example:
+<coherence_score>7</coherence_score>
+
+Your response must end with this score in the specified format.
+"""
+    return prompt
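A quick look at what the generator prompt accumulates across turns, using the second predefined question and one hypothetical prior answer:

from prompts import create_gen_prompt

print(create_gen_prompt(
    "What is a cause of World War 1?",
    ["The assassination of Archduke Franz Ferdinand."]
))
# The question is wrapped in <question> tags; the prior answer appears as a
# numbered item inside <previous_answers> tags, steering the model toward a
# new single-sentence answer.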
prompts.py:Zone.Identifier ADDED
File without changes