Commit 15bbe10
Presidentlin committed
Parent(s): f0250b1
- .gitattributes +1 -1
- __pycache__/main.cpython-310.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- __pycache__/prompts.cpython-310.pyc +0 -0
- app.py +167 -2
- main.py +141 -0
- main.py:Zone.Identifier +0 -0
- models.py +34 -0
- models.py:Zone.Identifier +0 -0
- prompts.py +66 -0
- prompts.py:Zone.Identifier +0 -0
.gitattributes
CHANGED
@@ -1,4 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
+st*.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text

__pycache__/main.cpython-310.pyc
ADDED
Binary file (4.25 kB)

__pycache__/models.cpython-310.pyc
ADDED
Binary file (1.01 kB)

__pycache__/prompts.cpython-310.pyc
ADDED
Binary file (2.79 kB)

app.py
CHANGED
@@ -1,4 +1,169 @@
 import streamlit as st
+from main import get_novelty_score
+from models import chat_with_model, embed
+from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
+import requests
+import numpy as np
+import os  # Import the os module
 
-
-
+st.title("Aiden Bench - Generator")
+
+# API Key Inputs with Security and User Experience Enhancements
+st.warning("Please keep your API keys secure and confidential.")
+open_router_key = st.text_input("Enter your Open Router API Key:", type="password")
+openai_api_key = st.text_input("Enter your OpenAI API Key:", type="password")
+
+# Set environment variables (temporarily)
+os.environ["OPEN_ROUTER_KEY"] = open_router_key
+os.environ["OPENAI_API_KEY"] = openai_api_key
+
+# Fetch models from OpenRouter API
+try:
+    response = requests.get("https://openrouter.ai/api/v1/models")
+    response.raise_for_status()  # Raise an exception for bad status codes
+    models = response.json()["data"]
+
+    # Sort models alphabetically by their ID
+    models.sort(key=lambda model: model["id"])
+
+    model_names = [model["id"] for model in models]
+except requests.exceptions.RequestException as e:
+    st.error(f"Error fetching models from OpenRouter API: {e}")
+    model_names = []  # Provide an empty list if API call fails
+
+# Model Selection
+if model_names:
+    model_name = st.selectbox("Select a Language Model", model_names)
+else:
+    st.error("No models available. Please check your API connection.")
+    st.stop()  # Stop execution if no models are available
+
+# Initialize session state for user_questions and predefined_questions
+if "user_questions" not in st.session_state:
+    st.session_state.user_questions = []
+
+# Workflow Selection
+workflow = st.radio("Select Workflow:", ["Use Predefined Questions", "Use User-Defined Questions"])
+
+# Handle Predefined Questions
+if workflow == "Use Predefined Questions":
+    st.header("Question Selection")
+    # Multiselect for predefined questions
+    selected_questions = st.multiselect(
+        "Select questions to benchmark:",
+        predefined_questions,
+        predefined_questions  # Select all by default
+    )
+
+# Handle User-Defined Questions
+elif workflow == "Use User-Defined Questions":
+    st.header("Question Input")
+
+    # Input for adding a new question
+    new_question = st.text_input("Enter a new question:")
+    if st.button("Add Question") and new_question:
+        new_question = new_question.strip()  # Remove leading/trailing whitespace
+        if new_question and new_question not in st.session_state.user_questions:
+            st.session_state.user_questions.append(new_question)  # Append to session state
+            st.success(f"Question '{new_question}' added successfully.")
+        else:
+            st.warning("Question already exists or is empty!")
+
+    # Display multiselect with updated user questions
+    selected_questions = st.multiselect(
+        "Select your custom questions:",
+        options=st.session_state.user_questions,
+        default=st.session_state.user_questions
+    )
+
+# Display selected questions
+st.write("Selected Questions:", selected_questions)
+
+# Benchmark Execution
+if st.button("Start Benchmark"):
+    if not selected_questions:
+        st.warning("Please select at least one question.")
+    elif not open_router_key or not openai_api_key:  # Check if API keys are provided
+        st.warning("Please enter both API keys.")
+    else:
+        # Initialize progress bar
+        progress_bar = st.progress(0)
+        num_questions = len(selected_questions)
+        results = []  # List to store results
+
+        # Iterate through selected questions
+        for i, question in enumerate(selected_questions):
+            # Display current question
+            st.write(f"Processing question {i+1}/{num_questions}: {question}")
+
+            previous_answers = []
+            question_novelty = 0
+
+            try:
+                while True:
+                    gen_prompt = create_gen_prompt(question, previous_answers)
+
+                    # Handle potential API errors for chat_with_model
+                    try:
+                        new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
+                    except requests.exceptions.RequestException as e:
+                        st.error(f"API Error: {e}")
+                        break  # Exit the loop if API error occurs
+
+                    judge_prompt = create_judge_prompt(question, new_answer)
+                    judge = "openai/gpt-4o-mini"
+
+                    # Handle potential API errors for chat_with_model (judge)
+                    try:
+                        judge_response = chat_with_model(prompt=judge_prompt, model=judge)
+                    except requests.exceptions.RequestException as e:
+                        st.error(f"API Error (Judge): {e}")
+                        break  # Exit the loop if API error occurs
+
+                    coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
+
+                    if coherence_score <= 3:
+                        st.warning("Output is incoherent. Moving to next question.")
+                        break
+
+                    novelty_score = get_novelty_score(new_answer, previous_answers)
+
+                    if novelty_score < 0.1:
+                        st.warning("Output is redundant. Moving to next question.")
+                        break
+
+                    st.write(f"New Answer:\n{new_answer}")
+                    st.write(f"Coherence Score: {coherence_score}")
+                    st.write(f"Novelty Score: {novelty_score}")
+
+                    previous_answers.append(new_answer)
+                    question_novelty += novelty_score
+
+            except Exception as e:
+                st.error(f"Error processing question: {e}")
+
+
+            results.append({
+                "question": question,
+                "answers": previous_answers,
+                "coherence_score": coherence_score,
+                "novelty_score": novelty_score
+            })
+
+            # Update progress bar
+            progress_bar.progress((i + 1) / num_questions)
+
+        st.success("Benchmark completed!")
+
+        # Display results in a table
+        st.write("Results:")
+        results_table = []
+        for result in results:
+            for answer in result["answers"]:
+                results_table.append({
+                    "Question": result["question"],
+                    "Answer": answer,
+                    "Coherence Score": result["coherence_score"],
+                    "Novelty Score": result["novelty_score"]
+                })
+        st.table(results_table)

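For reference, a minimal sketch (not part of the commit) of how the split-based parsing in app.py extracts the judge's score from a reply containing <coherence_score> tags; the judge_response text here is made up:

judge_response = (
    "1. Question analysis: ...\n"
    "2. Answer assessment: ...\n"
    "<coherence_score>7</coherence_score>"
)
# Take the text after the opening tag, then cut it off at the closing tag.
coherence_score = int(
    judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0]
)
print(coherence_score)  # -> 7
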
main.py
ADDED
@@ -0,0 +1,141 @@
+import numpy as np
+from models import chat_with_model, embed
+from prompts import questions, create_gen_prompt, create_judge_prompt
+from colorama import Fore, Style
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import argparse
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Benchmark a language model.")
+    parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
+    parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
+    return parser.parse_args()
+
+
+def benchmark_model(model_name, multithreaded=False):
+    if multithreaded:
+        return benchmark_model_multithreaded(model_name)
+    else:
+        return benchmark_model_sequential(model_name)
+
+
+def process_question(question, model_name):
+    start_time = time.time()
+    print(Fore.RED + question + Style.RESET_ALL)
+    previous_answers = []
+    question_novelty = 0
+
+    try:
+        while True:
+            gen_prompt = create_gen_prompt(question, previous_answers)
+            try:
+                new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
+            except Exception as e:
+                print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
+                break
+
+            judge_prompt = create_judge_prompt(question, new_answer)
+            judge = "openai/gpt-4o-mini"
+            try:
+                judge_response = chat_with_model(prompt=judge_prompt, model=judge)
+            except Exception as e:
+                print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
+                break
+
+            coherence_score = int(judge_response.split("<coherence_score>")[
+                1].split("</coherence_score>")[0])
+
+            if coherence_score <= 3:
+                print(
+                    Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
+                break
+
+            novelty_score = get_novelty_score(new_answer, previous_answers)
+
+            if novelty_score < 0.1:
+                print(
+                    Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
+                break
+
+            print(f"New Answer:\n{new_answer}")
+            print(Fore.GREEN + f"Coherence Score: {coherence_score}")
+            print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)
+
+            previous_answers.append(new_answer)
+            question_novelty += novelty_score
+
+    except Exception as e:
+        print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)
+
+    time_taken = time.time() - start_time
+    print(Fore.BLUE)
+    print(f"Total novelty score for this question: {question_novelty}")
+    print(f"Time taken: {time_taken} seconds")
+    print(Style.RESET_ALL)
+
+    return question_novelty
+
+
+def get_novelty_score(new_answer: str, previous_answers: list):
+    new_embedding = embed(new_answer)
+
+    # If there are no previous answers, return maximum novelty
+    if not previous_answers:
+        return 1.0
+
+    previous_embeddings = [embed(answer) for answer in previous_answers]
+
+    similarities = [
+        np.dot(new_embedding, prev_embedding) /
+        (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
+        for prev_embedding in previous_embeddings
+    ]
+
+    max_similarity = max(similarities)
+    novelty = 1 - max_similarity
+
+    return novelty
+
+
+def benchmark_model_multithreaded(model_name):
+    novelty_score = 0
+    print_lock = threading.Lock()
+
+    with ThreadPoolExecutor(max_workers=len(questions)) as executor:
+        future_to_question = {executor.submit(
+            process_question, question, model_name): question for question in questions}
+
+        for future in as_completed(future_to_question):
+            question = future_to_question[future]
+
+            question_novelty = future.result()
+            with print_lock:
+                novelty_score += question_novelty
+
+    print(Fore.YELLOW)
+    print(f"Total novelty score across all questions: {novelty_score}")
+    print(Style.RESET_ALL)
+
+    return novelty_score
+
+
+def benchmark_model_sequential(model_name):
+    novelty_score = 0
+
+    for question in questions:
+        question_novelty = process_question(question, model_name)
+        novelty_score += question_novelty
+
+    print(Fore.YELLOW)
+    print(f"Total novelty score across all questions: {novelty_score}")
+    print(Style.RESET_ALL)
+
+    return novelty_score
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    benchmark_model(args.model_name, multithreaded=not args.single_threaded)

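For reference, a minimal sketch (not part of the commit) of the cosine-similarity novelty rule that get_novelty_score implements, using made-up embedding vectors in place of real OpenAI embeddings:

import numpy as np

new_embedding = np.array([1.0, 0.0, 0.0])
previous_embeddings = [np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 0.0])]

# Cosine similarity against each previous answer; novelty is 1 minus the closest match.
similarities = [
    np.dot(new_embedding, prev) / (np.linalg.norm(new_embedding) * np.linalg.norm(prev))
    for prev in previous_embeddings
]
novelty = 1 - max(similarities)
print(novelty)  # 0.0 -> identical to an earlier answer, so it falls below the 0.1 cutoff
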
main.py:Zone.Identifier
ADDED
File without changes
models.py
ADDED
@@ -0,0 +1,34 @@
+from openai import OpenAI
+import os
+from functools import lru_cache
+from retry import retry
+
+
+@retry(tries=3)
+def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
+    client = OpenAI(
+        api_key=os.getenv("OPEN_ROUTER_KEY"),
+        base_url="https://openrouter.ai/api/v1"
+    )
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ],
+        max_tokens=max_tokens,
+        temperature=temperature
+    )
+    return response.choices[0].message.content
+
+
+@lru_cache(maxsize=10000)
+@retry(tries=3)
+def embed(text):
+    client = OpenAI()
+
+    response = client.embeddings.create(
+        model="text-embedding-3-large", input=[text])
+    return response.data[0].embedding

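For reference, a minimal usage sketch (not part of the commit) for the two helpers above, assuming the openai and retry packages are installed and real keys replace the placeholder strings:

import os
from models import chat_with_model, embed

# Placeholder values; real keys are required for the calls below to succeed.
os.environ["OPEN_ROUTER_KEY"] = "your-openrouter-key"
os.environ["OPENAI_API_KEY"] = "your-openai-key"

reply = chat_with_model("Name one colour.", model="openai/gpt-4o-mini")
vector = embed(reply)  # results are cached per unique string via lru_cache
print(reply, len(vector))
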
models.py:Zone.Identifier
ADDED
File without changes
prompts.py
ADDED
@@ -0,0 +1,66 @@
+# Questions should be open-ended but demand concrete answers.
+questions = [
+    "Provide an explanation for Japan's Lost Decades.",
+    "What is a cause of World War 1?",
+
+]
+
+
+def create_gen_prompt(question: str, previous_answers: list) -> str:
+    prompt = (
+        "Answer the following question:.\n"
+        f"<question>{question}</question>\n"
+        "Your response should be a single brief sentence.\n"
+    )
+
+    if len(previous_answers) > 0:
+
+        previous_answers_str = "\n".join(
+            [f"{i+1}. {answer}" for i, answer in enumerate(previous_answers)]
+        )
+
+        prompt += (
+            "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
+            "Your previous answers are inside of <previous_answers></previous_answers> XML tags.\n"
+            f"<previous_answers>\n{previous_answers_str}\n</previous_answers>"
+        )
+
+    return prompt
+
+
+def create_judge_prompt(question: str, answer: str):
+    prompt = f""" Your task is to evaluate the coherence and plausibility of an answer to a given question. This involves assessing whether the answer makes sense and isn't nonsensical or implausible.
+
+Question: <question>{question}</question>
+Answer: <answer>{answer}</answer>
+
+Evaluation process:
+1. Understand the question: Analyze what the question is asking.
+2. Assess the answer: Determine if the answer is coherent and plausible.
+3. Check for nonsensical elements: Identify any aspects that are completely unrelated or absurd.
+
+Please think through each step carefully and show your reasoning:
+
+1. Question analysis:
+[Your brief analysis of the question here]
+
+2. Answer assessment:
+[Evaluate if the answer is coherent and plausible]
+
+3. Nonsensical check:
+[Identify any completely unrelated or absurd elements]
+
+Based on your analysis, provide a final Coherence and Plausibility Score on a scale of 1 - 10, where:
+1-3: Incoherent, implausible, or nonsensical
+4-6: Partially coherent and plausible, but with some issues
+7-8: Mostly coherent and plausible with minor issues
+9-10: Highly coherent and plausible
+
+Ensure that nonsensical or completely implausible answers receive very low scores (1-3).
+
+IMPORTANT: After your reasoning, you must provide your final Coherence and Plausibility Score as a single integer between 1 and 10, enclosed in <coherence_score></coherence_score> XML tags. For example:
+<coherence_score>7</coherence_score>
+
+Your response must end with this score in the specified format.
+"""
+    return prompt

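For reference, a minimal sketch (not part of the commit) of the prompt create_gen_prompt builds once previous answers exist; the expected output is abbreviated in the trailing comment:

from prompts import create_gen_prompt

prompt = create_gen_prompt(
    "What is a cause of World War 1?",
    ["The assassination of Archduke Franz Ferdinand."],
)
print(prompt)
# Answer the following question:.
# <question>What is a cause of World War 1?</question>
# Your response should be a single brief sentence.
# IMPORTANT: Provide an answer you *HAVE NOT* given previously.
# ...
# <previous_answers>
# 1. The assassination of Archduke Franz Ferdinand.
# </previous_answers>
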
prompts.py:Zone.Identifier
ADDED
File without changes