Presidentlin committed
Commit 15bbe10 · 1 Parent(s): f0250b1
.gitattributes CHANGED
@@ -1,4 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
+st*.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
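Each of these patterns routes matching files through Git LFS (filter, diff, and merge handled by lfs; -text disables Git's text normalization). For instance, running git lfs track "*.pt" would append an analogous line:

*.pt filter=lfs diff=lfs merge=lfs -text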
__pycache__/main.cpython-310.pyc ADDED
Binary file (4.25 kB)

__pycache__/models.cpython-310.pyc ADDED
Binary file (1.01 kB)

__pycache__/prompts.cpython-310.pyc ADDED
Binary file (2.79 kB)
app.py CHANGED
@@ -1,4 +1,169 @@
 import streamlit as st
+from main import get_novelty_score
+from models import chat_with_model, embed
+from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
+import requests
+import numpy as np
+import os  # used to hand the API keys to models.py via environment variables
 
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)
+st.title("Aiden Bench - Generator")
+
+# API key inputs (masked for security)
+st.warning("Please keep your API keys secure and confidential.")
+open_router_key = st.text_input("Enter your Open Router API Key:", type="password")
+openai_api_key = st.text_input("Enter your OpenAI API Key:", type="password")
+
+# Set environment variables (for this session only)
+os.environ["OPEN_ROUTER_KEY"] = open_router_key
+os.environ["OPENAI_API_KEY"] = openai_api_key
+
+# Fetch the list of available models from the OpenRouter API
+try:
+    response = requests.get("https://openrouter.ai/api/v1/models")
+    response.raise_for_status()  # Raise an exception for bad status codes
+    models = response.json()["data"]
+
+    # Sort models alphabetically by their ID
+    models.sort(key=lambda model: model["id"])
+
+    model_names = [model["id"] for model in models]
+except requests.exceptions.RequestException as e:
+    st.error(f"Error fetching models from OpenRouter API: {e}")
+    model_names = []  # Fall back to an empty list if the API call fails
+
+# Model selection
+if model_names:
+    model_name = st.selectbox("Select a Language Model", model_names)
+else:
+    st.error("No models available. Please check your API connection.")
+    st.stop()  # Stop execution if no models are available
+
+# Initialize session state for user-defined questions
+if "user_questions" not in st.session_state:
+    st.session_state.user_questions = []
+
+# Workflow selection
+workflow = st.radio("Select Workflow:", ["Use Predefined Questions", "Use User-Defined Questions"])
+
+# Handle predefined questions
+if workflow == "Use Predefined Questions":
+    st.header("Question Selection")
+    # Multiselect for predefined questions
+    selected_questions = st.multiselect(
+        "Select questions to benchmark:",
+        predefined_questions,
+        predefined_questions  # Select all by default
+    )
+
+# Handle user-defined questions
+elif workflow == "Use User-Defined Questions":
+    st.header("Question Input")
+
+    # Input for adding a new question
+    new_question = st.text_input("Enter a new question:")
+    if st.button("Add Question") and new_question:
+        new_question = new_question.strip()  # Remove leading/trailing whitespace
+        if new_question and new_question not in st.session_state.user_questions:
+            st.session_state.user_questions.append(new_question)
+            st.success(f"Question '{new_question}' added successfully.")
+        else:
+            st.warning("Question already exists or is empty!")
+
+    # Display multiselect with the updated user questions
+    selected_questions = st.multiselect(
+        "Select your custom questions:",
+        options=st.session_state.user_questions,
+        default=st.session_state.user_questions
+    )
+
+# Display selected questions
+st.write("Selected Questions:", selected_questions)
+
+# Benchmark execution
+if st.button("Start Benchmark"):
+    if not selected_questions:
+        st.warning("Please select at least one question.")
+    elif not open_router_key or not openai_api_key:  # Check that both API keys are provided
+        st.warning("Please enter both API keys.")
+    else:
+        # Initialize the progress bar
+        progress_bar = st.progress(0)
+        num_questions = len(selected_questions)
+        results = []  # List to store results
+
+        # Iterate through the selected questions
+        for i, question in enumerate(selected_questions):
+            # Display the current question
+            st.write(f"Processing question {i+1}/{num_questions}: {question}")
+
+            previous_answers = []
+            question_novelty = 0
+            coherence_score = novelty_score = None  # Defined up front so results can be recorded even if the first API call fails
+
+            try:
+                while True:
+                    gen_prompt = create_gen_prompt(question, previous_answers)
+
+                    # Handle potential API errors for chat_with_model
+                    try:
+                        new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
+                    except requests.exceptions.RequestException as e:
+                        st.error(f"API Error: {e}")
+                        break  # Exit the loop if an API error occurs
+
+                    judge_prompt = create_judge_prompt(question, new_answer)
+                    judge = "openai/gpt-4o-mini"
+
+                    # Handle potential API errors for chat_with_model (judge)
+                    try:
+                        judge_response = chat_with_model(prompt=judge_prompt, model=judge)
+                    except requests.exceptions.RequestException as e:
+                        st.error(f"API Error (Judge): {e}")
+                        break  # Exit the loop if an API error occurs
+
+                    coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
+
+                    if coherence_score <= 3:
+                        st.warning("Output is incoherent. Moving to next question.")
+                        break
+
+                    novelty_score = get_novelty_score(new_answer, previous_answers)
+
+                    if novelty_score < 0.1:
+                        st.warning("Output is redundant. Moving to next question.")
+                        break
+
+                    st.write(f"New Answer:\n{new_answer}")
+                    st.write(f"Coherence Score: {coherence_score}")
+                    st.write(f"Novelty Score: {novelty_score}")
+
+                    previous_answers.append(new_answer)
+                    question_novelty += novelty_score
+
+            except Exception as e:
+                st.error(f"Error processing question: {e}")
+
+            results.append({
+                "question": question,
+                "answers": previous_answers,
+                "coherence_score": coherence_score,  # Last scores seen for this question
+                "novelty_score": novelty_score
+            })
+
+            # Update the progress bar
+            progress_bar.progress((i + 1) / num_questions)
+
+        st.success("Benchmark completed!")
+
+        # Display results in a table
+        st.write("Results:")
+        results_table = []
+        for result in results:
+            for answer in result["answers"]:
+                results_table.append({
+                    "Question": result["question"],
+                    "Answer": answer,
+                    "Coherence Score": result["coherence_score"],
+                    "Novelty Score": result["novelty_score"]
+                })
+        st.table(results_table)
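Both app.py and main.py recover the judge's score with chained str.split calls, which raise IndexError or ValueError whenever the judge omits or malforms the <coherence_score> tag. A minimal defensive sketch (parse_coherence_score is a hypothetical helper, not part of this commit):

import re

def parse_coherence_score(judge_response: str):
    """Return the integer inside <coherence_score>...</coherence_score>, or None if missing/malformed."""
    match = re.search(r"<coherence_score>\s*(\d+)\s*</coherence_score>", judge_response)
    if match is None:
        return None  # judge ignored the required output format
    score = int(match.group(1))
    return score if 1 <= score <= 10 else None

Callers could then treat None the same way as a score of 3 or below and move on to the next question.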
main.py ADDED
@@ -0,0 +1,141 @@
+import numpy as np
+from models import chat_with_model, embed
+from prompts import questions, create_gen_prompt, create_judge_prompt
+from colorama import Fore, Style
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+import argparse
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="Benchmark a language model.")
+    parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
+    parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
+    return parser.parse_args()
+
+
+def benchmark_model(model_name, multithreaded=False):
+    if multithreaded:
+        return benchmark_model_multithreaded(model_name)
+    else:
+        return benchmark_model_sequential(model_name)
+
+
+def process_question(question, model_name):
+    start_time = time.time()
+    print(Fore.RED + question + Style.RESET_ALL)
+    previous_answers = []
+    question_novelty = 0
+
+    try:
+        while True:
+            gen_prompt = create_gen_prompt(question, previous_answers)
+            try:
+                new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
+            except Exception as e:
+                print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
+                break
+
+            judge_prompt = create_judge_prompt(question, new_answer)
+            judge = "openai/gpt-4o-mini"
+            try:
+                judge_response = chat_with_model(prompt=judge_prompt, model=judge)
+            except Exception as e:
+                print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
+                break
+
+            coherence_score = int(judge_response.split(
+                "<coherence_score>")[1].split("</coherence_score>")[0])
+
+            if coherence_score <= 3:
+                print(Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
+                break
+
+            novelty_score = get_novelty_score(new_answer, previous_answers)
+
+            if novelty_score < 0.1:
+                print(Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
+                break
+
+            print(f"New Answer:\n{new_answer}")
+            print(Fore.GREEN + f"Coherence Score: {coherence_score}")
+            print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)
+
+            previous_answers.append(new_answer)
+            question_novelty += novelty_score
+
+    except Exception as e:
+        print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)
+
+    time_taken = time.time() - start_time
+    print(Fore.BLUE)
+    print(f"Total novelty score for this question: {question_novelty}")
+    print(f"Time taken: {time_taken} seconds")
+    print(Style.RESET_ALL)
+
+    return question_novelty
+
+
+def get_novelty_score(new_answer: str, previous_answers: list):
+    new_embedding = embed(new_answer)
+
+    # If there are no previous answers, return maximum novelty
+    if not previous_answers:
+        return 1.0
+
+    previous_embeddings = [embed(answer) for answer in previous_answers]
+
+    similarities = [
+        np.dot(new_embedding, prev_embedding) /
+        (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
+        for prev_embedding in previous_embeddings
+    ]
+
+    max_similarity = max(similarities)
+    novelty = 1 - max_similarity
+
+    return novelty
+
+
+def benchmark_model_multithreaded(model_name):
+    novelty_score = 0
+    print_lock = threading.Lock()  # guards the shared novelty accumulator
+
+    with ThreadPoolExecutor(max_workers=len(questions)) as executor:
+        future_to_question = {
+            executor.submit(process_question, question, model_name): question
+            for question in questions
+        }
+
+        for future in as_completed(future_to_question):
+            question = future_to_question[future]
+            question_novelty = future.result()
+            with print_lock:
+                novelty_score += question_novelty
+
+    print(Fore.YELLOW)
+    print(f"Total novelty score across all questions: {novelty_score}")
+    print(Style.RESET_ALL)
+
+    return novelty_score
+
+
+def benchmark_model_sequential(model_name):
+    novelty_score = 0
+
+    for question in questions:
+        question_novelty = process_question(question, model_name)
+        novelty_score += question_novelty
+
+    print(Fore.YELLOW)
+    print(f"Total novelty score across all questions: {novelty_score}")
+    print(Style.RESET_ALL)
+
+    return novelty_score
+
+
+if __name__ == "__main__":
+    args = parse_arguments()
+    benchmark_model(args.model_name, multithreaded=not args.single_threaded)
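For intuition, get_novelty_score above is 1 minus the maximum cosine similarity between the new answer's embedding and the embeddings of all previous answers. A toy check with hand-made vectors (NumPy only, no API calls):

import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

new = np.array([1.0, 0.0])
previous = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
similarities = [cosine(new, p) for p in previous]
print(1 - max(similarities))  # 0.0: an identical previous answer makes the new one fully redundant

Per the argparse setup, the script benchmarks multithreaded by default; python main.py "openai/gpt-4o-mini" --single-threaded opts into sequential mode.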
main.py:Zone.Identifier ADDED
File without changes
models.py ADDED
@@ -0,0 +1,34 @@
+from openai import OpenAI
+import os
+from functools import lru_cache
+from retry import retry
+
+
+@retry(tries=3)
+def chat_with_model(prompt, model, max_tokens=4000, temperature=0):
+    client = OpenAI(
+        api_key=os.getenv("OPEN_ROUTER_KEY"),
+        base_url="https://openrouter.ai/api/v1"
+    )
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ],
+        max_tokens=max_tokens,
+        temperature=temperature
+    )
+    return response.choices[0].message.content
+
+
+@lru_cache(maxsize=10000)
+@retry(tries=3)
+def embed(text):
+    client = OpenAI()  # reads OPENAI_API_KEY from the environment
+
+    response = client.embeddings.create(
+        model="text-embedding-3-large", input=[text])
+    return response.data[0].embedding
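Assuming OPEN_ROUTER_KEY and OPENAI_API_KEY are already set (app.py sets them from its text inputs), the two helpers can be exercised directly; the keys and model ID below are placeholders for illustration:

import os
os.environ.setdefault("OPEN_ROUTER_KEY", "sk-or-...")  # placeholder
os.environ.setdefault("OPENAI_API_KEY", "sk-...")      # placeholder

from models import chat_with_model, embed

reply = chat_with_model("Name one cause of World War 1.", model="openai/gpt-4o-mini")
vector = embed(reply)  # cached via lru_cache; text-embedding-3-large should return 3072 floats
print(reply, len(vector))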
models.py:Zone.Identifier ADDED
File without changes
prompts.py ADDED
@@ -0,0 +1,66 @@
+# Questions should be open-ended but demand concrete answers.
+questions = [
+    "Provide an explanation for Japan's Lost Decades.",
+    "What is a cause of World War 1?",
+
+]
+
+
+def create_gen_prompt(question: str, previous_answers: list) -> str:
+    prompt = (
+        "Answer the following question:\n"
+        f"<question>{question}</question>\n"
+        "Your response should be a single brief sentence.\n"
+    )
+
+    if len(previous_answers) > 0:
+
+        previous_answers_str = "\n".join(
+            [f"{i+1}. {answer}" for i, answer in enumerate(previous_answers)]
+        )
+
+        prompt += (
+            "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
+            "Your previous answers are inside of <previous_answers></previous_answers> XML tags.\n"
+            f"<previous_answers>\n{previous_answers_str}\n</previous_answers>"
+        )
+
+    return prompt
+
+
+def create_judge_prompt(question: str, answer: str):
+    prompt = f"""Your task is to evaluate the coherence and plausibility of an answer to a given question. This involves assessing whether the answer makes sense and isn't nonsensical or implausible.
+
+Question: <question>{question}</question>
+Answer: <answer>{answer}</answer>
+
+Evaluation process:
+1. Understand the question: Analyze what the question is asking.
+2. Assess the answer: Determine if the answer is coherent and plausible.
+3. Check for nonsensical elements: Identify any aspects that are completely unrelated or absurd.
+
+Please think through each step carefully and show your reasoning:
+
+1. Question analysis:
+[Your brief analysis of the question here]
+
+2. Answer assessment:
+[Evaluate if the answer is coherent and plausible]
+
+3. Nonsensical check:
+[Identify any completely unrelated or absurd elements]
+
+Based on your analysis, provide a final Coherence and Plausibility Score on a scale of 1 - 10, where:
+1-3: Incoherent, implausible, or nonsensical
+4-6: Partially coherent and plausible, but with some issues
+7-8: Mostly coherent and plausible with minor issues
+9-10: Highly coherent and plausible
+
+Ensure that nonsensical or completely implausible answers receive very low scores (1-3).
+
+IMPORTANT: After your reasoning, you must provide your final Coherence and Plausibility Score as a single integer between 1 and 10, enclosed in <coherence_score></coherence_score> XML tags. For example:
+<coherence_score>7</coherence_score>
+
+Your response must end with this score in the specified format.
+"""
+    return prompt
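A quick look at what the generator prompt accumulates across turns, using the second predefined question and one hypothetical prior answer:

from prompts import create_gen_prompt

print(create_gen_prompt(
    "What is a cause of World War 1?",
    ["The assassination of Archduke Franz Ferdinand."]
))
# The question is wrapped in <question> tags; the prior answer appears as a
# numbered item inside <previous_answers> tags, steering the model toward a
# new single-sentence answer.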
prompts.py:Zone.Identifier ADDED
File without changes