import os

# from benchmark.run_benchmark import run_benchmark


def run_benchmark_main(
    tool_name,
    model_name,
    num_questions,
    openai_api_key,
    anthropic_api_key,
    openrouter_api_key,
):
    """Run the benchmark using the provided function and API key."""

    print("Running benchmark for the provided api keys")
    # Empyt the results directory
    os.system("rm -rf results/*")

    # Set the benchmark parameters
    kwargs = {}
    # Default to 10 questions when no count is provided.
    if not num_questions:
        num_questions = 10
    kwargs["num_questions"] = num_questions
    kwargs["tools"] = [tool_name]
    if model_name:
        kwargs["model"] = [model_name]
    kwargs["api_keys"] = {}
    if openai_api_key:
        kwargs["api_keys"]["openai"] = openai_api_key
    if anthropic_api_key:
        kwargs["api_keys"]["anthropic"] = anthropic_api_key
    if openrouter_api_key:
        kwargs["api_keys"]["openrouter"] = openrouter_api_key

    if "gpt" in model_name:
        kwargs["llm_provider"] = "openai"
    elif "claude" in model_name:
        kwargs["llm_provider"] = "anthropic"
    else:
        kwargs["llm_provider"] = "openrouter"

    # Tools that use RAG also need an OpenAI key, whatever the chosen model.
    if tool_name in ("prediction-request-reasoning", "prediction-request-rag"):
        if not openai_api_key:
            return "Error: Tools that use RAG also require an OpenAI API Key"

    kwargs["num_urls"] = 3
    kwargs["num_words"] = 300
    kwargs["provide_source_links"] = True

    print(f"Running benchmark")

    # Run the benchmark
    try:
        # run_benchmark(kwargs=kwargs)
        return "completed"
    except Exception as e:
        return f"Error running benchmark: {e}"