cyberosa committed
Commit 16d0da9 • Parent: e3f2881

disabling the run benchmark feature to fix the leaderboard

Files changed (4):
  1. .gitmodules +0 -3
  2. app.py +127 -125
  3. olas-predict-benchmark +0 -1
  4. tabs/faq.py +2 -2
.gitmodules DELETED
@@ -1,3 +0,0 @@
-[submodule "olas-predict-benchmark"]
-	path = olas-predict-benchmark
-	url = https://github.com/valory-xyz/olas-predict-benchmark.git

app.py CHANGED
@@ -11,69 +11,71 @@ from tabs.faq import (
     about_the_tools,
 )
 from tabs.howto_benchmark import how_to_run
-from tabs.run_benchmark import run_benchmark_main
+
+# Feature temporarily disabled til HF support helps us with the Space Error
+# from tabs.run_benchmark import run_benchmark_main
 
 
 demo = gr.Blocks()
 
 
-def run_benchmark_gradio(
-    tool_name,
-    model_name,
-    num_questions,
-    openai_api_key,
-    anthropic_api_key,
-    openrouter_api_key,
-):
-    """Run the benchmark using inputs."""
-    if tool_name is None:
-        return "Please enter the name of your tool."
-    if (
-        openai_api_key is None
-        and anthropic_api_key is None
-        and openrouter_api_key is None
-    ):
-        return "Please enter either OpenAI or Anthropic or OpenRouter API key."
-
-    result = run_benchmark_main(
-        tool_name,
-        model_name,
-        num_questions,
-        openai_api_key,
-        anthropic_api_key,
-        openrouter_api_key,
-    )
+# def run_benchmark_gradio(
+#     tool_name,
+#     model_name,
+#     num_questions,
+#     openai_api_key,
+#     anthropic_api_key,
+#     openrouter_api_key,
+# ):
+#     """Run the benchmark using inputs."""
+#     if tool_name is None:
+#         return "Please enter the name of your tool."
+#     if (
+#         openai_api_key is None
+#         and anthropic_api_key is None
+#         and openrouter_api_key is None
+#     ):
+#         return "Please enter either OpenAI or Anthropic or OpenRouter API key."
+
+#     result = run_benchmark_main(
+#         tool_name,
+#         model_name,
+#         num_questions,
+#         openai_api_key,
+#         anthropic_api_key,
+#         openrouter_api_key,
+#     )
 
-    if result == "completed":
-        # get the results file in the results directory
-        fns = glob("results/*.csv")
+#     if result == "completed":
+#         # get the results file in the results directory
+#         fns = glob("results/*.csv")
 
-        print(f"Number of files in results directory: {len(fns)}")
+#         print(f"Number of files in results directory: {len(fns)}")
 
-        # convert to Path
-        files = [Path(file) for file in fns]
+#         # convert to Path
+#         files = [Path(file) for file in fns]
 
-        # get results and summary files
-        results_files = [file for file in files if "results" in file.name]
+#         # get results and summary files
+#         results_files = [file for file in files if "results" in file.name]
 
-        # the other file is the summary file
-        summary_files = [file for file in files if "summary" in file.name]
+#         # the other file is the summary file
+#         summary_files = [file for file in files if "summary" in file.name]
 
-        print(results_files, summary_files)
+#         print(results_files, summary_files)
 
-        # get the path with results
-        results_df = pd.read_csv(results_files[0])
-        summary_df = pd.read_csv(summary_files[0])
+#         # get the path with results
+#         results_df = pd.read_csv(results_files[0])
+#         summary_df = pd.read_csv(summary_files[0])
 
-        # make sure all df float values are rounded to 4 decimal places
-        results_df = results_df.round(4)
-        summary_df = summary_df.round(4)
+#         # make sure all df float values are rounded to 4 decimal places
+#         results_df = results_df.round(4)
+#         summary_df = summary_df.round(4)
 
-        return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
+#         return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
 
-    return gr.Textbox(
-        label="Benchmark Result", value=result, interactive=False
-    ), gr.Textbox(label="Summary", value="")
+#     return gr.Textbox(
+#         label="Benchmark Result", value=result, interactive=False
+#     ), gr.Textbox(label="Summary", value="")
 
 
 with demo:
@@ -110,83 +112,83 @@ with demo:
         gr.Markdown(how_to_run)
 
     # fourth tab - run the benchmark
-    with gr.TabItem("🔥 Run the Benchmark"):
-        with gr.Row():
-            tool_name = gr.Dropdown(
-                [
-                    "prediction-offline",
-                    "prediction-online",
-                    # "prediction-online-summarized-info",
-                    # "prediction-offline-sme",
-                    # "prediction-online-sme",
-                    "prediction-request-rag",
-                    "prediction-request-reasoning",
-                    # "prediction-url-cot-claude",
-                    # "prediction-request-rag-cohere",
-                    # "prediction-with-research-conservative",
-                    # "prediction-with-research-bold",
-                ],
-                label="Tool Name",
-                info="Choose the tool to run",
-            )
-            model_name = gr.Dropdown(
-                [
-                    "gpt-3.5-turbo-0125",
-                    "gpt-4-0125-preview",
-                    "claude-3-haiku-20240307",
-                    "claude-3-sonnet-20240229",
-                    "claude-3-opus-20240229",
-                    "databricks/dbrx-instruct:nitro",
-                    "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
-                    # "cohere/command-r-plus",
-                ],
-                label="Model Name",
-                info="Choose the model to use",
-            )
-        with gr.Row():
-            openai_api_key = gr.Textbox(
-                label="OpenAI API Key",
-                placeholder="Enter your OpenAI API key here",
-                type="password",
-            )
-            anthropic_api_key = gr.Textbox(
-                label="Anthropic API Key",
-                placeholder="Enter your Anthropic API key here",
-                type="password",
-            )
-            openrouter_api_key = gr.Textbox(
-                label="OpenRouter API Key",
-                placeholder="Enter your OpenRouter API key here",
-                type="password",
-            )
-        with gr.Row():
-            num_questions = gr.Slider(
-                minimum=1,
-                maximum=340,
-                value=10,
-                label="Number of questions to run the benchmark on",
-            )
-        with gr.Row():
-            run_button = gr.Button("Run Benchmark")
-        with gr.Row():
-            with gr.Accordion("Results", open=True):
-                result = gr.Dataframe()
-        with gr.Row():
-            with gr.Accordion("Summary", open=False):
-                summary = gr.Dataframe()
-
-        run_button.click(
-            run_benchmark_gradio,
-            inputs=[
-                tool_name,
-                model_name,
-                num_questions,
-                openai_api_key,
-                anthropic_api_key,
-                openrouter_api_key,
-            ],
-            outputs=[result, summary],
-        )
+    # with gr.TabItem("🔥 Run the Benchmark"):
+    #     with gr.Row():
+    #         tool_name = gr.Dropdown(
+    #             [
+    #                 "prediction-offline",
+    #                 "prediction-online",
+    #                 # "prediction-online-summarized-info",
+    #                 # "prediction-offline-sme",
+    #                 # "prediction-online-sme",
+    #                 "prediction-request-rag",
+    #                 "prediction-request-reasoning",
+    #                 # "prediction-url-cot-claude",
+    #                 # "prediction-request-rag-cohere",
+    #                 # "prediction-with-research-conservative",
+    #                 # "prediction-with-research-bold",
+    #             ],
+    #             label="Tool Name",
+    #             info="Choose the tool to run",
+    #         )
+    #         model_name = gr.Dropdown(
+    #             [
+    #                 "gpt-3.5-turbo-0125",
+    #                 "gpt-4-0125-preview",
+    #                 "claude-3-haiku-20240307",
+    #                 "claude-3-sonnet-20240229",
+    #                 "claude-3-opus-20240229",
+    #                 "databricks/dbrx-instruct:nitro",
+    #                 "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
+    #                 # "cohere/command-r-plus",
+    #             ],
+    #             label="Model Name",
+    #             info="Choose the model to use",
+    #         )
+    #     with gr.Row():
+    #         openai_api_key = gr.Textbox(
+    #             label="OpenAI API Key",
+    #             placeholder="Enter your OpenAI API key here",
+    #             type="password",
+    #         )
+    #         anthropic_api_key = gr.Textbox(
+    #             label="Anthropic API Key",
+    #             placeholder="Enter your Anthropic API key here",
+    #             type="password",
+    #         )
+    #         openrouter_api_key = gr.Textbox(
+    #             label="OpenRouter API Key",
+    #             placeholder="Enter your OpenRouter API key here",
+    #             type="password",
+    #         )
+    #     with gr.Row():
+    #         num_questions = gr.Slider(
+    #             minimum=1,
+    #             maximum=340,
+    #             value=10,
+    #             label="Number of questions to run the benchmark on",
+    #         )
+    #     with gr.Row():
+    #         run_button = gr.Button("Run Benchmark")
+    #     with gr.Row():
+    #         with gr.Accordion("Results", open=True):
+    #             result = gr.Dataframe()
+    #     with gr.Row():
+    #         with gr.Accordion("Summary", open=False):
+    #             summary = gr.Dataframe()
+
+    #     run_button.click(
+    #         run_benchmark_gradio,
+    #         inputs=[
+    #             tool_name,
+    #             model_name,
+    #             num_questions,
+    #             openai_api_key,
+    #             anthropic_api_key,
+    #             openrouter_api_key,
+    #         ],
+    #         outputs=[result, summary],
+    #     )
 
 
 demo.queue(default_concurrency_limit=40).launch()
 
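Commenting the feature out, as this commit does, is the fastest way to stabilize the Space. For reference, a small feature flag would let the tab be toggled from the Space settings instead of editing app.py each time. A minimal sketch, assuming a hypothetical RUN_BENCHMARK environment variable (the variable name and the placeholder tab body are illustrative, not part of this commit):

import os

import gradio as gr

# Hypothetical flag: set RUN_BENCHMARK=true in the Space settings to re-enable the tab.
RUN_BENCHMARK = os.getenv("RUN_BENCHMARK", "false").lower() == "true"

with gr.Blocks() as demo:
    with gr.TabItem("🔥 Run the Benchmark"):
        if RUN_BENCHMARK:
            # Lazy import, so a broken benchmark dependency cannot crash the whole Space.
            from tabs.run_benchmark import run_benchmark_main  # noqa: F401

            gr.Markdown("Build the dropdowns, key fields and run button here, as in the diff above.")
        else:
            gr.Markdown("The benchmark runner is temporarily disabled.")

demo.queue(default_concurrency_limit=40).launch()

With the flag defaulting to false, the leaderboard tabs keep working even when olas-predict-benchmark is unavailable, and re-enabling the runner becomes a settings change rather than a revert commit.
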
olas-predict-benchmark DELETED
@@ -1 +0,0 @@
-Subproject commit cdb77050567ef441e231960cb2a26c20cf09cc30

tabs/faq.py CHANGED
@@ -10,7 +10,7 @@ However, we can learn about the relative strengths of the different approaches (
 This HF Space showcases the performance of the various models and workflows (called tools in the Olas ecosystem) for making predictions, in terms of accuracy and cost.\
 
 
-🤗 Pick a tool and run it on the benchmark using the "🔥 Run the Benchmark" page!
+🤗 Pick a tool and run it on the benchmark using the "🔥 Run the Benchmark" page! (This feature is temporarily disabled due to an error in HF Spaces)
 """
 
 about_the_tools = """\
@@ -48,4 +48,4 @@ about_olas_predict = """\
 Olas is a network of autonomous services that can run complex logic in a decentralized manner, interacting with on- and off-chain data autonomously and continuously. For other use cases check out [olas.network](https://olas.network/).
 Since 'Olas' means 'waves' in Spanish, it is sometimes referred to as the 'ocean of services' 🌊.
 The project is co-created by [Valory](https://www.valory.xyz/). Valory aspires to enable communities, organizations and countries to co-own AI systems, beginning with decentralized autonomous agents.
-"""
+"""