formatted_data.csv CHANGED
@@ -22,10 +22,10 @@ prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.01334
22
  prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928988
23
  prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.002112727564102551
24
  prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176408
25
- prediction-request-rag,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.625,5,8,3229.375,0.0017438625
26
  prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999664
27
  prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746164
 
 
28
  prediction-request-rag,databricks/dbrx-instruct:nitro,0.5,5,10,2651.8,0.00238661999999997
29
  prediction-online-sme,gpt-3.5-turbo-0125,0.49411764705882355,168,340,2189.1882352941175,0.002402523529411752
30
- prediction-online,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4666666666666667,147,315,3143.4285714285716,0.001697451428571419
31
- prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4,4,10,2957.6,0.0015971039999999998
 
22
  prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928988
23
  prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.002112727564102551
24
  prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176408
 
25
  prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999664
26
  prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746164
27
+ prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321892
28
+ prediction-request-rag,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.5018587360594795,135,269,3099.4869888475837,0.001673722973977683
29
  prediction-request-rag,databricks/dbrx-instruct:nitro,0.5,5,10,2651.8,0.00238661999999997
30
  prediction-online-sme,gpt-3.5-turbo-0125,0.49411764705882355,168,340,2189.1882352941175,0.002402523529411752
31
+ prediction-online,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4666666666666667,147,315,3143.4285714285716,0.001697451428571419
 
images/autocast_dataset_timeline.png ADDED
tabs/faq.py CHANGED
@@ -1,20 +1,23 @@
1
  about_olas_predict_benchmark = """\
2
  How good are LLMs at making predictions about events in the future? This is a topic that hasn't been well explored to date.
3
- [Olas Predict](https://olas.network/services/prediction-agents) aims to rectify this by incentivizing the creation of agents that predict the future (through prediction markets).
4
- This is a leaderboard showing the performance of LLM tools for making predictions (event forecasting) on a dataset, refined from Autocast.\
5
- The leaderboard shows tool performance in terms of accuracy and cost. \
6
 
 
 
 
 
 
 
 
7
  🤗 Pick a tool and run it on the benchmark using the "🔥 Run the Benchmark" page!
8
  """
9
 
10
  about_the_tools = """\
11
  - [Prediction Offline](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, but no web crawling, to make predictions
12
  - [Prediction Online](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, as well as web crawling, to make predictions
13
- - [Prediction SME](https://github.com/valory-xyz/mech/blob/main/packages/nickcom007/customs/prediction_request_sme/prediction_request_sme.py) - Use prompt engineering to get the LLM to act as a Subject Matter Expert (SME) in making a prediction.
14
  - [Prediction with RAG](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_rag/prediction_request_rag.py) - Uses retrieval-augmented generation (RAG) over extracted search results to make predictions.
15
- - [Prediction with Research Report](https://github.com/valory-xyz/mech/blob/main/packages/polywrap/customs/prediction_with_research_report/prediction_with_research_report.py) - Generates a research report before making a prediction.
16
  - [Prediction with Reasoning](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_reasoning/prediction_request_reasoning.py) - Incorporates an additional call to the LLM to do reasoning over retrieved data.
17
- - [Prediction with CoT](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_url_cot/prediction_url_cot.py) - Use Chain of Thought (CoT) to make predictions.
18
  """
19
 
20
  about_the_dataset = """\
 
1
  about_olas_predict_benchmark = """\
2
  How good are LLMs at making predictions about events in the future? This is a topic that hasn't been well explored to date.
3
+ [Olas Predict](https://olas.network/services/prediction-agents) aims to rectify this by incentivizing the creation of agents that make predictions about future events (through prediction markets).
4
+ These agents are tested in the wild on real-time prediction market data, which you can see [here](https://huggingface.co/datasets/valory/prediction_market_data) on Hugging Face (updated weekly).\
 
5
 
6
+ However, if you want to create an agent with new tools, waiting for real-time results to arrive is slow. This is where the Olas Predict Benchmark comes in. It allows devs to backtest new approaches on a historical event forecasting dataset (refined from [Autocast](https://arxiv.org/abs/2206.15474)) with high iteration speed.
7
+
8
+ 🗓 🧐 The Autocast dataset's resolved questions come from a timeline ending in 2022, so the models may have been trained on some of this data. Thus, the currently reported accuracy may be an in-sample forecasting measure.
9
+ However, we can learn about the relative strengths of the different approaches (e.g., models and logic) before testing the most promising ones on real-time unseen data.
10
+ This HF Space showcases the performance of the various models and workflows (called tools in the Olas ecosystem) for making predictions, in terms of accuracy and cost.\
11
+
12
+
13
  🤗 Pick a tool and run it on the benchmark using the "🔥 Run the Benchmark" page!
14
  """
15
 
16
  about_the_tools = """\
17
  - [Prediction Offline](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, but no web crawling, to make predictions
18
  - [Prediction Online](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, as well as web crawling, to make predictions
 
19
  - [Prediction with RAG](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_rag/prediction_request_rag.py) - Uses retrieval-augmented generation (RAG) over extracted search results to make predictions.
 
20
  - [Prediction with Reasoning](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_reasoning/prediction_request_reasoning.py) - Incorporates an additional call to the LLM to do reasoning over retrieved data.
 
21
  """
22
 
23
  about_the_dataset = """\