adding_note
#2
by
cyberosa
- opened
- formatted_data.csv +3 -3
- images/autocast_dataset_timeline.png +0 -0
- tabs/faq.py +9 -6
formatted_data.csv
CHANGED
@@ -22,10 +22,10 @@ prediction-online,claude-2,0.6600660066006601,200,303,1505.3135313531352,0.01334
|
|
22 |
prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928988
|
23 |
prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.002112727564102551
|
24 |
prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176408
|
25 |
-
prediction-request-rag,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.625,5,8,3229.375,0.0017438625
|
26 |
prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999664
|
27 |
prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746164
|
|
|
|
|
28 |
prediction-request-rag,databricks/dbrx-instruct:nitro,0.5,5,10,2651.8,0.00238661999999997
|
29 |
prediction-online-sme,gpt-3.5-turbo-0125,0.49411764705882355,168,340,2189.1882352941175,0.002402523529411752
|
30 |
-
prediction-online,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4666666666666667,147,315,3143.4285714285716,0.001697451428571419
|
31 |
-
prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4,4,10,2957.6,0.0015971039999999998
|
|
|
22 |
prediction-offline,gpt-3.5-turbo-0125,0.6578171091445427,223,339,730.1740412979351,0.0007721681415928988
|
23 |
prediction-request-reasoning,gpt-3.5-turbo-0125,0.6506410256410257,203,312,1871.173076923077,0.002112727564102551
|
24 |
prediction-offline-sme,gpt-3.5-turbo-0125,0.6294117647058823,214,340,1341.8323529411764,0.0014778852941176408
|
|
|
25 |
prediction-request-reasoning,databricks/dbrx-instruct:nitro,0.5555555555555556,5,9,2257.8888888888887,0.0020320999999999664
|
26 |
prediction-online,gpt-3.5-turbo-0125,0.551622418879056,187,339,1576.684365781711,0.0016928525073746164
|
27 |
+
prediction-request-reasoning,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.535593220338983,158,295,2921.172881355932,0.0015774333559321892
|
28 |
+
prediction-request-rag,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.5018587360594795,135,269,3099.4869888475837,0.001673722973977683
|
29 |
prediction-request-rag,databricks/dbrx-instruct:nitro,0.5,5,10,2651.8,0.00238661999999997
|
30 |
prediction-online-sme,gpt-3.5-turbo-0125,0.49411764705882355,168,340,2189.1882352941175,0.002402523529411752
|
31 |
+
prediction-online,nousresearch/nous-hermes-2-mixtral-8x7b-sft,0.4666666666666667,147,315,3143.4285714285716,0.001697451428571419
|
|
images/autocast_dataset_timeline.png
ADDED
tabs/faq.py
CHANGED
@@ -1,20 +1,23 @@
|
|
1 |
about_olas_predict_benchmark = """\
|
2 |
How good are LLMs at making predictions about events in the future? This is a topic that hasn't been well explored to date.
|
3 |
-
[Olas Predict](https://olas.network/services/prediction-agents) aims to rectify this by incentivizing the creation of agents that
|
4 |
-
|
5 |
-
The leaderboard shows tool performance in terms of accuracy and cost. \
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
π€ Pick a tool and run it on the benchmark using the "π₯ Run the Benchmark" page!
|
8 |
"""
|
9 |
|
10 |
about_the_tools = """\
|
11 |
- [Prediction Offline](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, but no web crawling, to make predictions
|
12 |
- [Prediction Online](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, as well as web crawling, to make predictions
|
13 |
-
- [Prediction SME](https://github.com/valory-xyz/mech/blob/main/packages/nickcom007/customs/prediction_request_sme/prediction_request_sme.py) - Use prompt engineering to get the LLM to act as a Subject Matter Expert (SME) in making a prediction.
|
14 |
- [Prediction with RAG](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_rag/prediction_request_rag.py) - Uses retrieval-augment-generation (RAG) over extracted search result to make predictions.
|
15 |
-
- [Prediction with Research Report](https://github.com/valory-xyz/mech/blob/main/packages/polywrap/customs/prediction_with_research_report/prediction_with_research_report.py) - Generates a research report before making a prediction.
|
16 |
- [Prediction with Reasoning](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_reasoning/prediction_request_reasoning.py) - Incorporates an additional call to the LLM to do reasoning over retrieved data.
|
17 |
-
- [Prediction with CoT](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_url_cot/prediction_url_cot.py) - Use Chain of Thought (CoT) to make predictions.
|
18 |
"""
|
19 |
|
20 |
about_the_dataset = """\
|
|
|
1 |
about_olas_predict_benchmark = """\
|
2 |
How good are LLMs at making predictions about events in the future? This is a topic that hasn't been well explored to date.
|
3 |
+
[Olas Predict](https://olas.network/services/prediction-agents) aims to rectify this by incentivizing the creation of agents that make predictions about future events (through prediction markets).
|
4 |
+
These agents are tested in the wild on real-time prediction market data, which you can see [here](https://huggingface.co/datasets/valory/prediction_market_data) on HuggingFace (updated weekly).\
|
|
|
5 |
|
6 |
+
However, if you want to create an agent with new tools, waiting for real-time results to arrive is slow. This is where the Olas Predict Benchmark comes in. It allows devs to backtest new approaches on a historical event forecasting dataset (refined from [Autocast](https://arxiv.org/abs/2206.15474)) with high iteration speed.
|
7 |
+
|
8 |
+
π π§ The Autocast dataset's resolved questions are from a timeline ending in 2022, so the models might have been trained on some of this data. Thus the currently reported accuracy measure might be an in-sample forecasting one.
|
9 |
+
However, we can learn about the relative strengths of the different approaches (e.g., models and logic), before testing the most promising ones on real-time unseen data.
|
10 |
+
This HF Space showcases the performance of the various models and workflows (called tools in the Olas ecosystem) for making predictions, in terms of accuracy and cost.\
|
11 |
+
|
12 |
+
|
13 |
π€ Pick a tool and run it on the benchmark using the "π₯ Run the Benchmark" page!
|
14 |
"""
|
15 |
|
16 |
about_the_tools = """\
|
17 |
- [Prediction Offline](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, but no web crawling, to make predictions
|
18 |
- [Prediction Online](https://github.com/valory-xyz/mech/blob/main/packages/valory/customs/prediction_request/prediction_request.py) - Uses prompt engineering, as well as web crawling, to make predictions
|
|
|
19 |
- [Prediction with RAG](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_rag/prediction_request_rag.py) - Uses retrieval-augmented generation (RAG) over extracted search results to make predictions.
|
|
|
20 |
- [Prediction with Reasoning](https://github.com/valory-xyz/mech/blob/main/packages/napthaai/customs/prediction_request_reasoning/prediction_request_reasoning.py) - Incorporates an additional call to the LLM to do reasoning over retrieved data.
|
|
|
21 |
"""
|
22 |
|
23 |
about_the_dataset = """\
|