rosacastillo committed
Commit · ea0955a · 1 Parent(s): 5d61ee1
Removed dependency on tools.parquet and made the mech calls computation timestamp-based
Browse files
- app.py +34 -17
- data/all_trades_profitability.parquet +2 -2
- data/{summary_profitability.parquet → error_by_markets.parquet} +2 -2
- data/invalid_trades.parquet +2 -2
- data/service_map.pkl +1 -1
- data/tools.parquet +0 -3
- data/tools_accuracy.csv +2 -2
- data/unknown_traders.parquet +2 -2
- data/{t_map.pkl → winning_df.parquet} +2 -2
- notebooks/mech_calls_analysis.ipynb +125 -97
- notebooks/tool_errors_analysis.ipynb +923 -34
- scripts/cleaning_old_info.py +14 -13
- scripts/daily_data.py +1 -1
- scripts/get_mech_info.py +11 -41
- scripts/gnosis_timestamps.py +184 -0
- scripts/mech_request_utils.py +9 -9
- scripts/nr_mech_calls.py +13 -15
- scripts/profitability.py +24 -90
- scripts/pull_data.py +7 -12
- scripts/roi_analysis.py +0 -129
- scripts/staking.py +7 -8
- scripts/tools.py +6 -317
- scripts/tools_metrics.py +93 -0
- scripts/update_tools_accuracy.py +3 -5
- scripts/web3_utils.py +12 -4
- tabs/error.py +2 -21
- tabs/metrics.py +0 -73
- tabs/tool_win.py +0 -34
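
The app.py diff below drops the heavy tools.parquet dependency and has the dashboard read the smaller precomputed parquet files through duckdb instead. A minimal sketch of that loading pattern, assuming only what the diff shows (the file names and the in-memory duckdb connection); the helper name load_tables is illustrative, not part of the repo:

import duckdb

# Each dashboard table is read straight from a precomputed parquet file;
# duckdb's read_parquet scans the file without a prior load step.
PARQUET_FILES = [
    "./data/error_by_markets.parquet",
    "./data/unknown_traders.parquet",
    "./data/winning_df.parquet",
]

def load_tables(paths=PARQUET_FILES):
    con = duckdb.connect(":memory:")
    try:
        return [
            con.execute(f"SELECT * FROM read_parquet('{p}')").fetchdf()
            for p in paths
        ]
    finally:
        con.close()

Loading only these small files keeps dashboard start-up independent of the 616 MB tools.parquet that this commit deletes.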
app.py CHANGED

@@ -1,4 +1,3 @@
-from datetime import datetime, timedelta
 import gradio as gr
 import pandas as pd
 import duckdb

@@ -23,8 +22,6 @@ from tabs.metrics import (
 )
 
 from tabs.tool_win import (
-    prepare_tools,
-    get_tool_winning_rate_by_market,
     integrated_plot_tool_winnings_overall_per_market_by_week,
     integrated_tool_winnings_by_tool_per_market,
 )

@@ -44,7 +41,6 @@ from tabs.invalid_markets import (
 from tabs.error import (
     plot_week_error_data_by_market,
     plot_error_data_by_market,
-    get_error_data_by_market,
     get_error_data_overall_by_market,
     plot_tool_error_data_by_market,
 )

@@ -77,6 +73,12 @@ def get_all_data():
     logger.info("Getting all data")
 
     con = duckdb.connect(":memory:")
+    query6 = f"""
+    SELECT *
+    FROM read_parquet('./data/winning_df.parquet')
+    """
+    df6 = con.execute(query6).fetchdf()
+
     query5 = f"""
     SELECT *
     FROM read_parquet('./data/unknown_traders.parquet')

@@ -107,26 +109,30 @@ def get_all_data():
 
     query1 = f"""
     SELECT *
-    FROM read_parquet('./data/…
+    FROM read_parquet('./data/error_by_markets.parquet')
     """
     df1 = con.execute(query1).fetchdf()
-    logger.info("Got all data from…
+    logger.info("Got all data from error_by_markets.parquet")
 
     con.close()
 
-    return df1, df2, df3, df4, df5
+    return df1, df2, df3, df4, df5, df6
 
 
 def prepare_data():
     """
     Prepare the data for the dashboard
     """
-    … (three removed lines, truncated in the page)
+    (
+        error_by_markets,
+        trades_df,
+        tools_accuracy_info,
+        invalid_trades,
+        unknown_trades,
+        winning_df,
+    ) = get_all_data()
     print(trades_df.info())
 
-    tools_df = prepare_tools(tools_df)
     trades_df = prepare_trades(trades_df)
     unknown_trades = prepare_trades(unknown_trades)

@@ -145,22 +151,33 @@ def prepare_data():
     outliers.to_parquet("./data/outliers.parquet")
     trades_df = trades_df.loc[trades_df["roi"] < 1000]
 
-    return …
+    return (
+        error_by_markets,
+        trades_df,
+        tools_accuracy_info,
+        invalid_trades,
+        unknown_trades,
+        winning_df,
+    )
 
 
-… (three removed lines, truncated in the page)
+(
+    error_by_markets,
+    trades_df,
+    tools_accuracy_info,
+    invalid_trades,
+    unknown_trades,
+    winning_df,
+) = prepare_data()
 trades_df = trades_df.sort_values(by="creation_timestamp", ascending=True)
 unknown_trades = unknown_trades.sort_values(by="creation_timestamp", ascending=True)
 
 demo = gr.Blocks()
 
 # preparing data for the errors
-…
+
 error_overall_by_markets = get_error_data_overall_by_market(error_df=error_by_markets)
 
-winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
 # preparing data for the trades graph
 trades_count_df = get_overall_trades(trades_df=trades_df)
 trades_by_market = get_overall_by_market_trades(trades_df=trades_df)
data/all_trades_profitability.parquet CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:2dc010db5a3f4163f3d09274101a14cd63a860e64c92649c694c816f28799342
+size 6789999
data/{summary_profitability.parquet → error_by_markets.parquet} RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:cbe47e7cb744db4522161c6c121ac9393937d53ca372a2210952f7a469f59489
+size 12067
data/invalid_trades.parquet CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:9b7c4c831e583b8632a6a45079df9e400fea4e40287bbed594624ad9f9437907
+size 196588
data/service_map.pkl CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
+oid sha256:93ac540e1bcd347a48b9978b87443ae64af0f8b0a4daff305c4ad99cd0959a73
 size 90766
data/tools.parquet DELETED

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0d5753d5858231903cf1bc20f47a54dae742f35da95ed15ddcb5f44a5be8338f
-size 616260724
data/tools_accuracy.csv CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:818026934d2218b01f130770ffcb7563c80de0900be6721a55cd2499f9731889
+size 1100
data/unknown_traders.parquet CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:0164ef5ecaf966a5dcc677d96bba860c344f43cf53e237b6687b797502bd5e36
+size 184719
data/{t_map.pkl → winning_df.parquet} RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:fe676fcd7dde4b833f770dafa8e474a96bbe17fb16b9ceb160c03c2519ba72b4
+size 12980
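
All of the data/ entries above are Git LFS pointer files rather than the data itself: three lines giving the spec version, the sha256 oid of the stored blob, and its size in bytes (so the deleted tools.parquet weighed in at 616260724 bytes). A small sketch of parsing such a pointer; the helper is illustrative, not part of this repo:

def read_lfs_pointer(path):
    """Parse a git-lfs pointer file into {'version', 'oid', 'size'}."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    fields["size"] = int(fields["size"])  # size is the blob's byte count
    return fields

# e.g. read_lfs_pointer("data/tools_accuracy.csv")
# -> {'version': 'https://git-lfs.github.com/spec/v1',
#     'oid': 'sha256:818026934d2218b01f130770ffcb7563c80de0900be6721a55cd2499f9731889',
#     'size': 1100}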
notebooks/mech_calls_analysis.ipynb CHANGED

@@ -59,7 +59,6 @@ / @@ -69,6 +68,7 @@ — in the rendered all_trades.head() table header, the is_invalid column is removed and an nr_mech_calls column is added:

-      " <th>is_invalid</th>\n",
 ...
+      " <th>nr_mech_calls</th>\n",

@@ -76,125 +76,125 @@ / @@ -206,48 +206,48 @@ — the HTML and text/plain renderings of all_trades.head() are regenerated for the new data (5 rows × 22 columns; the previously truncated cell values now appear in full). The text/plain output reads:

                                             trade_id \
 0  0x0dfb9821725003c4d3007999968d34d7070959ef0x01...
 1  0x1082be4e429e512182089162f41b3a86a52eee370x01...
 2  0x150f4d4e5affa7fe332684d7c828c0a471c4d5de0x01...
 3  0x15edf592dc3eb67e1c163ceb6d23039710cd67fb0x01...
 4  0x187c822a330c393912398884faf8150d21b4a7840x01...

           creation_timestamp \
 0  2024-10-27 21:51:25+00:00
 1  2024-10-31 22:50:15+00:00
 2  2024-10-29 02:21:25+00:00
 3  2024-10-28 21:59:25+00:00
 4  2024-10-30 00:30:45+00:00

                                                title market_status \
 0  Will any mainstream U.S. news outlet publish a...        CLOSED
 1  Will Prime Minister Shigeru Ishiba announce a ...        CLOSED
 2  Will the Constitutional Democratic Party of Ja...        CLOSED
 3  Will there be a public statement from the Bide...        CLOSED
 4  Will the Bank of Japan issue a public statemen...        CLOSED

    collateral_amount outcome_index trade_fee_amount outcomes_tokens_traded \
 0           0.461993             1         0.004620               0.734537
 1           0.859939             0         0.008599               2.714890
 2           0.203751             1         0.002038               0.305174
 3           0.412054             1         0.004121               0.666936
 4           0.333192             0         0.003332               0.447445

    ... winning_trade  earnings redeemed redeemed_amount num_mech_calls \
 0  ...          True  0.734537     True        0.734537            2.0
 1  ...         False  0.000000    False        0.000000            8.0
 2  ...          True  0.305174     True        0.305174            2.0
 3  ...         False  0.000000    False        0.000000            2.0
 4  ...          True  0.447445     True        0.447445            8.0

    mech_fee_amount net_earnings       roi      staking nr_mech_calls
 0             0.02     0.247924  0.509488  non_staking           NaN
 1             0.08    -0.948538 -1.000000  non_staking           NaN
 2             0.02     0.079385  0.351592  non_staking           NaN
 3             0.02    -0.436175 -1.000000  non_staking           NaN
 4             0.08     0.030922  0.074237  non_staking           NaN

 [5 rows x 22 columns]

@@ -259,6 +259,34 @@ — a new cell summarizing the per-trade mech call counts is inserted after all_trades.head():

+ all_trades.num_mech_calls.describe()

  count    43987.000000
  mean         6.663537
  std         13.608287
  min          0.000000
  25%          2.000000
  50%          5.000000
  75%          8.000000
  max        650.000000
  Name: num_mech_calls, dtype: float64

The following cell (execution count 4) is unchanged context.
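
The commit message says the new mech calls computation is timestamp-based, and the notebook above now carries a num_mech_calls value per trade (mean ≈ 6.66, max 650). A plausible sketch of such a timestamp-based count, assuming a trades frame with trader_address and creation_timestamp and a requests frame with trader_address and request_time as seen in these notebooks; the pairing rule and window are assumptions, not the actual logic of scripts/nr_mech_calls.py:

import pandas as pd

def count_mech_calls(trades: pd.DataFrame, requests: pd.DataFrame,
                     window: pd.Timedelta = pd.Timedelta(days=2)) -> pd.Series:
    # For each trade, count mech requests made by the same trader in the
    # `window` leading up to the trade's creation_timestamp.
    by_trader = requests.groupby("trader_address")["request_time"]
    counts = []
    for _, trade in trades.iterrows():
        try:
            times = by_trader.get_group(trade["trader_address"])
        except KeyError:  # this trader made no mech requests at all
            counts.append(0)
            continue
        end = trade["creation_timestamp"]
        counts.append(int(((times > end - window) & (times <= end)).sum()))
    return pd.Series(counts, index=trades.index, name="num_mech_calls")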
notebooks/tool_errors_analysis.ipynb CHANGED

@@ -2,7 +2,7 @@ / @@ -15,16 +15,234 @@ — execution counts are bumped (47, 48, 49, 51, …) and the notebook now starts from the precomputed per-market error data instead of the raw tools data:

+ error_by_markets = pd.read_parquet('../data/error_by_markets.parquet')

New cells follow:

+ error_by_markets.columns
  Index(['tool', 'request_month_year_week', 'market_creator', '0', '1',
         'error_perc', 'total_requests'],
        dtype='object')

+ error_by_markets.tail()
                 tool request_month_year_week market_creator       0     1  \
  355  superforcaster                  Dec-22            all  1087.0  61.0
  356  superforcaster                  Dec-22          pearl    75.0  10.0
  357  superforcaster                  Dec-29     quickstart   678.0  59.0
  358  superforcaster                  Dec-29            all   705.0  60.0
  359  superforcaster                  Dec-29          pearl    27.0   1.0

       error_perc  total_requests
  355    5.313589          1148.0
  356   11.764706            85.0
  357    8.005427           737.0
  358    7.843137           765.0
  359    3.571429            28.0

+ error_total = (
+     error_by_markets.groupby(["request_month_year_week", "market_creator"], sort=False)
+     .agg({"total_requests": "sum", '1': "sum", '0': "sum"})
+     .reset_index()
+ )

+ new_tools = pd.read_parquet('../data/new_tools.parquet')
+ len(new_tools)
  155789

+ new_tools.columns
  Index(['request_id', 'request_block', 'request_time', 'tx_hash',
         'prompt_request', 'tool', 'nonce', 'trader_address', 'deliver_block',
         'error', 'error_message', 'prompt_response', 'mech_address', 'p_yes',
         'p_no', 'confidence', 'info_utility', 'vote', 'win_probability',
         'market_creator'],
        dtype='object')

@@ -32,42 +250,713 @@ — the old info() output (a 286042-row, 23-column frame ending in request_month_year_week; dtypes: float64(5), int64(1), object(17); memory usage: 50.2+ MB) is replaced by new_tools.info():

  <class 'pandas.core.frame.DataFrame'>
  RangeIndex: 155789 entries, 0 to 155788
  Data columns (total 20 columns):
   #   Column           Non-Null Count   Dtype
  ---  ------           --------------   -----
   0   request_id       155789 non-null  object
   1   request_block    155789 non-null  object
   2   request_time     155789 non-null  datetime64[ns, UTC]
   3   tx_hash          155789 non-null  object
   4   prompt_request   155789 non-null  object
   5   tool             155789 non-null  object
   6   nonce            155789 non-null  object
   7   trader_address   155789 non-null  object
   8   deliver_block    155789 non-null  object
   9   error            155789 non-null  int64
   10  error_message    61690 non-null   object
   11  prompt_response  131002 non-null  object
   12  mech_address     131002 non-null  object
   13  p_yes            94099 non-null   float64
   14  p_no             94099 non-null   float64
   15  confidence       94099 non-null   float64
   16  info_utility     94099 non-null   float64
   17  vote             66870 non-null   object
   18  win_probability  94099 non-null   float64
   19  market_creator   155789 non-null  object
  dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(13)
  memory usage: 23.8+ MB

Further added cells compare the new frame with the legacy tools data:

+ new_tools.iloc[0]
  (a prediction-offline request from 2024-12-23 12:37:05+00:00, served by
  mech 0x5e1d1eb61e1164d5a50b28c575da73a29595dff7, market_creator quickstart)

+ new_tools.iloc[0].request_time
  Timestamp('2024-12-23 12:37:05+0000', tz='UTC')

+ tools = pd.read_parquet('../tmp/tools.parquet')
+ tools.iloc[0].request_time
  Timestamp('2024-10-26 13:03:55+0000', tz='UTC')
+ len(tools)
  626382

+ tools.head()
  (5 rows × 23 columns; claude-prediction-offline and superforcaster requests
  from late October 2024, quickstart markets, no tx_hash column)
+ tools.iloc[0]
  (a claude-prediction-online request for a pearl market,
  request_time 2024-10-22 00:56:35+00:00, from an earlier execution)

+ merge_df = pd.concat([tools, new_tools], ignore_index=True)

The next output shows the head of the merged frame (5 rows × 24 columns); rows that came from the legacy tools frame have tx_hash = NaN.
|
928 |
+
"output_type": "execute_result"
|
929 |
+
}
|
930 |
+
],
|
931 |
+
"source": [
|
932 |
+
"merge_df.head()"
|
933 |
+
]
|
934 |
+
},
|
935 |
+
{
|
936 |
+
"cell_type": "code",
|
937 |
+
"execution_count": 28,
|
938 |
+
"metadata": {},
|
939 |
+
"outputs": [],
|
940 |
+
"source": [
|
941 |
+
"merge_df.drop(columns=\"tx_hash\", inplace=True)"
|
942 |
+
]
|
943 |
+
},
|
944 |
+
{
|
945 |
+
"cell_type": "code",
|
946 |
+
"execution_count": 40,
|
947 |
+
"metadata": {},
|
948 |
+
"outputs": [],
|
949 |
+
"source": [
|
950 |
+
"merge_df.to_parquet(\"../tmp/tools.parquet\", index=False)"
|
951 |
+
]
|
952 |
+
},
|
953 |
+
{
|
954 |
+
"cell_type": "code",
|
955 |
+
"execution_count": null,
|
956 |
+
"metadata": {},
|
957 |
+
"outputs": [],
|
958 |
+
"source": []
|
959 |
+
},
|
960 |
{
|
961 |
"cell_type": "code",
|
962 |
"execution_count": 4,
|
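Note: the notebook's last step exports the merged frame to ../tmp/tools.parquet. A minimal read-back sketch to sanity-check the export (path as used in the notebook cell; whether tx_hash survived depends on cell execution order, since the drop ran as In [28] but the head() shown above is In [39]):

    import pandas as pd

    # Read the exported file back; the path is the notebook-relative one
    # used in the cell above.
    df = pd.read_parquet("../tmp/tools.parquet")
    print(df.shape)
    print(df.columns.tolist())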
scripts/cleaning_old_info.py
CHANGED
@@ -1,7 +1,5 @@
 import pandas as pd
-from profitability import summary_analyse
 from utils import DATA_DIR
-from staking import label_trades_by_staking
 
 
 def clean_old_data_from_parquet_files(cutoff_date: str):
@@ -47,21 +45,24 @@ def clean_old_data_from_parquet_files(cutoff_date: str):
     except Exception as e:
         print(f"Error cleaning all trades profitability file {e}")
 
-    # ...
+    # clean unknown_traders.parquet
     try:
-        label_trades_by_staking(trades_df=all_trades, update=False)
-        ...
-            DATA_DIR / "all_trades_profitability.parquet", index=False
-        )
-        ...
+        unknown_traders = pd.read_parquet(DATA_DIR / "unknown_traders.parquet")
+        unknown_traders["creation_timestamp"] = pd.to_datetime(
+            unknown_traders["creation_timestamp"], utc=True
+        )
+
+        print(f"length before filtering {len(unknown_traders)}")
+        unknown_traders = unknown_traders.loc[
+            unknown_traders["creation_timestamp"] > min_date_utc
+        ]
+        print(f"length after filtering {len(unknown_traders)}")
+        unknown_traders.to_parquet(DATA_DIR / "unknown_traders.parquet", index=False)
+
     except Exception as e:
-        print(f"Error ...
+        print(f"Error cleaning unknown_traders file {e}")
 
 
 if __name__ == "__main__":
-    clean_old_data_from_parquet_files("2024-...
+    clean_old_data_from_parquet_files("2024-10-25")
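Note: the new block filters on min_date_utc, which is defined in an unshown, earlier part of the function; presumably it is cutoff_date parsed as a UTC timestamp. A minimal sketch of that assumption:

    import pandas as pd

    # Hypothetical derivation of min_date_utc from cutoff_date; the real
    # definition lives in a part of the function not shown in this hunk.
    cutoff_date = "2024-10-25"
    min_date_utc = pd.Timestamp(cutoff_date, tz="UTC")
    print(min_date_utc)  # 2024-10-25 00:00:00+00:00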
scripts/daily_data.py
CHANGED
@@ -32,7 +32,7 @@ def prepare_live_metrics(
     )
 
     # staking label
-    label_trades_by_staking(all_trades_df)
+    all_trades_df = label_trades_by_staking(all_trades_df)
 
     # create the unknown traders dataset
     unknown_traders_df, all_trades_df = create_unknown_traders_df(
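Note: this change and the matching `return trades_df` in scripts/staking.py further down switch from mutate-in-place to reassigning the return value, which stays correct even if the labeling function ever rebinds or filters the frame internally. A toy sketch, with a hypothetical label() standing in for label_trades_by_staking, of why reassignment is the safer contract:

    import pandas as pd

    def label(df: pd.DataFrame) -> pd.DataFrame:
        # Rebinding (e.g. a .loc filter or .copy) creates a new object, so
        # callers relying on in-place mutation would silently keep the
        # unlabeled original.
        df = df.copy()
        df["staking"] = "non_staking"
        return df

    trades = pd.DataFrame({"trader_address": ["0xabc"]})
    trades = label(trades)  # reassign, as daily_data.py now does
    print(trades)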
scripts/get_mech_info.py
CHANGED
@@ -11,6 +11,7 @@ from utils import (
 import requests
 import pandas as pd
 import numpy as np
+from gnosis_timestamps import compute_request_time
 from mech_request_utils import (
     collect_all_mech_delivers,
     collect_all_mech_requests,
@@ -146,7 +147,7 @@ def update_fpmmTrades_parquet(trades_filename: str) -> pd.DataFrame:
 
     # Remove duplicates
     # fpmm.outcomes is a numpy array
-    merge_df.drop_duplicates("id", inplace=True)
+    merge_df.drop_duplicates("id", keep="last", inplace=True)
     print(f"Final length after removing duplicates in fpmmTrades= {len(merge_df)}")
 
     # save the parquet file
@@ -174,16 +175,15 @@ def update_all_trades_parquet(new_trades_df: pd.DataFrame) -> pd.DataFrame:
     return merge_df
 
 
-def update_tools_parquet(rpc: str, new_tools_filename: pd.DataFrame):
+def update_tools_parquet(new_tools_filename: pd.DataFrame):
     try:
-        old_tools_df = pd.read_parquet(...
+        old_tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
     except Exception as e:
         print(f"Error reading old tools parquet file {e}")
         return None
     try:
         new_tools_df = pd.read_parquet(DATA_DIR / new_tools_filename)
-
-        updating_timestamps(rpc, new_tools_filename)
+
     except Exception as e:
         print(f"Error reading new trades parquet file {e}")
         return None
@@ -201,7 +201,7 @@ def update_tools_parquet(rpc: str, new_tools_filename: pd.DataFrame):
     print(f"Final length after removing duplicates in tools= {len(merge_df)}")
 
     # save the parquet file
-    merge_df.to_parquet(...
+    merge_df.to_parquet(TMP_DIR / "tools.parquet", index=False)
 
 
 def get_mech_info_2024() -> dict[str, Any]:
@@ -298,6 +298,10 @@ def get_mech_events_since_last_run():
     try:
         all_trades = pd.read_parquet(DATA_DIR / "all_trades_profitability.parquet")
         latest_timestamp = max(all_trades.creation_timestamp)
+        # cutoff_date = "2024-12-01"
+        # latest_timestamp = pd.Timestamp(
+        #    datetime.strptime(cutoff_date, "%Y-%m-%d")
+        # ).tz_localize("UTC")
         print(f"Updating data since {latest_timestamp}")
     except Exception:
         print("Error while reading the profitability parquet file")
@@ -351,41 +355,7 @@ def get_mech_events_since_last_run():
     return latest_timestamp
 
 
-@measure_execution_time
-def get_mech_events_last_60_days():
-    earliest_block_number = get_last_60_days_block_number()
-    last_block_number = get_last_block_number()
-    # mech requests
-    requests_dict, duplicatedReqId, nr_errors = collect_all_mech_requests(
-        from_block=earliest_block_number,
-        to_block=last_block_number,
-        filename="mech_requests.json",
-    )
-
-    # mech delivers
-    delivers_dict, duplicatedIds, nr_errors = collect_all_mech_delivers(
-        from_block=earliest_block_number,
-        to_block=last_block_number,
-        filename="mech_delivers.json",
-    )
-
-    # clean delivers
-    clean_mech_delivers("mech_requests.json", "mech_delivers.json")
-
-    # solve duplicated requestIds
-    block_map = fix_duplicate_requestIds("mech_requests.json", "mech_delivers.json")
-
-    # merge the two files into one source
-    not_found = merge_requests_delivers(
-        "mech_requests.json", "mech_delivers.json", "merged_requests.json"
-    )
-
-    # Add ipfs contents
-    get_ipfs_data("merged_requests.json", "tools_info.json")
-
-
 if __name__ == "__main__":
-    ...
-    ...
+    get_mech_events_since_last_run()
     # result = get_mech_info_last_60_days()
     # print(result)
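Note: the keep="last" added to drop_duplicates matters when an old and a freshly fetched trade share an id. Assuming the new batch is concatenated after the existing rows, the newest version of each trade is the one retained. A small sketch of that behaviour:

    import pandas as pd

    # keep="last" retains the final occurrence of each id, so rows from the
    # newly fetched batch win over rows carried over from the old parquet.
    old_and_new = pd.DataFrame(
        {"id": ["a", "a", "b"], "source": ["old", "new", "old"]}
    )
    print(old_and_new.drop_duplicates("id", keep="last"))
    #   id source
    # 1  a    new
    # 2  b    old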
scripts/gnosis_timestamps.py
ADDED
@@ -0,0 +1,184 @@
from web3 import Web3
import os
import requests
import time
import pickle
from datetime import datetime, timezone
from functools import partial
import pandas as pd
import pytz
from tqdm import tqdm
from utils import DATA_DIR, TMP_DIR, measure_execution_time
from concurrent.futures import ThreadPoolExecutor

GNOSIS_API_INTERVAL = 0.2  # 5 calls in 1 second
GNOSIS_URL = "https://api.gnosisscan.io/api"
GNOSIS_API_KEY = os.environ.get("GNOSIS_API_KEY", None)
# https://api.gnosisscan.io/api?module=account&action=txlist&address=0x1fe2b09de07475b1027b0c73a5bf52693b31a52e&startblock=36626348&endblock=36626348&page=1&offset=10&sort=asc&apikey=${gnosis_api_key}""

# Connect to Gnosis Chain RPC
w3 = Web3(Web3.HTTPProvider("https://rpc.gnosischain.com"))


def parallelize_timestamp_computation(df: pd.DataFrame, function: callable) -> list:
    """Parallelize the timestamp conversion."""
    tx_hashes = df["tx_hash"].tolist()
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(function, tx_hashes), total=len(tx_hashes)))
    return results


def transform_timestamp_to_datetime(timestamp):
    dt = datetime.fromtimestamp(timestamp, timezone.utc)
    return dt


def get_tx_hash(trader_address, request_block):
    """Function to get the transaction hash from the address and block number"""
    params = {
        "module": "account",
        "action": "txlist",
        "address": trader_address,
        "page": 1,
        "offset": 100,
        "startblock": request_block,
        "endblock": request_block,
        "sort": "asc",
        "apikey": GNOSIS_API_KEY,
    }

    try:
        response = requests.get(GNOSIS_URL, params=params)
        tx_list = response.json()["result"]
        time.sleep(GNOSIS_API_INTERVAL)
        if len(tx_list) > 1:
            raise ValueError("More than one transaction found")
        return tx_list[0]["hash"]
    except Exception as e:
        return None


def add_tx_hash_info(filename: str = "tools.parquet"):
    """Function to add the hash info to the saved tools parquet file"""
    tools = pd.read_parquet(DATA_DIR / filename)
    tools["tx_hash"] = None
    total_errors = 0
    for i, mech_request in tqdm(
        tools.iterrows(), total=len(tools), desc="Adding tx hash"
    ):
        try:
            trader_address = mech_request["trader_address"]
            block_number = mech_request["request_block"]
            tools.at[i, "tx_hash"] = get_tx_hash(
                trader_address=trader_address, request_block=block_number
            )
        except Exception as e:
            print(f"Error with mech request {mech_request}")
            total_errors += 1
            continue

    print(f"Total number of errors = {total_errors}")
    tools.to_parquet(DATA_DIR / filename)


def get_transaction_timestamp(tx_hash: str, web3: Web3):

    try:
        # Get transaction data
        tx = web3.eth.get_transaction(tx_hash)
        # Get block data
        block = web3.eth.get_block(tx["blockNumber"])
        # Get timestamp
        timestamp = block["timestamp"]

        # Convert to datetime
        dt = datetime.fromtimestamp(timestamp, tz=pytz.UTC)

        # return {
        #     "timestamp": timestamp,
        #     "datetime": dt,
        #     "from_address": tx["from"],
        #     "to_address": tx["to"],
        #     "success": True,
        # }
        return dt.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        print(f"Error getting the timestamp from {tx_hash}")
        return None


@measure_execution_time
def compute_request_time(tools_df: pd.DataFrame) -> pd.DataFrame:
    """Function to compute the request timestamp from the tx hash"""
    # read the local info
    try:
        gnosis_info = pickle.load(open(TMP_DIR / "gnosis_info.pkl", "rb"))
    except Exception:
        print("File not found or not created. Creating a new one")
        gnosis_info = {}

    # any previous information?
    tools_df["request_time"] = tools_df["tx_hash"].map(gnosis_info)

    # Identify tools with missing request_time and fill them
    missing_time_indices = tools_df[tools_df["request_time"].isna()].index
    print(f"length of missing_time_indices = {len(missing_time_indices)}")
    # traverse all tx hashes and get the timestamp of each tx
    partial_mech_request_timestamp = partial(get_transaction_timestamp, web3=w3)
    missing_timestamps = parallelize_timestamp_computation(
        tools_df.loc[missing_time_indices], partial_mech_request_timestamp
    )

    # Update the original DataFrame with the missing timestamps
    for i, timestamp in zip(missing_time_indices, missing_timestamps):
        tools_df.at[i, "request_time"] = timestamp
    # creating other time fields
    tools_df["request_month_year"] = pd.to_datetime(
        tools_df["request_time"]
    ).dt.strftime("%Y-%m")
    tools_df["request_month_year_week"] = (
        pd.to_datetime(tools_df["request_time"]).dt.to_period("W").astype(str)
    )
    # Update t_map with new timestamps
    new_timestamps = (
        tools_df[["tx_hash", "request_time"]]
        .dropna()
        .set_index("tx_hash")
        .to_dict()["request_time"]
    )
    gnosis_info.update(new_timestamps)
    # saving gnosis info
    with open(TMP_DIR / "gnosis_info.pkl", "wb") as f:
        pickle.dump(gnosis_info, f)
    return tools_df


def get_account_details(address):
    # gnosis_url = GNOSIS_URL.substitute(gnosis_api_key=GNOSIS_API_KEY, tx_hash=tx_hash)

    params = {
        "module": "account",
        "action": "txlistinternal",
        "address": address,
        #'page': 1,
        #'offset': 100,
        #'startblock': 0,
        #'endblock': 9999999999,
        #'sort': 'asc',
        "apikey": GNOSIS_API_KEY,
    }

    try:
        response = requests.get(GNOSIS_URL, params=params)
        return response.json()
    except Exception as e:
        return {"error": str(e)}


if __name__ == "__main__":
    # tx_data = "0x783BFA045BDE2D0BCD65280D97A29E7BD9E4FDC10985848690C9797E767140F4"
    new_tools = pd.read_parquet(DATA_DIR / "new_tools.parquet")
    new_tools = compute_request_time(new_tools)
    new_tools.to_parquet(DATA_DIR / "new_tools.parquet")
    # result = get_tx_hash("0x1fe2b09de07475b1027b0c73a5bf52693b31a52e", 36626348)
    # print(result)
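Note: compute_request_time keeps a tx_hash to request_time cache in tmp/gnosis_info.pkl, so repeated runs only pay the web3 round-trip for hashes not seen before. A toy sketch of the lookup step (hypothetical hashes):

    import pandas as pd

    # Only rows whose tx_hash is absent from the cache need an RPC lookup.
    gnosis_info = {"0xabc": "2024-10-29 10:50:55"}  # as loaded from gnosis_info.pkl
    df = pd.DataFrame({"tx_hash": ["0xabc", "0xdef"]})
    df["request_time"] = df["tx_hash"].map(gnosis_info)
    missing = df[df["request_time"].isna()].index
    print(len(missing))  # 1 -> only "0xdef" goes to get_transaction_timestamp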
scripts/mech_request_utils.py
CHANGED
@@ -23,7 +23,6 @@ import time
 import pickle
 from random import uniform
 from typing import Any, Dict, Tuple
-from pathlib import Path
 import requests
 from gql import Client, gql
 from gql.transport.requests import RequestsHTTPTransport
@@ -379,21 +378,22 @@ def clean_mech_delivers(requests_filename: str, delivers_filename: str) -> None:
     mech_requests = json.load(file)
 
     list_reqIds = [mech_requests[k].get("requestId") for k in mech_requests.keys()]
-    # remove duplicated elements
-    list_reqIds = list(set(list_reqIds))
 
     # remove requestIds from delivers that are not in this list
     with open(JSON_DATA_DIR / delivers_filename, "r") as file:
         mech_delivers = json.load(file)
 
     print(f"original size of the file {len(mech_delivers)}")
-    ...
-    for r in to_delete:
-        mech_delivers.pop(r, None)
+    mech_delivers = {
+        k: v
+        for k, v in tqdm(
+            mech_delivers.items(),
+            total=len(mech_delivers),
+            desc="Filtering delivers dictionary",
+        )
+        if k in set(list_reqIds)
+    }
     print(f"final size of the file {len(mech_delivers)}")
     save_json_file(mech_delivers, delivers_filename)
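Note: one caveat with the new comprehension is that set(list_reqIds) sits inside the if clause, so the set is rebuilt on every iteration. Hoisting it out once gives the same result with O(1) membership tests; a sketch of that refactor (not the committed code):

    # Same filter with the id set built once; identical output, but
    # membership tests are O(1) instead of rebuilding the set per item.
    list_reqIds = ["1", "2"]           # stand-in for the ids read above
    mech_delivers = {"1": {}, "3": {}}
    valid_ids = set(list_reqIds)
    mech_delivers = {k: v for k, v in mech_delivers.items() if k in valid_ids}
    print(mech_delivers)  # {'1': {}}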
scripts/nr_mech_calls.py
CHANGED
@@ -1,5 +1,5 @@
 import pandas as pd
-from utils import DATA_DIR, DEFAULT_MECH_FEE
+from utils import DATA_DIR, DEFAULT_MECH_FEE, TMP_DIR
 from tqdm import tqdm
 from datetime import datetime, timezone
 from typing import Dict, Any
@@ -220,9 +220,9 @@ def compute_timestamp_mech_calls(
     ]
     # traverse market requests
     total_mech_calls = 0
-    for mech_request in market_requests:
+    for i, mech_request in market_requests.iterrows():
         # check timestamp (before the trade)
-        request_ts = mech_request
+        request_ts = mech_request["request_time"]
         if request_ts < trade_ts:
             # check the timestamp has not been used in a previous trade
             used_timestamps = request_timestamps_used[market]
@@ -246,7 +246,7 @@ def compute_mech_calls_based_on_timestamps(
     nr_traders = len(fpmmTrades["trader_address"].unique())
     fpmmTrades["creation_timestamp"] = pd.to_datetime(fpmmTrades["creationTimestamp"])
     fpmmTrades["creation_date"] = fpmmTrades["creation_timestamp"].dt.date
-
+    fpmmTrades = fpmmTrades.sort_values(by="creation_timestamp", ascending=True)
     tools["request_time"] = pd.to_datetime(tools["request_time"])
     tools["request_date"] = tools["request_time"].dt.date
     tools = tools.sort_values(by="request_time", ascending=True)
@@ -254,9 +254,9 @@ def compute_mech_calls_based_on_timestamps(
     for trader in tqdm(
         fpmmTrades["trader_address"].unique(),
         total=nr_traders,
-        desc="creating mech calls ...
+        desc="creating mech calls count based on timestamps",
     ):
-        # compute the mech calls
+        # compute the mech calls for each trader
        all_trades = fpmmTrades[fpmmTrades["trader_address"] == trader]
         all_tools = tools[tools["trader_address"] == trader]
         trader_mech_calls = compute_timestamp_mech_calls(all_trades, all_tools)
@@ -266,12 +266,10 @@ def compute_mech_calls_based_on_timestamps(
 
 if __name__ == "__main__":
     # update_trade_nr_mech_calls(non_agents=True)
-    ...
-    unknown_df.to_parquet(DATA_DIR / "unknown_traders.parquet", index=False)
-    trades_df.to_parquet(DATA_DIR / "all_trades_profitability.parquet", index=False)
+    tools = pd.read_parquet(TMP_DIR / "tools.parquet")
+    fpmmTrades = pd.read_parquet(TMP_DIR / "fpmmTrades.parquet")
+    fpmmTrades["creationTimestamp"] = fpmmTrades["creationTimestamp"].apply(
+        lambda x: transform_to_datetime(x)
+    )
+    result = compute_mech_calls_based_on_timestamps(fpmmTrades=fpmmTrades, tools=tools)
+    result.to_parquet(TMP_DIR / "result_df.parquet", index=False)
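Note: the rule implemented by compute_timestamp_mech_calls can be seen in miniature: a mech request counts toward a trade only if its timestamp precedes the trade's and it has not already been consumed by an earlier trade on the same market (hence the sorting of both frames). A toy illustration with hypothetical times:

    import pandas as pd

    # Requests at 10:00, 10:05, 10:20; trades at 10:10 and 10:30.
    requests = pd.to_datetime(pd.Series(["10:00", "10:05", "10:20"]))
    trade_a, trade_b = pd.to_datetime("10:10"), pd.to_datetime("10:30")
    used = set()
    for trade in (trade_a, trade_b):
        # count only earlier, not-yet-attributed requests
        calls = [t for t in requests if t < trade and t not in used]
        used.update(calls)
        print(trade.time(), len(calls))  # trade_a -> 2, trade_b -> 1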
scripts/profitability.py
CHANGED
@@ -18,7 +18,6 @@
 # ------------------------------------------------------------------------------
 
 import time
-import datetime
 import pandas as pd
 from typing import Any
 from enum import Enum
@@ -38,6 +37,7 @@ from utils import (
     JSON_DATA_DIR,
     DATA_DIR,
     DEFAULT_MECH_FEE,
+    TMP_DIR,
 )
 from staking import label_trades_by_staking
 from nr_mech_calls import (
@@ -122,25 +122,6 @@ ALL_TRADES_STATS_DF_COLS = [
     "roi",
 ]
 
-SUMMARY_STATS_DF_COLS = [
-    "trader_address",
-    "num_trades",
-    "num_winning_trades",
-    "num_redeemed",
-    "total_investment",
-    "total_trade_fees",
-    "num_mech_calls",
-    "total_mech_fees",
-    "total_earnings",
-    "total_redeemed_amount",
-    "total_net_earnings",
-    "total_net_earnings_wo_mech_fees",
-    "total_roi",
-    "total_roi_wo_mech_fees",
-    "mean_mech_calls_per_trade",
-    "mean_mech_fee_amount_per_trade",
-]
-
 
 def _is_redeemed(user_json: dict[str, Any], fpmmTrade: dict[str, Any]) -> bool:
     """Returns whether the user has redeemed the position."""
@@ -159,7 +140,6 @@ def _is_redeemed(user_json: dict[str, Any], fpmmTrade: dict[str, Any]) -> bool:
 
 
 def prepare_profitalibity_data(
-    rpc: str,
     tools_filename: str,
     trades_filename: str,
 ) -> pd.DataFrame:
@@ -167,6 +147,7 @@ def prepare_profitalibity_data(
 
     # Check if tools.parquet is in the same directory
     try:
+        # new tools parquet
         tools = pd.read_parquet(DATA_DIR / tools_filename)
 
         # make sure creator_address is in the columns
@@ -181,7 +162,7 @@ def prepare_profitalibity_data(
         tools.to_parquet(DATA_DIR / tools_filename)
         print(f"{tools_filename} loaded")
     except FileNotFoundError:
-        print("...
+        print(f"{tools_filename} not found.")
         return
 
     # Check if fpmmTrades.parquet is in the same directory
@@ -218,7 +199,6 @@ def determine_market_status(trade, current_answer):
 def analyse_trader(
     trader_address: str,
     fpmmTrades: pd.DataFrame,
-    tools: pd.DataFrame,
     trader_estimated_mech_calls: pd.DataFrame,
     daily_info: bool = False,
 ) -> pd.DataFrame:
@@ -294,7 +274,7 @@
         total_mech_calls = trader_estimated_mech_calls.loc[
             (trader_estimated_mech_calls["market"] == trade["title"])
             & (trader_estimated_mech_calls["trade_id"] == trade_id),
-            "...
+            "total_mech_calls",
         ].iloc[0]
 
         net_earnings = (
@@ -341,7 +321,6 @@
 
 def analyse_all_traders(
     trades: pd.DataFrame,
-    tools: pd.DataFrame,
     estimated_mech_calls: pd.DataFrame,
     daily_info: bool = False,
 ) -> pd.DataFrame:
@@ -357,9 +336,7 @@
             estimated_mech_calls["trader_address"] == trader
         ]
         all_traders.append(
-            analyse_trader(
-                trader, trades, tools, trader_estimated_mech_calls, daily_info
-            )
+            analyse_trader(trader, trades, trader_estimated_mech_calls, daily_info)
         )
 
     # concat all creators
@@ -368,54 +345,7 @@
     return all_creators_df
 
 
-def summary_analyse(df):
-    """Summarise profitability analysis."""
-    # Ensure DataFrame is not empty
-    if df.empty:
-        return pd.DataFrame(columns=SUMMARY_STATS_DF_COLS)
-
-    # Group by trader_address
-    grouped = df.groupby("trader_address")
-
-    # Create summary DataFrame
-    summary_df = grouped.agg(
-        num_trades=("trader_address", "size"),
-        num_winning_trades=("winning_trade", lambda x: float((x).sum())),
-        num_redeemed=("redeemed", lambda x: float(x.sum())),
-        total_investment=("collateral_amount", "sum"),
-        total_trade_fees=("trade_fee_amount", "sum"),
-        num_mech_calls=("num_mech_calls", "sum"),
-        total_mech_fees=("mech_fee_amount", "sum"),
-        total_earnings=("earnings", "sum"),
-        total_redeemed_amount=("redeemed_amount", "sum"),
-        total_net_earnings=("net_earnings", "sum"),
-    )
-
-    # Calculating additional columns
-    summary_df["total_roi"] = (
-        summary_df["total_net_earnings"] / summary_df["total_investment"]
-    )
-    summary_df["mean_mech_calls_per_trade"] = (
-        summary_df["num_mech_calls"] / summary_df["num_trades"]
-    )
-    summary_df["mean_mech_fee_amount_per_trade"] = (
-        summary_df["total_mech_fees"] / summary_df["num_trades"]
-    )
-    summary_df["total_net_earnings_wo_mech_fees"] = (
-        summary_df["total_net_earnings"] + summary_df["total_mech_fees"]
-    )
-    summary_df["total_roi_wo_mech_fees"] = (
-        summary_df["total_net_earnings_wo_mech_fees"] / summary_df["total_investment"]
-    )
-
-    # Resetting index to include trader_address
-    summary_df.reset_index(inplace=True)
-
-    return summary_df
-
-
 def run_profitability_analysis(
-    rpc: str,
     tools_filename: str,
     trades_filename: str,
     merge: bool = False,
@@ -424,10 +354,12 @@
 
     # load dfs from data folder for analysis
     print(f"Preparing data with {tools_filename} and {trades_filename}")
-    fpmmTrades = prepare_profitalibity_data(...
+    fpmmTrades = prepare_profitalibity_data(tools_filename, trades_filename)
+
     if merge:
-        update_tools_parquet(...
-        ...
+        update_tools_parquet(tools_filename)
+
+    tools = pd.read_parquet(TMP_DIR / "tools.parquet")
 
     fpmmTrades["creationTimestamp"] = fpmmTrades["creationTimestamp"].apply(
         lambda x: transform_to_datetime(x)
@@ -436,9 +368,10 @@
     trade_mech_calls = compute_mech_calls_based_on_timestamps(
         fpmmTrades=fpmmTrades, tools=tools
     )
+    trade_mech_calls.to_parquet(TMP_DIR / "trade_mech_calls.parquet")
     print(trade_mech_calls.total_mech_calls.describe())
     print("Analysing trades...")
-    all_trades_df = analyse_all_traders(fpmmTrades, ...
+    all_trades_df = analyse_all_traders(fpmmTrades, trade_mech_calls)
 
     # # merge previous files if requested
     if merge:
@@ -470,9 +403,10 @@
     all_trades_df = all_trades_df.loc[all_trades_df["is_invalid"] == False]
 
     # add staking labels
-    label_trades_by_staking(trades_df=all_trades_df)
+    all_trades_df = label_trades_by_staking(trades_df=all_trades_df)
 
     # create the unknown traders dataset
+    print("Creating unknown traders dataset")
     unknown_traders_df, all_trades_df = create_unknown_traders_df(
         trades_df=all_trades_df
     )
@@ -481,18 +415,18 @@
     # save to parquet
     all_trades_df.to_parquet(DATA_DIR / "all_trades_profitability.parquet", index=False)
 
-    # summarize profitability df
-    print("Summarising trades...")
-    summary_df = summary_analyse(all_trades_df)
-    summary_df.to_parquet(DATA_DIR / "summary_profitability.parquet", index=False)
-
     print("Done!")
 
     return all_trades_df
 
 
 if __name__ == "__main__":
-    ...
+    # updating the whole fpmmTrades parquet file instead of just the new ones
+    # trade_mech_calls = pd.read_parquet(TMP_DIR / "result_df.parquet")
+    # fpmmTrades = pd.read_parquet(TMP_DIR / "fpmmTrades.parquet")
+    # fpmmTrades["creationTimestamp"] = fpmmTrades["creationTimestamp"].apply(
+    #     lambda x: transform_to_datetime(x)
+    # )
+    # all_trades_df = analyse_all_traders(fpmmTrades, trade_mech_calls)
+    # all_trades_df.to_parquet(TMP_DIR / "all_trades_df.parquet", index=False)
+    run_profitability_analysis("file1", "file2")
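Note: with the rpc parameter gone, the entry point takes only the two filenames plus the merge flag. The weekly invocation as wired in scripts/pull_data.py (import path as used by the now-deleted scripts/roi_analysis.py):

    from profitability import run_profitability_analysis

    # New-data parquet files are merged into the global ones and analysed
    # in a single pass.
    all_trades_df = run_profitability_analysis(
        tools_filename="new_tools.parquet",
        trades_filename="new_fpmmTrades.parquet",
        merge=True,
    )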
scripts/pull_data.py
CHANGED
@@ -11,6 +11,7 @@ from utils import (
     measure_execution_time,
     DATA_DIR,
     HIST_DIR,
+    TMP_DIR,
 )
 from get_mech_info import (
     get_mech_events_since_last_run,
@@ -21,6 +22,7 @@ from cleaning_old_info import clean_old_data_from_parquet_files
 from web3_utils import updating_timestamps
 from manage_space_files import move_files
 from cloud_storage import load_historical_file
+from tools_metrics import compute_tools_based_datasets
 
 
 logging.basicConfig(level=logging.INFO)
@@ -52,7 +54,7 @@ def save_historical_data():
     timestamp = current_datetime.strftime("%Y%m%d_%H%M%S")
 
     try:
-        tools = pd.read_parquet(...
+        tools = pd.read_parquet(TMP_DIR / "tools.parquet")
         filename = f"tools_{timestamp}.parquet"
         tools.to_parquet(HIST_DIR / filename, index=False)
         # save into cloud storage
@@ -79,7 +81,7 @@ def only_new_weekly_analysis():
     rpc = RPC
     # Run markets ETL
     logging.info("Running markets ETL")
-    mkt_etl(MARKETS_FILENAME)
+    # mkt_etl(MARKETS_FILENAME)
     logging.info("Markets ETL completed")
 
     # Mech events ETL
@@ -108,7 +110,6 @@
     # # Run profitability analysis
     logging.info("Running profitability analysis")
     run_profitability_analysis(
-        rpc=rpc,
         tools_filename="new_tools.parquet",
        trades_filename="new_fpmmTrades.parquet",
         merge=True,
@@ -119,19 +120,13 @@
     # merge new json files with old json files
     update_json_files()
 
-    try:
-        updating_timestamps(rpc, TOOLS_FILENAME)
-    except Exception as e:
-        logging.error("Error while updating timestamps of tools")
-        print(e)
-
     save_historical_data()
 
-    clean_old_data_from_parquet_files("2024-10-...
+    clean_old_data_from_parquet_files("2024-10-25")
 
     compute_tools_accuracy()
-
-    # move to tmp folder the new generated files
+    compute_tools_based_datasets()
+    # # move to tmp folder the new generated files
     move_files()
     logging.info("Weekly analysis files generated and saved")
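Note: save_historical_data() now snapshots tmp/tools.parquet into HIST_DIR under a timestamped name. A minimal sketch of the naming scheme it uses (fixed datetime for illustration):

    from datetime import datetime

    # How each weekly snapshot of tmp/tools.parquet is named.
    timestamp = datetime(2024, 10, 25, 12, 30, 0).strftime("%Y%m%d_%H%M%S")
    print(f"tools_{timestamp}.parquet")  # tools_20241025_123000.parquet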
scripts/roi_analysis.py
DELETED
@@ -1,129 +0,0 @@
import logging
import os
import pickle
from web3 import Web3
import pandas as pd
from functools import partial
from datetime import datetime
from markets import (
    etl as mkt_etl,
    DEFAULT_FILENAME as MARKETS_FILENAME,
)

TOOLS_FILENAME = "tools_2024.parquet"
from tools import (
    etl as tools_etl,
)
from pull_data import (
    DATA_DIR,
    parallelize_timestamp_conversion,
    block_number_to_timestamp,
)
from profitability import run_profitability_analysis
from get_mech_info import get_mech_info_2024
from utils import get_question, current_answer
import gc

logging.basicConfig(level=logging.INFO)


def roi_analysis():
    """Run ROI analysis for the trades done in 2024."""
    rpc = "https://lb.nodies.app/v1/406d8dcc043f4cb3959ed7d6673d311a"
    web3 = Web3(Web3.HTTPProvider(rpc))

    # Run markets ETL
    logging.info("Running markets ETL")
    mkt_etl(MARKETS_FILENAME)
    logging.info("Markets ETL completed")

    # Run tools ETL
    logging.info("Running tools ETL")

    # This etl is saving already the tools parquet file
    tools_etl(
        rpcs=[rpc],
        mech_info=get_mech_info_2024(),
        filename=TOOLS_FILENAME,
    )
    logging.info("Tools ETL completed")

    # Run profitability analysis
    if os.path.exists(DATA_DIR / "fpmmTrades.parquet"):
        os.remove(DATA_DIR / "fpmmTrades.parquet")
    logging.info("Running profitability analysis")
    date = "2024-01-01"
    datetime_jan_2024 = datetime.strptime(date, "%Y-%m-%d")
    timestamp_jan_2024 = int(datetime_jan_2024.timestamp())
    run_profitability_analysis(
        rpc=rpc,
        tools_filename=TOOLS_FILENAME,
        trades_filename="fpmmTrades.parquet",
        from_timestamp=timestamp_jan_2024,
    )
    logging.info("Profitability analysis completed")

    # Get currentAnswer from FPMMS
    fpmms = pd.read_parquet(DATA_DIR / MARKETS_FILENAME)
    tools = pd.read_parquet(DATA_DIR / TOOLS_FILENAME)

    # Get the question from the tools
    logging.info("Getting the question and current answer for the tools")
    tools["title"] = tools["prompt_request"].apply(lambda x: get_question(x))
    tools["currentAnswer"] = tools["title"].apply(lambda x: current_answer(x, fpmms))

    tools["currentAnswer"] = tools["currentAnswer"].str.replace("yes", "Yes")
    tools["currentAnswer"] = tools["currentAnswer"].str.replace("no", "No")

    # Convert block number to timestamp
    logging.info("Converting block number to timestamp")
    t_map = pickle.load(open(DATA_DIR / "t_map.pkl", "rb"))
    tools["request_time"] = tools["request_block"].map(t_map)

    # Identify tools with missing request_time and fill them
    missing_time_indices = tools[tools["request_time"].isna()].index
    if not missing_time_indices.empty:
        partial_block_number_to_timestamp = partial(
            block_number_to_timestamp, web3=web3
        )
        missing_timestamps = parallelize_timestamp_conversion(
            tools.loc[missing_time_indices], partial_block_number_to_timestamp
        )

        # Update the original DataFrame with the missing timestamps
        for i, timestamp in zip(missing_time_indices, missing_timestamps):
            tools.at[i, "request_time"] = timestamp

    tools["request_month_year"] = pd.to_datetime(tools["request_time"]).dt.strftime(
        "%Y-%m"
    )
    tools["request_month_year_week"] = (
        pd.to_datetime(tools["request_time"]).dt.to_period("W").astype(str)
    )

    # Save the tools data after the updates on the content
    tools.to_parquet(DATA_DIR / TOOLS_FILENAME, index=False)

    # Update t_map with new timestamps
    new_timestamps = (
        tools[["request_block", "request_time"]]
        .dropna()
        .set_index("request_block")
        .to_dict()["request_time"]
    )
    t_map.update(new_timestamps)

    with open(DATA_DIR / "t_map_2024.pkl", "wb") as f:
        pickle.dump(t_map, f)

    # clean and release all memory
    del tools
    del fpmms
    del t_map
    gc.collect()

    logging.info("ROI analysis files generated and saved")


if __name__ == "__main__":
    roi_analysis()
scripts/staking.py
CHANGED
@@ -1,7 +1,7 @@
 import json
 import sys
 from typing import Any, List
-from utils import RPC, DATA_DIR
+from utils import RPC, DATA_DIR, TMP_DIR
 import requests
 from tqdm import tqdm
 from web3 import Web3
@@ -194,15 +194,14 @@ def label_trades_by_staking(trades_df: pd.DataFrame, start: int = None) -> None:
             staking_label
         )
         # tqdm.write(f"statking label {staking_label}")
-    return
+    return trades_df
 
 
 if __name__ == "__main__":
     # create_service_map()
-    trades_df = pd.read_parquet(...
-    ...
-    label_trades_by_staking(trades_df=trades_df, start=8)
-    print("after labeling")
+    trades_df = pd.read_parquet(TMP_DIR / "all_trades_df.parquet")
+    trades_df = trades_df.loc[trades_df["is_invalid"] == False]
+
+    trades_df = label_trades_by_staking(trades_df=trades_df, start=8)
     print(trades_df.staking.value_counts())
-    trades_df.to_parquet(...
+    trades_df.to_parquet(TMP_DIR / "result_staking.parquet", index=False)
scripts/tools.py
CHANGED
@@ -17,10 +17,7 @@
|
|
17 |
#
|
18 |
# ------------------------------------------------------------------------------
|
19 |
|
20 |
-
import os.path
|
21 |
import json
|
22 |
-
import time
|
23 |
-
import random
|
24 |
from typing import (
|
25 |
Optional,
|
26 |
List,
|
@@ -30,43 +27,21 @@ from typing import (
|
|
30 |
)
|
31 |
import pandas as pd
|
32 |
import requests
|
33 |
-
from
|
34 |
-
from eth_utils import to_checksum_address
|
35 |
from requests.adapters import HTTPAdapter
|
36 |
-
from requests.exceptions import (
|
37 |
-
ReadTimeout as RequestsReadTimeoutError,
|
38 |
-
HTTPError as RequestsHTTPError,
|
39 |
-
)
|
40 |
from tqdm import tqdm
|
41 |
from urllib3 import Retry
|
42 |
-
from urllib3.exceptions import (
|
43 |
-
ReadTimeoutError as Urllib3ReadTimeoutError,
|
44 |
-
HTTPError as Urllib3HTTPError,
|
45 |
-
)
|
46 |
-
from web3 import Web3, HTTPProvider
|
47 |
from markets import add_market_creator
|
48 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
49 |
from web3_utils import (
|
50 |
-
read_abi,
|
51 |
-
SLEEP,
|
52 |
-
reduce_window,
|
53 |
-
LATEST_BLOCK,
|
54 |
-
LATEST_BLOCK_NAME,
|
55 |
-
BLOCK_DATA_NUMBER,
|
56 |
-
BLOCKS_CHUNK_SIZE,
|
57 |
-
N_RPC_RETRIES,
|
58 |
N_IPFS_RETRIES,
|
59 |
-
RPC_POLL_INTERVAL,
|
60 |
-
IPFS_POLL_INTERVAL,
|
61 |
)
|
62 |
from utils import (
|
63 |
clean,
|
64 |
BLOCK_FIELD,
|
65 |
-
gen_event_filename,
|
66 |
limit_text,
|
67 |
DATA_DIR,
|
68 |
JSON_DATA_DIR,
|
69 |
-
REQUEST_ID_FIELD,
|
70 |
MechEvent,
|
71 |
MechEventName,
|
72 |
MechRequest,
|
@@ -75,7 +50,6 @@ from utils import (
|
|
75 |
REQUEST_ID,
|
76 |
HTTP,
|
77 |
HTTPS,
|
78 |
-
REQUEST_SENDER,
|
79 |
get_result_values,
|
80 |
get_vote,
|
81 |
get_win_probability,
|
@@ -97,7 +71,6 @@ IPFS_LINKS_SERIES_NAME = "ipfs_links"
|
|
97 |
BACKOFF_FACTOR = 1
|
98 |
STATUS_FORCELIST = [404, 500, 502, 503, 504]
|
99 |
DEFAULT_FILENAME = "tools.parquet"
|
100 |
-
RE_RPC_FILTER_ERROR = r"Filter with id: '\d+' does not exist."
|
101 |
ABI_ERROR = "The event signature did not match the provided ABI"
|
102 |
HTTP_TIMEOUT = 10
|
103 |
|
@@ -121,127 +94,6 @@ NUM_WORKERS = 10
|
|
121 |
GET_CONTENTS_BATCH_SIZE = 1000
|
122 |
|
123 |
|
124 |
-
def get_events(
|
125 |
-
w3: Web3,
|
126 |
-
event: str,
|
127 |
-
mech_address: ChecksumAddress,
|
128 |
-
mech_abi_path: str,
|
129 |
-
earliest_block: int,
|
130 |
-
latest_block: int,
|
131 |
-
) -> List:
|
132 |
-
"""Get the delivered events."""
|
133 |
-
abi = read_abi(mech_abi_path)
|
134 |
-
contract_instance = w3.eth.contract(address=mech_address, abi=abi)
|
135 |
-
|
136 |
-
events = []
|
137 |
-
from_block = earliest_block
|
138 |
-
batch_size = BLOCKS_CHUNK_SIZE
|
139 |
-
with tqdm(
|
140 |
-
total=latest_block - from_block,
|
141 |
-
desc=f"Searching {event} events for mech {mech_address}",
|
142 |
-
unit="blocks",
|
143 |
-
) as pbar:
|
144 |
-
while from_block < latest_block:
|
145 |
-
events_filter = contract_instance.events[event].build_filter()
|
146 |
-
events_filter.fromBlock = from_block
|
147 |
-
events_filter.toBlock = min(from_block + batch_size, latest_block)
|
148 |
-
|
149 |
-
entries = None
|
150 |
-
retries = 0
|
151 |
-
while entries is None:
|
152 |
-
try:
|
153 |
-
entries = events_filter.deploy(w3).get_all_entries()
|
154 |
-
retries = 0
|
155 |
-
except (RequestsHTTPError, Urllib3HTTPError) as exc:
|
156 |
-
if "Request Entity Too Large" in exc.args[0]:
|
157 |
-
events_filter, batch_size = reduce_window(
|
158 |
-
contract_instance,
|
159 |
-
event,
|
160 |
-
from_block,
|
161 |
-
batch_size,
|
162 |
-
latest_block,
|
163 |
-
)
|
164 |
-
except (Urllib3ReadTimeoutError, RequestsReadTimeoutError):
|
165 |
-
events_filter, batch_size = reduce_window(
|
166 |
-
contract_instance, event, from_block, batch_size, latest_block
|
167 |
-
)
|
168 |
-
except Exception as exc:
|
169 |
-
retries += 1
|
170 |
-
if retries == N_RPC_RETRIES:
|
171 |
-
tqdm.write(
|
172 |
-
f"Skipping events for blocks {events_filter.fromBlock} - {events_filter.toBlock} "
|
173 |
-
f"as the retries have been exceeded."
|
174 |
-
)
|
175 |
-
break
|
176 |
-
sleep = SLEEP * retries
|
177 |
-
# error_message = ""
|
178 |
-
# if isinstance(exc.args[0], str):
|
179 |
-
# error_message = exc.args[0]
|
180 |
-
# elif isinstance(exc, ValueError):
|
181 |
-
# error_message = exc.args[0].get("message", "")
|
182 |
-
# if (
|
183 |
-
# (
|
184 |
-
# isinstance(exc, ValueError)
|
185 |
-
# and re.match(RE_RPC_FILTER_ERROR, error_message) is None
|
186 |
-
# )
|
187 |
-
# and not isinstance(exc, ValueError)
|
188 |
-
# and not isinstance(exc, MismatchedABI)
|
189 |
-
# ):
|
190 |
-
|
191 |
-
tqdm.write(
|
192 |
-
f"An error was raised from the RPC: {exc}\n Retrying in {sleep} seconds."
|
193 |
-
)
|
194 |
-
if hasattr(exc, "message"):
|
195 |
-
tqdm.write(f"Error message: {exc.message}\n")
|
196 |
-
time.sleep(sleep)
|
197 |
-
|
198 |
-
from_block += batch_size
|
199 |
-
pbar.update(batch_size)
|
200 |
-
|
201 |
-
if entries is None:
|
202 |
-
continue
|
203 |
-
|
204 |
-
chunk = list(entries)
|
205 |
-
events.extend(chunk)
|
206 |
-
time.sleep(RPC_POLL_INTERVAL)
|
207 |
-
|
208 |
-
return events
|
209 |
-
|
210 |
-
|
211 |
-
def parse_events(raw_events: List) -> List[MechEvent]:
|
212 |
-
# TODO use dictionary instead of List
|
213 |
-
"""Parse all the specified MechEvents."""
|
214 |
-
parsed_events = []
|
215 |
-
for event in raw_events:
|
216 |
-
for_block = event.get("blockNumber", 0)
|
217 |
-
args = event.get(EVENT_ARGUMENTS, {})
|
218 |
-
request_id = args.get(REQUEST_ID, 0)
|
219 |
-
data = args.get(DATA, b"")
|
220 |
-
sender = args.get(REQUEST_SENDER, "")
|
221 |
-
parsed_event = MechEvent(for_block, request_id, data, sender)
|
222 |
-
parsed_events.append(parsed_event)
|
223 |
-
|
224 |
-
return parsed_events
|
225 |
-
|
226 |
-
|
227 |
-
def parse_dict_events(events_dict: dict) -> List[MechEvent]:
|
228 |
-
# TODO use dictionary instead of List
|
229 |
-
"""Parse all the specified MechEvents."""
|
230 |
-
parsed_events = []
|
231 |
-
list_ids = list(events_dict.keys())
|
232 |
-
for mech_id in list_ids:
|
233 |
-
event = events_dict[mech_id]
|
234 |
-
for_block = event.get("blockNumber", 0)
|
235 |
-
args = event.get(EVENT_ARGUMENTS, {})
|
236 |
-
request_id = args.get(REQUEST_ID, 0)
|
237 |
-
data = args.get(DATA, b"")
|
238 |
-
sender = args.get(REQUEST_SENDER, "")
|
239 |
-
parsed_event = MechEvent(for_block, request_id, data, sender)
|
240 |
-
parsed_events.append(parsed_event)
|
241 |
-
|
242 |
-
return parsed_events
|
243 |
-
|
244 |
-
|
245 |
def create_session() -> requests.Session:
|
246 |
"""Create a session with a retry strategy."""
|
247 |
session = requests.Session()
|
@@ -322,31 +174,6 @@ def parse_ipfs_tools_content(
     return mech_response
 
 
-def get_contents(
-    session: requests.Session, events: List[MechEvent], event_name: MechEventName
-) -> pd.DataFrame:
-    """Fetch the tools' responses."""
-    contents = []
-    for event in tqdm(events, desc=f"Tools' results", unit="results"):
-        url = event.ipfs_link(event_name)
-        response = request(session, url)
-        if response is None:
-            tqdm.write(f"Skipping {event=}.")
-            continue
-
-        raw_content = parse_ipfs_response(session, url, event, event_name, response)
-        if raw_content is None:
-            continue
-
-        mech_response = parse_ipfs_tools_content(raw_content, event, event_name)
-        if mech_response is None:
-            continue
-        contents.append(mech_response)
-        time.sleep(IPFS_POLL_INTERVAL)
-
-    return pd.DataFrame(contents)
-
-
 def parse_json_events(json_events: dict, keys_to_traverse: List[int]) -> pd.DataFrame:
     """Function to parse the mech info in a json format"""
     all_records = []
@@ -356,6 +183,10 @@ def parse_json_events(json_events: dict, keys_to_traverse: List[int]) -> pd.Data
         output = {}
         output["request_id"] = json_input["requestId"]
         output["request_block"] = json_input["blockNumber"]
+        output["request_time"] = transform_timestamp_to_datetime(
+            int(json_input["blockTimestamp"])
+        )
+        output["tx_hash"] = json_input["transactionHash"]
         output["prompt_request"] = json_input["ipfsContents"]["prompt"]
         output["tool"] = json_input["ipfsContents"]["tool"]
         output["nonce"] = json_input["ipfsContents"]["nonce"]
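These added lines are the heart of the "timestamps based" computation in this commit: the block timestamp now travels with every parsed request as `request_time`, so downstream scripts no longer need tools.parquet to resolve times. `transform_timestamp_to_datetime` lives in the newly added `gnosis_timestamps` script, whose body is not shown in this section; a plausible minimal form (an assumption) is:

```python
from datetime import datetime, timezone


def transform_timestamp_to_datetime(timestamp: int) -> datetime:
    # Assumed behavior: a Gnosis block timestamp is unix seconds; convert to UTC.
    return datetime.fromtimestamp(timestamp, tz=timezone.utc)
```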
@@ -424,144 +255,6 @@ def transform_deliver(contents: pd.DataFrame) -> pd.DataFrame:
     return clean(contents)
 
 
-def store_progress(
-    filename: str,
-    event_to_contents: Dict[str, pd.DataFrame],
-    tools: pd.DataFrame,
-) -> None:
-    """Store the given progress."""
-    print("storing given progress")
-    if filename:
-        DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists
-        for event_name, content in event_to_contents.items():
-            event_filename = gen_event_filename(
-                event_name
-            )  # Ensure this function returns a valid filename string
-            try:
-                if "result" in content.columns:
-                    content = content.drop(
-                        columns=["result"]
-                    )  # Avoid in-place modification
-                content.to_parquet(DATA_DIR / event_filename, index=False)
-            except Exception as e:
-                print(f"Failed to write {event_name} data: {e}")
-        # Drop result columns for tools DataFrame
-        try:
-            if "result" in tools.columns:
-                tools = tools.drop(columns=["result"])
-            tools.to_parquet(DATA_DIR / filename, index=False)
-        except Exception as e:
-            print(f"Failed to write tools data: {e}")
-
-
-def etl(
-    rpcs: List[str],
-    mech_info: dict[str, Any],
-    filename: Optional[str] = None,
-) -> pd.DataFrame:
-    """Fetch from on-chain events, process, store and return the tools' results on
-    all the questions as a Dataframe."""
-    w3s = [Web3(HTTPProvider(r)) for r in rpcs]
-    session = create_session()
-    event_to_transformer = {
-        MechEventName.REQUEST: transform_request,
-        MechEventName.DELIVER: transform_deliver,
-    }
-
-    mech_to_info = {
-        to_checksum_address(address): (
-            os.path.join(CONTRACTS_PATH, filename),
-            earliest_block,
-        )
-        for address, (filename, earliest_block) in mech_info.items()
-    }
-
-    event_to_contents = {}
-
-    latest_block = LATEST_BLOCK
-    if latest_block is None:
-        latest_block = w3s[0].eth.get_block(LATEST_BLOCK_NAME)[BLOCK_DATA_NUMBER]
-
-    next_start_block = None
-
-    # Loop through events in event_to_transformer
-    for event_name, transformer in event_to_transformer.items():
-        # if next_start_block is None:
-        #     next_start_block_base = get_earliest_block(event_name)
-
-        # Loop through mech addresses in mech_to_info
-        events = []
-        for address, (abi, earliest_block) in mech_to_info.items():
-            next_start_block = earliest_block
-            print(
-                f"Searching for {event_name.value} events for mech {address} from block {next_start_block} to {latest_block}."
-            )
-
-            # parallelize the fetching of events
-            with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
-                futures = []
-                for i in range(
-                    next_start_block, latest_block, BLOCKS_CHUNK_SIZE * SNAPSHOT_RATE
-                ):
-                    futures.append(
-                        executor.submit(
-                            get_events,
-                            random.choice(w3s),
-                            event_name.value,
-                            address,
-                            abi,
-                            i,
-                            min(i + BLOCKS_CHUNK_SIZE * SNAPSHOT_RATE, latest_block),
-                        )
-                    )
-
-                for future in tqdm(
-                    as_completed(futures),
-                    total=len(futures),
-                    desc=f"Fetching {event_name.value} Events",
-                ):
-                    current_mech_events = future.result()
-                    events.extend(current_mech_events)
-
-        print("Parsing events")
-        parsed = parse_events(events)
-
-        contents = []
-        with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
-            futures = []
-            for i in range(0, len(parsed), GET_CONTENTS_BATCH_SIZE):
-                futures.append(
-                    executor.submit(
-                        get_contents,
-                        session,
-                        parsed[i : i + GET_CONTENTS_BATCH_SIZE],
-                        event_name,
-                    )
-                )
-
-            for future in tqdm(
-                as_completed(futures),
-                total=len(futures),
-                desc=f"Fetching {event_name.value} Contents",
-            ):
-                current_mech_contents = future.result()
-                contents.append(current_mech_contents)
-
-        contents = pd.concat(contents, ignore_index=True)
-
-        transformed = transformer(contents)
-
-        event_to_contents[event_name] = transformed.copy()
-
-    # Store progress
-    tools = pd.merge(*event_to_contents.values(), on=REQUEST_ID_FIELD)
-    print(tools.info())
-
-    store_progress(filename, event_to_contents, tools)
-
-    return tools
-
-
 def parse_store_json_events_parallel(json_events: Dict[str, Any], output_filename: str):
     total_nr_events = len(json_events)
     ids_to_traverse = list(json_events.keys())
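The deleted `etl` pipeline above pulled Request/Deliver events straight from RPC nodes; `parse_store_json_events_parallel` keeps only the second half of that job, walking an already-fetched JSON dump of mech events. Both sides share the same executor-batching idiom, shown here as a self-contained sketch (names are illustrative, not from the repo):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, List


def run_in_batches(
    items: List, worker: Callable, batch_size: int = 1000, max_workers: int = 4
) -> List:
    """Submit slices of `items` to a thread pool and gather results as they finish."""
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(worker, items[i : i + batch_size])
            for i in range(0, len(items), batch_size)
        ]
        for future in as_completed(futures):
            results.extend(future.result())
    return results
```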
@@ -614,9 +307,5 @@ def generate_tools_file(input_filename: str, output_filename: str):
 
 
 if __name__ == "__main__":
-    RPCs = [
-        "https://lb.nodies.app/v1/406d8dcc043f4cb3959ed7d6673d311a",
-    ]
-    filename = DEFAULT_FILENAME
 
-
+    generate_tools_file()
 17 | #
 18 | # ------------------------------------------------------------------------------
 19 |
 20 | import json
 21 | from typing import (
 22 |     Optional,
 23 |     List,
 27 | )
 28 | import pandas as pd
 29 | import requests
 30 | +from gnosis_timestamps import transform_timestamp_to_datetime
 31 | from requests.adapters import HTTPAdapter
 32 | from tqdm import tqdm
 33 | from urllib3 import Retry
 34 | from markets import add_market_creator
 35 | from concurrent.futures import ThreadPoolExecutor, as_completed
 36 | from web3_utils import (
 37 |     N_IPFS_RETRIES,
 38 | )
 39 | from utils import (
 40 |     clean,
 41 |     BLOCK_FIELD,
 42 |     limit_text,
 43 |     DATA_DIR,
 44 |     JSON_DATA_DIR,
 45 |     MechEvent,
 46 |     MechEventName,
 47 |     MechRequest,
 50 |     REQUEST_ID,
 51 |     HTTP,
 52 |     HTTPS,
 53 |     get_result_values,
 54 |     get_vote,
 55 |     get_win_probability,
 71 | BACKOFF_FACTOR = 1
 72 | STATUS_FORCELIST = [404, 500, 502, 503, 504]
 73 | DEFAULT_FILENAME = "tools.parquet"
 74 | ABI_ERROR = "The event signature did not match the provided ABI"
 75 | HTTP_TIMEOUT = 10
 76 |
 94 | GET_CONTENTS_BATCH_SIZE = 1000
 95 |
 96 |
scripts/tools_metrics.py
ADDED
@@ -0,0 +1,93 @@
+import pandas as pd
+from typing import List
+from utils import TMP_DIR, INC_TOOLS, DATA_DIR
+
+
+def get_error_data_by_market(
+    tools_df: pd.DataFrame, inc_tools: List[str]
+) -> pd.DataFrame:
+    """Gets the error data for the given tools and calculates the error percentage."""
+    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
+    error = (
+        tools_inc.groupby(
+            ["tool", "request_month_year_week", "market_creator", "error"], sort=False
+        )
+        .size()
+        .unstack()
+        .fillna(0)
+        .reset_index()
+    )
+    error["error_perc"] = (error[1] / (error[0] + error[1])) * 100
+    error["total_requests"] = error[0] + error[1]
+    return error
+
+
+def get_tool_winning_rate_by_market(
+    tools_df: pd.DataFrame, inc_tools: List[str]
+) -> pd.DataFrame:
+    """Gets the tool winning rate data for the given tools by market and calculates the winning percentage."""
+    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
+    tools_non_error = tools_inc[tools_inc["error"] != 1]
+    tools_non_error.loc[:, "currentAnswer"] = tools_non_error["currentAnswer"].replace(
+        {"no": "No", "yes": "Yes"}
+    )
+    tools_non_error = tools_non_error[
+        tools_non_error["currentAnswer"].isin(["Yes", "No"])
+    ]
+    tools_non_error = tools_non_error[tools_non_error["vote"].isin(["Yes", "No"])]
+    tools_non_error["win"] = (
+        tools_non_error["currentAnswer"] == tools_non_error["vote"]
+    ).astype(int)
+    tools_non_error.columns = tools_non_error.columns.astype(str)
+    wins = (
+        tools_non_error.groupby(
+            ["tool", "request_month_year_week", "market_creator", "win"], sort=False
+        )
+        .size()
+        .unstack()
+        .fillna(0)
+    )
+    wins["win_perc"] = (wins[1] / (wins[0] + wins[1])) * 100
+    wins.reset_index(inplace=True)
+    wins["total_request"] = wins[0] + wins[1]
+    wins.columns = wins.columns.astype(str)
+    # Convert request_month_year_week to string and explicitly set type for Altair
+    # wins["request_month_year_week"] = wins["request_month_year_week"].astype(str)
+    return wins
+
+
+def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame:
+    tools["request_time"] = pd.to_datetime(tools["request_time"])
+    tools = tools.sort_values(by="request_time", ascending=True)
+
+    tools["request_month_year_week"] = (
+        pd.to_datetime(tools["request_time"]).dt.to_period("W").dt.strftime("%b-%d")
+    )
+    # preparing the tools graph
+    # adding the total
+    tools_all = tools.copy(deep=True)
+    tools_all["market_creator"] = "all"
+    # merging both dataframes
+    tools = pd.concat([tools, tools_all], ignore_index=True)
+    tools = tools.sort_values(by="request_time", ascending=True)
+    return tools
+
+
+def compute_tools_based_datasets():
+    try:
+        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
+        tools_df = prepare_tools(tools_df)
+    except Exception as e:
+        print(f"Error reading old tools parquet file {e}")
+        return None
+    # error by markets
+    error_by_markets = get_error_data_by_market(tools_df=tools_df, inc_tools=INC_TOOLS)
+    error_by_markets.to_parquet(DATA_DIR / "error_by_markets.parquet", index=False)
+    try:
+        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
+        tools_df = prepare_tools(tools_df)
+    except Exception as e:
+        print(f"Error reading old tools parquet file {e}")
+        return None
+    winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
+    winning_df.to_parquet(DATA_DIR / "winning_df.parquet", index=False)
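Because `error` is a 0/1 flag, the `groupby(...).size().unstack()` step pivots counts of successful and failing requests into columns labelled 0 and 1, from which the percentage follows directly. A toy run on made-up data (tool name and values are hypothetical):

```python
import pandas as pd
from tools_metrics import get_error_data_by_market

# Hypothetical sample: four requests to one tool in one week, one of them errored.
tools_df = pd.DataFrame(
    {
        "tool": ["claude-prediction-online"] * 4,
        "request_month_year_week": ["Nov-03"] * 4,
        "market_creator": ["quickstart"] * 4,
        "error": [0, 0, 1, 0],
    }
)
error = get_error_data_by_market(tools_df, ["claude-prediction-online"])
print(error[["tool", "error_perc", "total_requests"]])
# -> error_perc 25.0, total_requests 4.0
```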
scripts/update_tools_accuracy.py
CHANGED
@@ -1,15 +1,12 @@
 import os
 import pandas as pd
 import ipfshttpclient
-from pathlib import Path
 from utils import INC_TOOLS
 from typing import List
+from utils import TMP_DIR, DATA_DIR
 
 ACCURACY_FILENAME = "tools_accuracy.csv"
 IPFS_SERVER = "/dns/registry.autonolas.tech/tcp/443/https"
-SCRIPTS_DIR = Path(__file__).parent
-ROOT_DIR = SCRIPTS_DIR.parent
-DATA_DIR = ROOT_DIR / "data"
 
 
 def update_tools_accuracy(
@@ -65,6 +62,7 @@ def update_tools_accuracy(
     print("tools to update")
     print(tools_to_update)
     existing_tools = list(tools_acc["tool"].values)
+    # dt.strftime("%Y-%m-%d %H:%M:%S")
     acc_info["min"] = acc_info["min"].dt.strftime("%Y-%m-%d %H:%M:%S")
     acc_info["max"] = acc_info["max"].dt.strftime("%Y-%m-%d %H:%M:%S")
     for tool in tools_to_update:
@@ -101,7 +99,7 @@ def update_tools_accuracy(
 def compute_tools_accuracy():
     print("Computing accuracy of tools")
     print("Reading tools parquet file")
-    tools = pd.read_parquet(
+    tools = pd.read_parquet(TMP_DIR / "tools.parquet")
     print(tools.head())
     # Computing tools accuracy information
     print("Computing tool accuracy information")
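The local `Path` plumbing is gone in favour of shared constants from `utils`, so every script agrees on where transient and published parquet files live. The exact definitions sit in utils.py, outside this diff; an illustrative layout consistent with how the constants are used here:

```python
from pathlib import Path

# Illustrative only: utils.py defines these, not this script.
ROOT_DIR = Path(__file__).parent.parent
DATA_DIR = ROOT_DIR / "data"  # published datasets read by the dashboard
TMP_DIR = ROOT_DIR / "tmp"    # intermediate files such as tools.parquet and t_map.pkl
```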
scripts/web3_utils.py
CHANGED
@@ -12,7 +12,15 @@ from tqdm import tqdm
 from web3 import Web3
 from typing import Any, Optional
 from web3.types import BlockParams
-from utils import
+from utils import (
+    JSON_DATA_DIR,
+    DATA_DIR,
+    SUBGRAPH_API_KEY,
+    to_content,
+    SUBGRAPH_URL,
+    HIST_DIR,
+    TMP_DIR,
+)
 from queries import conditional_tokens_gc_user_query, omen_xdai_trades_query
 import pandas as pd
 
@@ -96,11 +104,11 @@ def parallelize_timestamp_conversion(df: pd.DataFrame, function: callable) -> li
 def updating_timestamps(rpc: str, tools_filename: str):
     web3 = Web3(Web3.HTTPProvider(rpc))
 
-    tools = pd.read_parquet(
+    tools = pd.read_parquet(TMP_DIR / tools_filename)
 
     # Convert block number to timestamp
     print("Converting block number to timestamp")
-    t_map = pickle.load(open(
+    t_map = pickle.load(open(TMP_DIR / "t_map.pkl", "rb"))
     tools["request_time"] = tools["request_block"].map(t_map)
 
     no_data = tools["request_time"].isna().sum()
@@ -129,7 +137,7 @@ def updating_timestamps(rpc: str, tools_filename: str):
 
     # Save the tools data after the updates on the content
     print(f"Updating file {tools_filename} with timestamps")
-    tools.to_parquet(
+    tools.to_parquet(TMP_DIR / tools_filename, index=False)
 
     # Update t_map with new timestamps
     new_timestamps = (
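`t_map.pkl` is a cached block-number-to-timestamp mapping, so only blocks missing from the cache need an RPC round trip. Each resolution costs one `get_block` call, which is why the cache matters at this scale; the lookup itself reduces to (sketch, helper name ours):

```python
from web3 import Web3


def block_to_timestamp(w3: Web3, block_number: int) -> int:
    """Return the unix timestamp of a block via the connected RPC."""
    return w3.eth.get_block(block_number)["timestamp"]

# Usage sketch: fill only the gaps that t_map.pkl does not cover yet.
# w3 = Web3(Web3.HTTPProvider(rpc))
# t_map[block] = block_to_timestamp(w3, block)
```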
tabs/error.py
CHANGED
@@ -9,33 +9,14 @@ HEIGHT = 600
 WIDTH = 1000
 
 
-def get_error_data_by_market(
-    tools_df: pd.DataFrame, inc_tools: List[str]
-) -> pd.DataFrame:
-    """Gets the error data for the given tools and calculates the error percentage."""
-    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
-    error = (
-        tools_inc.groupby(
-            ["tool", "request_month_year_week", "market_creator", "error"], sort=False
-        )
-        .size()
-        .unstack()
-        .fillna(0)
-        .reset_index()
-    )
-    error["error_perc"] = (error[1] / (error[0] + error[1])) * 100
-    error["total_requests"] = error[0] + error[1]
-    return error
-
-
 def get_error_data_overall_by_market(error_df: pd.DataFrame) -> pd.DataFrame:
     """Gets the error data for the given tools and calculates the error percentage."""
     error_total = (
         error_df.groupby(["request_month_year_week", "market_creator"], sort=False)
-        .agg({"total_requests": "sum", 1: "sum", 0: "sum"})
+        .agg({"total_requests": "sum", "1": "sum", "0": "sum"})
         .reset_index()
     )
-    error_total["error_perc"] = (error_total[1] / error_total["total_requests"]) * 100
+    error_total["error_perc"] = (error_total["1"] / error_total["total_requests"]) * 100
     error_total.columns = error_total.columns.astype(str)
     error_total["error_perc"] = error_total["error_perc"].apply(lambda x: round(x, 4))
     return error_total
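The switch from integer keys (`1`, `0`) to string keys (`"1"`, `"0"`) follows from the new data flow: the counts frame now round-trips through error_by_markets.parquet, and Parquet requires string column names, so the unstacked 0/1 count columns come back as `"0"`/`"1"`. A small demonstration of the resulting access pattern, on toy data:

```python
import pandas as pd

# Toy frame mimicking error_by_markets after the parquet round trip.
df = pd.DataFrame({"total_requests": [10.0], 0: [7.0], 1: [3.0]})
df.columns = df.columns.astype(str)  # what writing/reading parquet enforces

weekly = df.agg({"total_requests": "sum", "1": "sum", "0": "sum"})
print((weekly["1"] / weekly["total_requests"]) * 100)  # 30.0 -> 30% error rate
```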
tabs/metrics.py
CHANGED
@@ -28,40 +28,6 @@ HEIGHT = 600
 WIDTH = 1000
 
 
-def get_metrics(
-    metric_name: str, column_name: str, market_creator: str, trades_df: pd.DataFrame
-) -> pd.DataFrame:
-    # this is to filter out the data before 2023-09-01
-    trades_filtered = trades_df[trades_df["creation_timestamp"] > "2023-09-01"]
-    if market_creator != "all":
-        trades_filtered = trades_filtered.loc[
-            trades_filtered["market_creator"] == market_creator
-        ]
-
-    trades_filtered = (
-        trades_filtered.groupby("month_year_week", sort=False)[column_name]
-        .quantile([0.25, 0.5, 0.75])
-        .unstack()
-    )
-    # reformat the data as percentile, date, value
-    trades_filtered = trades_filtered.melt(
-        id_vars=["month_year_week"], var_name="percentile", value_name=metric_name
-    )
-    trades_filtered.columns = trades_filtered.columns.astype(str)
-    trades_filtered.reset_index(inplace=True)
-    trades_filtered.columns = [
-        "month_year_week",
-        "25th_percentile",
-        "50th_percentile",
-        "75th_percentile",
-    ]
-    # reformat the data as percentile, date, value
-    trades_filtered = trades_filtered.melt(
-        id_vars=["month_year_week"], var_name="percentile", value_name=metric_name
-    )
-    return trades_filtered
-
-
 def get_boxplot_metrics(column_name: str, trades_df: pd.DataFrame) -> pd.DataFrame:
     trades_filtered = trades_df[
         ["creation_timestamp", "month_year_week", "market_creator", column_name]
@@ -81,45 +47,6 @@ def get_boxplot_metrics(column_name: str, trades_df: pd.DataFrame) -> pd.DataFra
     return all_filtered_trades
 
 
-def plot2_trade_details(
-    metric_name: str, market_creator: str, trades_df: pd.DataFrame
-) -> gr.Plot:
-    """Plots the trade details for the given trade detail."""
-
-    if metric_name == "mech calls":
-        metric_name = "mech_calls"
-        column_name = "num_mech_calls"
-        yaxis_title = "Nr of mech calls per trade"
-    elif metric_name == "ROI":
-        column_name = "roi"
-        yaxis_title = "ROI (net profit/cost)"
-    elif metric_name == "collateral amount":
-        metric_name = "collateral_amount"
-        column_name = metric_name
-        yaxis_title = "Collateral amount per trade (xDAI)"
-    elif metric_name == "net earnings":
-        metric_name = "net_earnings"
-        column_name = metric_name
-        yaxis_title = "Net profit per trade (xDAI)"
-    else:  # earnings
-        column_name = metric_name
-        yaxis_title = "Gross profit per trade (xDAI)"
-
-    trades_filtered = get_metrics(metric_name, column_name, market_creator, trades_df)
-    fig = px.line(
-        trades_filtered, x="month_year_week", y=metric_name, color="percentile"
-    )
-    fig.update_layout(
-        xaxis_title="Week",
-        yaxis_title=yaxis_title,
-        legend=dict(yanchor="top", y=0.5),
-    )
-    fig.update_xaxes(tickformat="%b %d\n%Y")
-    return gr.Plot(
-        value=fig,
-    )
-
-
 def plot_trade_metrics(
     metric_name: str, trades_df: pd.DataFrame, trader_filter: str = None
 ) -> gr.Plot:
tabs/tool_win.py
CHANGED
@@ -26,40 +26,6 @@ def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame:
     return tools
 
 
-def get_tool_winning_rate_by_market(
-    tools_df: pd.DataFrame, inc_tools: List[str]
-) -> pd.DataFrame:
-    """Gets the tool winning rate data for the given tools by market and calculates the winning percentage."""
-    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
-    tools_non_error = tools_inc[tools_inc["error"] != 1]
-    tools_non_error.loc[:, "currentAnswer"] = tools_non_error["currentAnswer"].replace(
-        {"no": "No", "yes": "Yes"}
-    )
-    tools_non_error = tools_non_error[
-        tools_non_error["currentAnswer"].isin(["Yes", "No"])
-    ]
-    tools_non_error = tools_non_error[tools_non_error["vote"].isin(["Yes", "No"])]
-    tools_non_error["win"] = (
-        tools_non_error["currentAnswer"] == tools_non_error["vote"]
-    ).astype(int)
-    tools_non_error.columns = tools_non_error.columns.astype(str)
-    wins = (
-        tools_non_error.groupby(
-            ["tool", "request_month_year_week", "market_creator", "win"], sort=False
-        )
-        .size()
-        .unstack()
-        .fillna(0)
-    )
-    wins["win_perc"] = (wins[1] / (wins[0] + wins[1])) * 100
-    wins.reset_index(inplace=True)
-    wins["total_request"] = wins[0] + wins[1]
-    wins.columns = wins.columns.astype(str)
-    # Convert request_month_year_week to string and explicitly set type for Altair
-    # wins["request_month_year_week"] = wins["request_month_year_week"].astype(str)
-    return wins
-
-
 def get_overall_winning_rate_by_market(wins_df: pd.DataFrame) -> pd.DataFrame:
     """Gets the overall winning rate data for the given tools and calculates the winning percentage."""
     overall_wins = (