arshy committed
Commit 1edb6cc · Parent: 4fe4cfe
data/all_trades_profitability.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b77385b7a1d673d3906867642f2b7f0cd1407fbb223d0719402dfed69eacce18
-size 8306850
+oid sha256:651c73abd6f2d68f12fa1b20363340c1ceff7652960fe4b47442b95865ef78ae
+size 8363176
data/delivers.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65a6a1bb45f34a5d03884c95b91b8bcf9fcd605bd8ddcbcfbb8a2d46ff7c874d
-size 1740508783
+oid sha256:873f67b40ebc7dd4ce409ffde5d00a269fc41a24c9627c7df11f02fe3101a389
+size 1777877349
data/fpmmTrades.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:666289cd9f5d0416d0fd435f1112e6f8c5d6d68e5658d4628fa100b35d585302
-size 20896303
+oid sha256:db7352aa0dcf2ffd2f3c86a2edbcb13dca42c9d5089787a5a73399065a3e6444
+size 21257018
data/requests.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51b9e0a5cf0d548af32a14b4adb536238084053f25ba22f09446331054c36810
-size 47481087
+oid sha256:f89b9db573611cd096e6b17c909842690c3bd2f38f0763e4a809ccfe0ef718d6
+size 48251533
data/summary_profitability.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7371fc22b1f05b2501cc6fd535df09d2059e302c2da1a67ce9d7d866810f85c4
-size 52478
+oid sha256:f6b13394febf32397270399196772b87014367fd2131fe15d87deb53771b6f60
+size 52459
data/t_map.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c713d04aeb16fdcf2fb6db1efa4752d00bb778d6ed83b67e8606fc51b67a367
-size 7919513
+oid sha256:c73106e6ae68724a551c807e8a67d209878ecaf2badaae84307fd9ccb9c9cff9
+size 8126752
data/tools.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a89ecde0ccdf064c23d987f34a9f2ebfd62d03cabeddff3c8f26673ee46d727a
-size 1746997644
+oid sha256:bcbf8e0f5725fbe23b7b069031292ddfa8c0fb1425fca5b730492583ba175d54
+size 1784716177
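
Note: the data/*.parquet and data/t_map.pkl entries above are Git LFS pointer files, so each diff only swaps the blob's sha256 oid and byte size; the refreshed data itself lives in LFS storage. A minimal sketch for verifying a fetched file against its pointer (the path is just an example):

    import hashlib, os

    # Recompute the LFS oid (sha256 of the file bytes) and the size,
    # then compare them with the pointer's `oid` and `size` lines.
    path = 'data/all_trades_profitability.parquet'
    with open(path, 'rb') as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    print(digest, os.path.getsize(path))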
scripts/tools.py CHANGED
@@ -267,19 +267,19 @@ class MechResponse:
             if isinstance(self.result, str):
                 kwargs = json.loads(self.result)
                 self.result = PredictionResponse(**kwargs)
-                self.error = str(False)
+                self.error = 0
 
         except JSONDecodeError:
             self.error_message = "Response parsing error"
-            self.error = str(True)
+            self.error = 1
 
         except Exception as e:
             self.error_message = str(e)
-            self.error = str(True)
+            self.error = 1
 
         else:
             self.error_message = "Invalid response from tool"
-            self.error = str(True)
+            self.error = 1
             self.result = None
 
 
@@ -616,6 +616,7 @@ def store_progress(
     tools: pd.DataFrame,
 ) -> None:
     """Store the given progress."""
+    print("starting")
     if filename:
         DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists
         for event_name, content in event_to_contents.items():
@@ -623,8 +624,6 @@ def store_progress(
             try:
                 if "result" in content.columns:
                     content = content.drop(columns=["result"])  # Avoid in-place modification
-                if 'error' in content.columns:
-                    content['error'] = content['error'].astype(bool)
                 content.to_parquet(DATA_DIR / event_filename, index=False)
             except Exception as e:
                 print(f"Failed to write {event_name}: {e}")
@@ -632,8 +631,6 @@ def store_progress(
         try:
             if "result" in tools.columns:
                 tools = tools.drop(columns=["result"])
-            if 'error' in tools.columns:
-                tools['error'] = tools['error'].astype(bool)
             tools.to_parquet(DATA_DIR / filename, index=False)
         except Exception as e:
             print(f"Failed to write tools data: {e}")
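
Note on the MechResponse change: `str(False)` and `str(True)` store the strings "False" and "True", which are both truthy and never compare equal to the booleans that downstream filters use; plain integers 0/1 avoid that and round-trip cleanly through parquet. A standalone sketch of the pitfall being removed (not repository code):

    # Old flag: a non-empty string, so always truthy.
    old_flag = str(False)           # the string "False"
    assert bool(old_flag) is True   # truthy despite meaning "no error"
    assert old_flag != False        # never equal to the boolean

    # New flag: integer encoding used after this commit.
    new_flag = 0
    assert new_flag != 1            # non-error rows pass an `error != 1` filter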
tabs/error.py CHANGED
@@ -19,14 +19,14 @@ def get_error_data(tools_df: pd.DataFrame, inc_tools: List[str]) -> pd.DataFrame
     tools_inc = tools_df[tools_df['tool'].isin(inc_tools)]
     # tools_inc['error'] = tools_inc.apply(set_error, axis=1)
     error = tools_inc.groupby(['tool', 'request_month_year_week', 'error']).size().unstack().fillna(0).reset_index()
-    error['error_perc'] = (error[True] / (error[False] + error[True])) * 100
-    error['total_requests'] = error[False] + error[True]
+    error['error_perc'] = (error[1] / (error[0] + error[1])) * 100
+    error['total_requests'] = error[0] + error[1]
     return error
 
 def get_error_data_overall(error_df: pd.DataFrame) -> pd.DataFrame:
     """Gets the error data for the given tools and calculates the error percentage."""
-    error_total = error_df.groupby('request_month_year_week').agg({'total_requests': 'sum', False: 'sum', True: 'sum'}).reset_index()
-    error_total['error_perc'] = (error_total[True] / error_total['total_requests']) * 100
+    error_total = error_df.groupby('request_month_year_week').agg({'total_requests': 'sum', 1: 'sum', 0: 'sum'}).reset_index()
+    error_total['error_perc'] = (error_total[1] / error_total['total_requests']) * 100
     error_total.columns = error_total.columns.astype(str)
     error_total['error_perc'] = error_total['error_perc'].apply(lambda x: round(x, 4))
     return error_total
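
The column keys change because `unstack()` labels the pivoted columns with the values of the `error` level: now that `error` holds 0/1 rather than False/True, the counts land in integer-keyed columns. A toy reproduction (the data is invented; only the column names mirror the real frame):

    import pandas as pd

    df = pd.DataFrame({
        'tool': ['a', 'a', 'b', 'b'],
        'request_month_year_week': ['2024-21'] * 4,
        'error': [0, 1, 0, 0],
    })
    error = df.groupby(['tool', 'request_month_year_week', 'error']).size().unstack().fillna(0).reset_index()
    # Columns are keyed by the error values, i.e. 0 and 1:
    error['error_perc'] = (error[1] / (error[0] + error[1])) * 100
    print(error)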
tabs/tool_win.py CHANGED
@@ -7,20 +7,11 @@ HEIGHT=600
 WIDTH=1000
 
 
-# def set_error(row: pd.Series) -> bool:
-#     """Sets the error for the given row."""
-#     if row.error not in [True, False]:
-#         if not row.prompt_response:
-#             return True
-#         return False
-#     return row.error
-
-
 def get_tool_winning_rate(tools_df: pd.DataFrame, inc_tools: List[str]) -> pd.DataFrame:
     """Gets the tool winning rate data for the given tools and calculates the winning percentage."""
     tools_inc = tools_df[tools_df['tool'].isin(inc_tools)]
     # tools_inc['error'] = tools_inc.apply(set_error, axis=1)
-    tools_non_error = tools_inc[tools_inc['error'] != True]
+    tools_non_error = tools_inc[tools_inc['error'] != 1]
     tools_non_error.loc[:, 'currentAnswer'] = tools_non_error['currentAnswer'].replace({'no': 'No', 'yes': 'Yes'})
     tools_non_error = tools_non_error[tools_non_error['currentAnswer'].isin(['Yes', 'No'])]
     tools_non_error = tools_non_error[tools_non_error['vote'].isin(['Yes', 'No'])]
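
With the integer encoding, `tools_inc['error'] != 1` keeps the non-error rows just as `!= True` did for booleans, including rows where the flag is missing. A quick illustration (values invented):

    import pandas as pd

    s = pd.Series([0, 1, None])  # stand-in for the error column
    print(s != 1)                # True, False, True: 0 and NaN both pass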
test.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -20,134 +20,362 @@
     "from enum import Enum\n",
     "from tqdm import tqdm\n",
     "import numpy as np\n",
-    "from pathlib import Path"
+    "from pathlib import Path\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# trades = pd.read_parquet('/Users/arshath/play/openautonomy/olas-prediction-live-dashboard_old/data/all_trades_profitability.parquet')\n",
+    "tools = pd.read_parquet('/Users/arshath/play/openautonomy/olas-prediction-live-dashboard_old/data/tools.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tools.groupby(['request_month_year_week', 'error']).size().unstack()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t_map = pickle.load(open('./data/t_map.pkl', 'rb'))\n",
+    "tools['request_time'] = tools['request_block'].map(t_map)\n",
+    "tools.to_parquet('./data/tools.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tools['request_time'] = pd.to_datetime(tools['request_time'])\n",
+    "tools = tools[tools['request_time'] >= pd.to_datetime('2024-05-01')]\n",
+    "tools['request_block'].max()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "requests = pd.read_parquet(\"./data/requests.parquet\")\n",
+    "delivers = pd.read_parquet(\"./data/delivers.parquet\")\n",
+    "print(requests.shape)\n",
+    "print(delivers.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "requests[requests['request_block'] <= 33714082].reset_index(drop=True).to_parquet(\"./data/requests.parquet\")\n",
+    "delivers[delivers['deliver_block'] <= 33714082].reset_index(drop=True).to_parquet(\"./data/delivers.parquet\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "INC_TOOLS = [\n",
-    "    'prediction-online', \n",
-    "    'prediction-offline', \n",
-    "    'claude-prediction-online', \n",
-    "    'claude-prediction-offline', \n",
-    "    'prediction-offline-sme',\n",
-    "    'prediction-online-sme',\n",
-    "    'prediction-request-rag',\n",
-    "    'prediction-request-reasoning',\n",
-    "    'prediction-url-cot-claude', \n",
-    "    'prediction-request-rag-claude',\n",
-    "    'prediction-request-reasoning-claude'\n",
-    "]"
+    "import sys \n",
+    "\n",
+    "sys.path.append('./')\n",
+    "from scripts.tools import *"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/var/folders/l_/g22b1g_n0gn4tmx9lkxqv5x00000gn/T/ipykernel_58769/3518445359.py:5: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
-      "  trades_df['month_year'] = trades_df['creation_timestamp'].dt.to_period('M').astype(str)\n",
-      "/var/folders/l_/g22b1g_n0gn4tmx9lkxqv5x00000gn/T/ipykernel_58769/3518445359.py:6: UserWarning: Converting to PeriodArray/Index representation will drop timezone information.\n",
-      "  trades_df['month_year_week'] = trades_df['creation_timestamp'].dt.to_period('W').astype(str)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "def prepare_trades(trades_df: pd.DataFrame) -> pd.DataFrame:\n",
-    "    \"\"\"Prepares the trades data for analysis.\"\"\"\n",
-    "    trades_df['creation_timestamp'] = pd.to_datetime(trades_df['creation_timestamp'])\n",
-    "    trades_df['creation_timestamp'] = trades_df['creation_timestamp'].dt.tz_convert('UTC')\n",
-    "    trades_df['month_year'] = trades_df['creation_timestamp'].dt.to_period('M').astype(str)\n",
-    "    trades_df['month_year_week'] = trades_df['creation_timestamp'].dt.to_period('W').astype(str)\n",
-    "    trades_df['winning_trade'] = trades_df['winning_trade'].astype(int)\n",
-    "    return trades_df\n",
+    "RPCs = [\n",
+    "    \"https://lb.nodies.app/v1/406d8dcc043f4cb3959ed7d6673d311a\",\n",
+    "]\n",
+    "w3s = [Web3(HTTPProvider(r)) for r in RPCs]\n",
+    "session = create_session()\n",
+    "event_to_transformer = {\n",
+    "    MechEventName.REQUEST: transform_request,\n",
+    "    MechEventName.DELIVER: transform_deliver,\n",
+    "}\n",
+    "mech_to_info = {\n",
+    "    to_checksum_address(address): (\n",
+    "        os.path.join(CONTRACTS_PATH, filename),\n",
+    "        earliest_block,\n",
+    "    )\n",
+    "    for address, (filename, earliest_block) in MECH_TO_INFO.items()\n",
+    "}\n",
+    "event_to_contents = {}\n",
     "\n",
-    "def prepare_data():\n",
-    "    tools_df = pd.read_parquet(\"./data/tools.parquet\")\n",
-    "    trades_df = pd.read_parquet(\"./data/all_trades_profitability.parquet\")\n",
+    "# latest_block = w3s[0].eth.get_block(LATEST_BLOCK_NAME)[BLOCK_DATA_NUMBER]\n",
+    "latest_block = 34032575\n",
     "\n",
-    "    tools_df['request_time'] = pd.to_datetime(tools_df['request_time'])\n",
-    "    tools_df = tools_df[tools_df['request_time'].dt.year == 2024]\n",
+    "next_start_block = latest_block - 300\n",
     "\n",
-    "    trades_df['creation_timestamp'] = pd.to_datetime(trades_df['creation_timestamp'])\n",
-    "    trades_df = trades_df[trades_df['creation_timestamp'].dt.year == 2024]\n",
+    "events_request = []\n",
+    "events_deliver = []\n",
+    "# Loop through events in event_to_transformer\n",
+    "for event_name, transformer in event_to_transformer.items():\n",
+    "    print(f\"Fetching {event_name.value} events\")\n",
+    "    for address, (abi, earliest_block) in mech_to_info.items():\n",
+    "        # parallelize the fetching of events\n",
+    "        with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:\n",
+    "            futures = []\n",
+    "            for i in range(\n",
+    "                next_start_block, latest_block, BLOCKS_CHUNK_SIZE * SNAPSHOT_RATE\n",
+    "            ):\n",
+    "                futures.append(\n",
+    "                    executor.submit(\n",
+    "                        get_events,\n",
+    "                        random.choice(w3s),\n",
+    "                        event_name.value,\n",
+    "                        address,\n",
+    "                        abi,\n",
+    "                        i,\n",
+    "                        min(i + BLOCKS_CHUNK_SIZE * SNAPSHOT_RATE, latest_block),\n",
+    "                    )\n",
+    "                )\n",
     "\n",
-    "    trades_df = prepare_trades(trades_df)\n",
-    "    return tools_df, trades_df\n",
+    "            for future in tqdm(\n",
+    "                as_completed(futures),\n",
+    "                total=len(futures),\n",
+    "                desc=f\"Fetching {event_name.value} Events\",\n",
+    "            ):\n",
+    "                current_mech_events = future.result()\n",
+    "                if event_name == MechEventName.REQUEST:\n",
+    "                    events_request.extend(current_mech_events)\n",
+    "                elif event_name == MechEventName.DELIVER:\n",
+    "                    events_deliver.extend(current_mech_events)\n",
     "\n",
-    "tools_df, trades_df = prepare_data()"
+    "    parsed_request = parse_events(events_request)\n",
+    "    parsed_deliver = parse_events(events_deliver)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Index(['trader_address', 'trade_id', 'creation_timestamp', 'title',\n",
-       "       'market_status', 'collateral_amount', 'outcome_index',\n",
-       "       'trade_fee_amount', 'outcomes_tokens_traded', 'current_answer',\n",
-       "       'is_invalid', 'winning_trade', 'earnings', 'redeemed',\n",
-       "       'redeemed_amount', 'num_mech_calls', 'mech_fee_amount', 'net_earnings',\n",
-       "       'roi', 'month_year', 'month_year_week'],\n",
-       "      dtype='object')"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "trades_df.columns"
+    "contents_request = []\n",
+    "with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:\n",
+    "    futures = []\n",
+    "    for i in range(0, len(parsed_request), GET_CONTENTS_BATCH_SIZE):\n",
+    "        futures.append(\n",
+    "            executor.submit(\n",
+    "                get_contents,\n",
+    "                session,\n",
+    "                parsed_request[i : i + GET_CONTENTS_BATCH_SIZE],\n",
+    "                MechEventName.REQUEST,\n",
+    "            )\n",
+    "        )\n",
+    "\n",
+    "    for future in tqdm(\n",
+    "        as_completed(futures),\n",
+    "        total=len(futures),\n",
+    "        desc=f\"Fetching {event_name.value} Contents\",\n",
+    "    ):\n",
+    "        current_mech_contents = future.result()\n",
+    "        contents_request.append(current_mech_contents)\n",
+    "\n",
+    "contents_request = pd.concat(contents_request, ignore_index=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_error_data(tools_df: pd.DataFrame, inc_tools: List[str]) -> pd.DataFrame:\n",
-    "    \"\"\"Gets the error data for the given tools and calculates the error percentage.\"\"\"\n",
-    "    tools_inc = tools_df[tools_df['tool'].isin(inc_tools)]\n",
-    "    error = tools_inc.groupby(['tool', 'request_month_year_week', 'error']).size().unstack(fill_value=0).reset_index()\n",
-    "    error['error_perc'] = (error[True] / (error[False] + error[True])) * 100\n",
-    "    error['total_requests'] = error[False] + error[True]\n",
-    "    return error\n",
+    "contents_deliver = []\n",
+    "with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:\n",
+    "    futures = []\n",
+    "    for i in range(0, len(parsed_deliver), GET_CONTENTS_BATCH_SIZE):\n",
+    "        futures.append(\n",
+    "            executor.submit(\n",
+    "                get_contents,\n",
+    "                session,\n",
+    "                parsed_deliver[i : i + GET_CONTENTS_BATCH_SIZE],\n",
+    "                MechEventName.DELIVER,\n",
+    "            )\n",
+    "        )\n",
+    "\n",
+    "    for future in tqdm(\n",
+    "        as_completed(futures),\n",
+    "        total=len(futures),\n",
+    "        desc=f\"Fetching {event_name.value} Contents\",\n",
+    "    ):\n",
+    "        current_mech_contents = future.result()\n",
+    "        contents_deliver.append(current_mech_contents)\n",
     "\n",
-    "def get_error_data_overall(error_df: pd.DataFrame) -> pd.DataFrame:\n",
-    "    \"\"\"Gets the error data for the given tools and calculates the error percentage.\"\"\"\n",
-    "    error_total = error_df.groupby('request_month_year_week').agg({'total_requests': 'sum', False: 'sum', True: 'sum'}).reset_index()\n",
-    "    error_total['error_perc'] = (error_total[True] / error_total['total_requests']) * 100\n",
-    "    error_total.columns = error_total.columns.astype(str)\n",
-    "    error_total['error_perc'] = error_total['error_perc'].apply(lambda x: round(x, 4))\n",
-    "    return error_total"
+    "contents_deliver = pd.concat(contents_deliver, ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "full_contents = True\n",
+    "transformed_request = event_to_transformer[MechEventName.REQUEST](contents_request)\n",
+    "transformed_deliver = event_to_transformer[MechEventName.DELIVER](contents_deliver, full_contents=full_contents)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "error_df = get_error_data(\n",
-    "    tools_df=tools_df,\n",
-    "    inc_tools=INC_TOOLS\n",
-    ")\n",
-    "error_overall_df = get_error_data_overall(\n",
-    "    error_df=error_df\n",
-    ")"
+    "transformed_request.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transformed_deliver.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tools = pd.merge(transformed_request, transformed_deliver, on=REQUEST_ID_FIELD)\n",
+    "tools.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def store_progress(\n",
+    "    filename: str,\n",
+    "    event_to_contents: Dict[str, pd.DataFrame],\n",
+    "    tools: pd.DataFrame,\n",
+    ") -> None:\n",
+    "    \"\"\"Store the given progress.\"\"\"\n",
+    "    if filename:\n",
+    "        DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists\n",
+    "        for event_name, content in event_to_contents.items():\n",
+    "            event_filename = gen_event_filename(event_name)  # Ensure this function returns a valid filename string\n",
+    "            try:\n",
+    "                if \"result\" in content.columns:\n",
+    "                    content = content.drop(columns=[\"result\"])  # Avoid in-place modification\n",
+    "                if 'error' in content.columns:\n",
+    "                    content['error'] = content['error'].astype(bool)\n",
+    "                content.to_parquet(DATA_DIR / event_filename, index=False)\n",
+    "            except Exception as e:\n",
+    "                print(f\"Failed to write {event_name}: {e}\")\n",
+    "        try:\n",
+    "            if \"result\" in tools.columns:\n",
+    "                tools = tools.drop(columns=[\"result\"])\n",
+    "            if 'error' in tools.columns:\n",
+    "                tools['error'] = tools['error'].astype(bool)\n",
+    "            tools.to_parquet(DATA_DIR / filename, index=False)\n",
+    "        except Exception as e:\n",
+    "            print(f\"Failed to write tools data: {e}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# store_progress(filename, event_to_contents, tools)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if 'result' in transformed_deliver.columns:\n",
+    "    transformed_deliver = transformed_deliver.drop(columns=['result'])\n",
+    "if 'error' in transformed_deliver.columns:\n",
+    "    transformed_deliver['error'] = transformed_deliver['error'].astype(bool)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transformed_deliver.to_parquet(\"transformed_deliver.parquet\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "d = pd.read_parquet(\"transformed_deliver.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### duck db"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import duckdb\n",
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "# Calculate the date for two months ago\n",
+    "two_months_ago = (datetime.now() - timedelta(days=60)).strftime('%Y-%m-%d')\n",
+    "\n",
+    "# Connect to an in-memory DuckDB instance\n",
+    "con = duckdb.connect(':memory:')\n",
+    "\n",
+    "# Perform a SQL query to select data from the past two months directly from the Parquet file\n",
+    "query = f\"\"\"\n",
+    "SELECT *\n",
+    "FROM read_parquet('/Users/arshath/play/openautonomy/olas-prediction-live-dashboard_old/data/tools.parquet')\n",
+    "WHERE request_time >= '{two_months_ago}'\n",
+    "\"\"\"\n",
+    "\n",
+    "# Fetch the result as a pandas DataFrame\n",
+    "df = con.execute(query).fetchdf()\n",
+    "\n",
+    "# Close the connection\n",
+    "con.close()\n",
+    "\n",
+    "# Print the DataFrame\n",
+    "print(df)"
    ]
   },
   {