Commit c811726 · committed by cyberosa
Parent(s): 570e118

cleaning, new notebooks and two months data logic

Files changed:
- app.py +105 -110
- notebooks/confidence_analysis.ipynb +0 -0
- scripts/get_mech_info.py +89 -0
- scripts/profitability.py +16 -16
- scripts/pull_data.py +39 -22
- scripts/tools.py +48 -136
- scripts/utils.py +110 -0
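The "two months data logic" in the commit message boils down to anchoring every query on a cutoff taken 60 days before the current time, both as a unix timestamp (for the subgraph) and as a date string (for the parquet queries in app.py). A minimal sketch of that cutoff computation, mirroring the datetime arithmetic used in the diff below (variable names here are illustrative; requires Python 3.11+ for `datetime.UTC`):

import datetime as dt

# 60-day (two months) cutoff, mirroring DATETIME_60_DAYS_AGO in scripts/get_mech_info.py
cutoff_dt = dt.datetime.now(dt.UTC) - dt.timedelta(days=60)
cutoff_timestamp = int(cutoff_dt.timestamp())      # used when querying the subgraph
cutoff_date = cutoff_dt.strftime("%Y-%m-%d")       # used when filtering parquet data in app.py

print(cutoff_timestamp, cutoff_date)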
app.py
CHANGED
@@ -4,25 +4,25 @@ import pandas as pd
 import duckdb
 import logging
 from tabs.trades import (
     prepare_trades,
     get_overall_trades,
     get_overall_winning_trades,
     plot_trades_by_week,
     plot_winning_trades_by_week,
-    plot_trade_details
+    plot_trade_details,
 )
 from tabs.tool_win import (
     get_tool_winning_rate,
     get_overall_winning_rate,
     plot_tool_winnings_overall,
-    plot_tool_winnings_by_tool
+    plot_tool_winnings_by_tool,
 )
 from tabs.error import (
     get_error_data,
     get_error_data_overall,
     plot_error_data,
     plot_tool_error_data,
-    plot_week_error_data
+    plot_week_error_data,
 )
 from tabs.about import about_olas_predict

@@ -33,21 +33,25 @@ def get_logger():
     # stream handler and formatter
     stream_handler = logging.StreamHandler()
     stream_handler.setLevel(logging.DEBUG)
-    formatter = logging.Formatter(
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
     stream_handler.setFormatter(formatter)
     logger.addHandler(stream_handler)
     return logger


 logger = get_logger()


 def get_last_one_month_data():
     """
     Get the last one month data from the tools.parquet file
     """
     logger.info("Getting last one month data")
-    con = duckdb.connect(
-    one_months_ago = (datetime.now() - timedelta(days=60)).strftime(
+    con = duckdb.connect(":memory:")
+    one_months_ago = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
+
     # Query to fetch data from all_trades_profitability.parquet
     query2 = f"""
         SELECT *
@@ -69,19 +73,47 @@ def get_last_one_month_data():

     return df1, df2


+def get_all_data():
+    """
+    Get all data from the tools.parquet and all_trades_profitability.parquet files
+    """
+    logger.info("Getting all data")
+    con = duckdb.connect(":memory:")
+
+    # Query to fetch data from all_trades_profitability.parquet
+    query2 = f"""
+        SELECT *
+        FROM read_parquet('./data/all_trades_profitability.parquet')
+    """
+    df2 = con.execute(query2).fetchdf()
+    logger.info("Got all data from all_trades_profitability.parquet")
+
+    query1 = f"""
+        SELECT *
+        FROM read_parquet('./data/tools.parquet')
+    """
+    df1 = con.execute(query1).fetchdf()
+    logger.info("Got all data from tools.parquet")
+
+    con.close()
+
+    return df1, df2
+
+
 def prepare_data():
     """
     Prepare the data for the dashboard
     """
-    tools_df, trades_df =
+    tools_df, trades_df = get_all_data()

-    tools_df[
-    trades_df[
+    tools_df["request_time"] = pd.to_datetime(tools_df["request_time"])
+    trades_df["creation_timestamp"] = pd.to_datetime(trades_df["creation_timestamp"])

     trades_df = prepare_trades(trades_df)
     return tools_df, trades_df


 tools_df, trades_df = prepare_data()

@@ -89,53 +121,39 @@ demo = gr.Blocks()

 INC_TOOLS = [
+    "prediction-online",
+    "prediction-offline",
+    "claude-prediction-online",
+    "claude-prediction-offline",
+    "prediction-offline-sme",
+    "prediction-online-sme",
+    "prediction-request-rag",
+    "prediction-request-reasoning",
+    "prediction-url-cot-claude",
+    "prediction-request-rag-claude",
+    "prediction-request-reasoning-claude",
 ]


+error_df = get_error_data(tools_df=tools_df, inc_tools=INC_TOOLS)
+error_overall_df = get_error_data_overall(error_df=error_df)
+winning_rate_df = get_tool_winning_rate(tools_df=tools_df, inc_tools=INC_TOOLS)
+winning_rate_overall_df = get_overall_winning_rate(wins_df=winning_rate_df)
+trades_count_df = get_overall_trades(trades_df=trades_df)
+trades_winning_rate_df = get_overall_winning_trades(trades_df=trades_df)

 with demo:
     gr.HTML("<h1>Olas Predict Actual Performance</h1>")
+    gr.Markdown(
+        "This app shows the actual performance of Olas Predict tools on the live market."
+    )

     with gr.Tabs():
         with gr.TabItem("🔥Trades Dashboard"):
             with gr.Row():
                 gr.Markdown("# Plot of number of trades by week")
             with gr.Row():
+                trades_by_week_plot = plot_trades_by_week(trades_df=trades_count_df)
             with gr.Row():
                 gr.Markdown("# Plot of winning trades by week")
             with gr.Row():
@@ -146,32 +164,30 @@ with demo:
                 gr.Markdown("# Plot of trade details")
             with gr.Row():
                 trade_details_selector = gr.Dropdown(
+                    label="Select a trade",
                     choices=[
                         "mech calls",
                         "collateral amount",
                         "earnings",
                         "net earnings",
+                        "ROI",
                     ],
+                    value="mech calls",
                 )
             with gr.Row():
                 trade_details_plot = plot_trade_details(
+                    trade_detail="mech calls", trades_df=trades_df
                 )
+
             def update_trade_details(trade_detail):
                 return plot_trade_details(
+                    trade_detail=trade_detail, trades_df=trades_df
                 )

             trade_details_selector.change(
+                update_trade_details,
+                inputs=trade_details_selector,
+                outputs=trade_details_plot,
             )

             with gr.Row():
@@ -185,27 +201,25 @@ with demo:

             with gr.Row():
                 winning_selector = gr.Dropdown(
+                    label="Select Metric",
+                    choices=["losses", "wins", "total_request", "win_perc"],
+                    value="win_perc",
                 )

             with gr.Row():
                 winning_plot = plot_tool_winnings_overall(
+                    wins_df=winning_rate_overall_df, winning_selector="win_perc"
                 )

             def update_tool_winnings_overall_plot(winning_selector):
                 return plot_tool_winnings_overall(
+                    wins_df=winning_rate_overall_df, winning_selector=winning_selector
                 )

             winning_selector.change(
                 update_tool_winnings_overall_plot,
+                inputs=winning_selector,
+                outputs=winning_plot,
             )

             with gr.Row():
@@ -215,30 +229,24 @@ with demo:

             with gr.Row():
                 gr.Markdown("# Plot showing winning rate by tool")
+
             with gr.Row():
                 sel_tool = gr.Dropdown(
+                    label="Select a tool", choices=INC_TOOLS, value=INC_TOOLS[0]
                 )

             with gr.Row():
                 tool_winnings_by_tool_plot = plot_tool_winnings_by_tool(
+                    wins_df=winning_rate_df, tool=INC_TOOLS[0]
                 )

             def update_tool_winnings_by_tool_plot(tool):
+                return plot_tool_winnings_by_tool(wins_df=winning_rate_df, tool=tool)

             sel_tool.change(
                 update_tool_winnings_by_tool_plot,
+                inputs=sel_tool,
+                outputs=tool_winnings_by_tool_plot,
             )

             with gr.Row():
@@ -250,35 +258,24 @@ with demo:
             with gr.Row():
                 gr.Markdown("# Plot showing overall error")
             with gr.Row():
+                error_overall_plot = plot_error_data(error_all_df=error_overall_df)
             with gr.Row():
                 gr.Markdown("# Plot showing error by tool")
             with gr.Row():
                 sel_tool = gr.Dropdown(
+                    label="Select a tool", choices=INC_TOOLS, value=INC_TOOLS[0]
                 )

             with gr.Row():
                 tool_error_plot = plot_tool_error_data(
+                    error_df=error_df, tool=INC_TOOLS[0]
                 )

             def update_tool_error_plot(tool):
+                return plot_tool_error_data(error_df=error_df, tool=tool)

             sel_tool.change(
+                update_tool_error_plot, inputs=sel_tool, outputs=tool_error_plot
             )
             with gr.Row():
                 sel_tool
@@ -289,29 +286,27 @@ with demo:
             gr.Markdown("# Plot showing error by week")

             with gr.Row():
+                choices = error_overall_df["request_month_year_week"].unique().tolist()
                 # sort the choices by the latest week to be on the top
                 choices = sorted(choices)
                 sel_week = gr.Dropdown(
+                    label="Select a week", choices=choices, value=choices[-1]
+                )

             with gr.Row():
                 week_error_plot = plot_week_error_data(
+                    error_df=error_df, week=choices[-1]
                 )

             def update_week_error_plot(selected_week):
+                return plot_week_error_data(error_df=error_df, week=selected_week)

+            sel_tool.change(
+                update_tool_error_plot, inputs=sel_tool, outputs=tool_error_plot
+            )
+            sel_week.change(
+                update_week_error_plot, inputs=sel_week, outputs=week_error_plot
+            )

             with gr.Row():
                 sel_tool
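The rendered hunk for get_last_one_month_data cuts off right after `SELECT *`, so the actual date filter used against `one_months_ago` is not visible here. As an assumption-labeled sketch only (column name and filter shape are guesses, not the repo's confirmed query), a DuckDB query that restricts a parquet file to rows newer than the computed cutoff would look roughly like this:

import duckdb
from datetime import datetime, timedelta

con = duckdb.connect(":memory:")
cutoff = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")

# Hypothetical filter: the column app.py actually filters on is not shown in the hunk above.
query = f"""
    SELECT *
    FROM read_parquet('./data/all_trades_profitability.parquet')
    WHERE creation_timestamp >= '{cutoff}'
"""
df = con.execute(query).fetchdf()
con.close()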
notebooks/confidence_analysis.ipynb
ADDED
The diff for this file is too large to render.
scripts/get_mech_info.py
ADDED
@@ -0,0 +1,89 @@
from dataclasses import dataclass
from string import Template
from typing import Any
from datetime import datetime, timedelta, UTC
import requests

MECH_SUBGRAPH_URL = "https://api.thegraph.com/subgraphs/name/stakewise/ethereum-gnosis"
SUBGRAPH_HEADERS = {
    "Accept": "application/json, multipart/mixed",
    "Content-Type": "application/json",
}
QUERY_BATCH_SIZE = 1000
DATETIME_60_DAYS_AGO = datetime.now(UTC) - timedelta(days=60)
BLOCK_NUMBER = Template(
    """
    {
        blocks(
            first: 1,
            orderBy: timestamp,
            orderDirection: asc,
            where: {
                timestamp_gte: "${timestamp_from}",
                timestamp_lte: "${timestamp_to}"
            }
        ){
            id
        }
    }
    """
)


def fetch_block_number(timestamp_from: int, timestamp_to: int) -> dict:
    """Get a block number by its timestamp margins."""

    query = BLOCK_NUMBER.substitute(
        timestamp_from=timestamp_from, timestamp_to=timestamp_to
    )
    # print(f"Sending query for the subgraph = {query}")

    response = requests.post(
        MECH_SUBGRAPH_URL,
        headers=SUBGRAPH_HEADERS,
        json={"query": query},
        timeout=300,
    )

    result_json = response.json()
    print(f"Response of the query={result_json}")
    blocks = result_json.get("data", {}).get("blocks", "")
    return blocks[0]


def get_mech_info_last_60_days() -> dict[str, Any]:
    """Query the subgraph to get the last 60 days of information from mech."""

    timestamp_60_days_ago = int((DATETIME_60_DAYS_AGO).timestamp())
    margin = timedelta(seconds=5)
    timestamp_60_days_ago_plus_margin = int((DATETIME_60_DAYS_AGO + margin).timestamp())

    last_month_block_number = fetch_block_number(
        timestamp_60_days_ago, timestamp_60_days_ago_plus_margin
    )
    # expecting only one block
    last_month_block_number = last_month_block_number.get("id", "")
    if last_month_block_number.isdigit():
        last_month_block_number = int(last_month_block_number)

    if last_month_block_number == "":
        raise ValueError("Could not find a valid block number for last month data")

    MECH_TO_INFO = {
        # this block number is when the creator had its first tx ever, and after this mech's creation
        "0xff82123dfb52ab75c417195c5fdb87630145ae81": (
            "old_mech_abi.json",
            last_month_block_number,
        ),
        # this block number is when this mech was created
        "0x77af31de935740567cf4ff1986d04b2c964a786a": (
            "new_mech_abi.json",
            last_month_block_number,
        ),
    }
    return MECH_TO_INFO


if __name__ == "__main__":
    result = get_mech_info_last_60_days()
    print(result)
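fetch_block_number above indexes `blocks[0]` directly, which raises if the subgraph returns no block inside the 5-second window. A small, assumption-labeled sketch of the response shape the code expects and a slightly more defensive way to read it (the helper name and the example block id are illustrative, not part of the repo):

# Shape the code expects from the blocks subgraph (illustrative values):
# {"data": {"blocks": [{"id": "29111111"}]}}

def first_block_id(result_json: dict) -> int:
    """Return the first block id from a blocks query response, or raise if absent/invalid."""
    blocks = result_json.get("data", {}).get("blocks", [])
    if not blocks:
        raise ValueError("No block found in the requested timestamp window")
    block_id = blocks[0].get("id", "")
    if not str(block_id).isdigit():
        raise ValueError(f"Unexpected block id: {block_id!r}")
    return int(block_id)

print(first_block_id({"data": {"blocks": [{"id": "29111111"}]}}))  # 29111111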
scripts/profitability.py
CHANGED
@@ -28,6 +28,7 @@ from enum import Enum
 from tqdm import tqdm
 import numpy as np
 from pathlib import Path
+from get_mech_info import DATETIME_60_DAYS_AGO

 IRRELEVANT_TOOLS = [
     "openai-text-davinci-002",
@@ -59,6 +60,7 @@ SCRIPTS_DIR = Path(__file__).parent
 ROOT_DIR = SCRIPTS_DIR.parent
 DATA_DIR = ROOT_DIR / "data"

+
 class MarketState(Enum):
     """Market state"""

@@ -343,7 +345,6 @@ def wei_to_unit(wei: int) -> float:
 def _is_redeemed(user_json: dict[str, Any], fpmmTrade: dict[str, Any]) -> bool:
     """Returns whether the user has redeemed the position."""
     user_positions = user_json["data"]["user"]["userPositions"]
-    outcomes_tokens_traded = int(fpmmTrade["outcomeTokensTraded"])
     condition_id = fpmmTrade["fpmm.condition.id"]

     for position in user_positions:
@@ -358,12 +359,12 @@ def _is_redeemed(user_json: dict[str, Any], fpmmTrade: dict[str, Any]) -> bool:
     return False


-def create_fpmmTrades(rpc: str):
+def create_fpmmTrades(rpc: str, from_timestamp: float = DEFAULT_FROM_TIMESTAMP):
     """Create fpmmTrades for all trades."""
     trades_json = _query_omen_xdai_subgraph(
-        from_timestamp=
+        from_timestamp=from_timestamp,
         to_timestamp=DEFAULT_TO_TIMESTAMP,
-        fpmm_from_timestamp=
+        fpmm_from_timestamp=from_timestamp,
         fpmm_to_timestamp=DEFAULT_TO_TIMESTAMP,
     )

@@ -384,18 +385,14 @@
     # change creator to creator_address
     df.rename(columns={"creator": "trader_address"}, inplace=True)

-    # save to csv
-    df.to_parquet(DATA_DIR / "fpmmTrades.parquet", index=False)
-
     return df


 def prepare_profitalibity_data(rpc: str):
     """Prepare data for profitalibity analysis."""

-    # Check if tools.
+    # Check if tools.parquet is in the same directory
     try:
-        # load tools.csv
         tools = pd.read_parquet(DATA_DIR / "tools.parquet")

         # make sure creator_address is in the columns
@@ -412,16 +409,18 @@
         print("tools.parquet not found. Please run tools.py first.")
         return

-    # Check if fpmmTrades.
+    # Check if fpmmTrades.parquet is in the same directory
     try:
-        # load fpmmTrades.csv
         fpmmTrades = pd.read_parquet(DATA_DIR / "fpmmTrades.parquet")
         print("fpmmTrades.parquet loaded")
     except FileNotFoundError:
         print("fpmmTrades.parquet not found. Creating fpmmTrades.parquet...")
+        # Prepare the same time window as used for the tools
+        timestamp_60_days_ago = (DATETIME_60_DAYS_AGO).timestamp()
+        fpmmTrades = create_fpmmTrades(rpc, from_timestamp=timestamp_60_days_ago)
         fpmmTrades.to_parquet(DATA_DIR / "fpmmTrades.parquet", index=False)
+        # This is not needed
+        # fpmmTrades = pd.read_parquet(DATA_DIR / "fpmmTrades.parquet")

     # make sure trader_address is in the columns
     assert "trader_address" in fpmmTrades.columns, "trader_address column not found"
@@ -468,7 +467,7 @@ def analyse_trader(
     # Iterate over the trades
     for i, trade in tqdm(trades.iterrows(), total=len(trades), desc="Analysing trades"):
         try:
-            if not trade[
+            if not trade["fpmm.currentAnswer"]:
                 print(f"Skipping trade {i} because currentAnswer is NaN")
                 continue
             # Parsing and computing shared values
@@ -535,7 +534,8 @@
                 "num_mech_calls": num_mech_calls,
                 "mech_fee_amount": num_mech_calls * DEFAULT_MECH_FEE,
                 "net_earnings": net_earnings,
-                "roi": net_earnings
+                "roi": net_earnings
+                / (collateral_amount + fee_amount + num_mech_calls * DEFAULT_MECH_FEE),
             }

         except Exception as e:
@@ -613,7 +613,7 @@ def run_profitability_analysis(rpc):
     # load dfs from csv for analysis
     print("Preparing data...")
     fpmmTrades, tools = prepare_profitalibity_data(rpc)
-    tools[
+    tools["trader_address"] = tools["trader_address"].str.lower()

     # all trades profitability df
     print("Analysing trades...")
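The completed roi field divides net earnings by the total capital put at risk on the trade: collateral, market fee, and mech fees. A toy calculation with made-up numbers, just to make the formula concrete (the fee constant and the way net_earnings is derived here are assumptions for illustration, not the repo's actual values):

DEFAULT_MECH_FEE = 0.01   # assumed value; the real constant lives in profitability.py

collateral_amount = 1.0   # xDAI staked on the trade (made up)
fee_amount = 0.02         # market fee (made up)
num_mech_calls = 2
earnings = 1.5            # gross payout (made up)

# For the example we take net earnings to be earnings minus all costs.
net_earnings = earnings - collateral_amount - fee_amount - num_mech_calls * DEFAULT_MECH_FEE
roi = net_earnings / (collateral_amount + fee_amount + num_mech_calls * DEFAULT_MECH_FEE)
print(round(net_earnings, 4), round(roi, 4))  # 0.46 0.4423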
scripts/pull_data.py
CHANGED
@@ -19,6 +19,7 @@ from tools import (
     DEFAULT_FILENAME as TOOLS_FILENAME,
 )
 from profitability import run_profitability_analysis
+
 import gc

 logging.basicConfig(level=logging.INFO)
@@ -27,6 +28,7 @@ SCRIPTS_DIR = Path(__file__).parent
 ROOT_DIR = SCRIPTS_DIR.parent
 DATA_DIR = ROOT_DIR / "data"

+
 def get_question(text: str) -> str:
     """Get the question from a text."""
     # Regex to find text within double quotes
@@ -43,24 +45,26 @@ def get_question(text: str) -> str:

 def current_answer(text: str, fpmms: pd.DataFrame) -> Optional[str]:
     """Get the current answer for a question."""
-    row = fpmms[fpmms[
+    row = fpmms[fpmms["title"] == text]
     if row.shape[0] == 0:
         return None
-    return row[
+    return row["currentAnswer"].values[0]


 def block_number_to_timestamp(block_number: int, web3: Web3) -> str:
     """Convert a block number to a timestamp."""
     block = web3.eth.get_block(block_number)
-    timestamp = datetime.utcfromtimestamp(block[
-    return timestamp.strftime(
+    timestamp = datetime.utcfromtimestamp(block["timestamp"])
+    return timestamp.strftime("%Y-%m-%d %H:%M:%S")


 def parallelize_timestamp_conversion(df: pd.DataFrame, function: callable) -> list:
     """Parallelize the timestamp conversion."""
-    block_numbers = df[
+    block_numbers = df["request_block"].tolist()
     with ThreadPoolExecutor(max_workers=10) as executor:
-        results = list(
+        results = list(
+            tqdm(executor.map(function, block_numbers), total=len(block_numbers))
+        )
     return results


@@ -76,10 +80,11 @@ def weekly_analysis():

     # Run tools ETL
     logging.info("Running tools ETL")
+
+    # This etl is saving already the tools parquet file
     tools_etl(
         rpcs=[rpc],
         filename=TOOLS_FILENAME,
-        full_contents=True,
     )
     logging.info("Tools ETL completed")

@@ -98,35 +103,48 @@ def weekly_analysis():

     # Get the question from the tools
     logging.info("Getting the question and current answer for the tools")
+    tools["title"] = tools["prompt_request"].apply(lambda x: get_question(x))
+    tools["currentAnswer"] = tools["title"].apply(lambda x: current_answer(x, fpmms))

+    tools["currentAnswer"] = tools["currentAnswer"].str.replace("yes", "Yes")
+    tools["currentAnswer"] = tools["currentAnswer"].str.replace("no", "No")

     # Convert block number to timestamp
     logging.info("Converting block number to timestamp")
     t_map = pickle.load(open(DATA_DIR / "t_map.pkl", "rb"))
+    tools["request_time"] = tools["request_block"].map(t_map)

     # Identify tools with missing request_time and fill them
+    missing_time_indices = tools[tools["request_time"].isna()].index
     if not missing_time_indices.empty:
+        partial_block_number_to_timestamp = partial(
+            block_number_to_timestamp, web3=web3
+        )
+        missing_timestamps = parallelize_timestamp_conversion(
+            tools.loc[missing_time_indices], partial_block_number_to_timestamp
+        )
+
         # Update the original DataFrame with the missing timestamps
         for i, timestamp in zip(missing_time_indices, missing_timestamps):
+            tools.at[i, "request_time"] = timestamp

+    tools["request_month_year"] = pd.to_datetime(tools["request_time"]).dt.strftime(
+        "%Y-%m"
+    )
+    tools["request_month_year_week"] = (
+        pd.to_datetime(tools["request_time"]).dt.to_period("W").astype(str)
+    )

-    # Save the tools
+    # Save the tools data after the updates on the content
     tools.to_parquet(DATA_DIR / TOOLS_FILENAME, index=False)

     # Update t_map with new timestamps
+    new_timestamps = (
+        tools[["request_block", "request_time"]]
+        .dropna()
+        .set_index("request_block")
+        .to_dict()["request_time"]
+    )
     t_map.update(new_timestamps)

     with open(DATA_DIR / "t_map.pkl", "wb") as f:
@@ -142,4 +160,3 @@ def weekly_analysis():

 if __name__ == "__main__":
     weekly_analysis()
-
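The block-to-timestamp cache (t_map.pkl) used above is a plain dict from request_block to request_time; after the missing values are resolved, the newly available pairs are folded back into the cache. A self-contained sketch of that caching pattern with toy data (column names follow the diff, the values are made up and the "RPC answer" is simulated):

import pandas as pd

# toy frame standing in for `tools`
tools = pd.DataFrame(
    {
        "request_block": [100, 101, 102],
        "request_time": ["2024-03-01 10:00:00", None, "2024-03-01 10:02:00"],
    }
)

t_map = {100: "2024-03-01 10:00:00"}  # existing cache loaded from t_map.pkl

# rows whose timestamp still needs resolving (these go through the RPC in pull_data.py)
missing = tools[tools["request_time"].isna()].index
tools.loc[missing, "request_time"] = "2024-03-01 10:01:00"  # pretend RPC answer

# fold resolved pairs back into the cache, like the new_timestamps block above
new_timestamps = (
    tools[["request_block", "request_time"]]
    .dropna()
    .set_index("request_block")
    .to_dict()["request_time"]
)
t_map.update(new_timestamps)
print(t_map)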
scripts/tools.py
CHANGED
@@ -20,22 +20,17 @@
 import json
 import os.path
 import re
-import sys
 import time
 import random
 from dataclasses import dataclass
 from enum import Enum
-from io import StringIO
 from typing import (
     Optional,
     List,
     Dict,
     Any,
     Union,
-    Callable,
-    Tuple,
 )
 import pandas as pd
 import requests
 from json.decoder import JSONDecodeError
@@ -56,8 +51,18 @@ from web3 import Web3, HTTPProvider
 from web3.exceptions import MismatchedABI
 from web3.types import BlockParams
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from
+from get_mech_info import get_mech_info_last_60_days
+from utils import (
+    clean,
+    BLOCK_FIELD,
+    gen_event_filename,
+    read_abi,
+    SLEEP,
+    reduce_window,
+    limit_text,
+    DATA_DIR,
+    REQUEST_ID_FIELD,
+)

 CONTRACTS_PATH = "contracts"
 MECH_TO_INFO = {
@@ -71,14 +76,11 @@ LATEST_BLOCK: Optional[int] = None
 LATEST_BLOCK_NAME: BlockParams = "latest"
 BLOCK_DATA_NUMBER = "number"
 BLOCKS_CHUNK_SIZE = 10_000
-REDUCE_FACTOR = 0.25
 EVENT_ARGUMENTS = "args"
 DATA = "data"
 REQUEST_ID = "requestId"
-REQUEST_ID_FIELD = "request_id"
 REQUEST_SENDER = "sender"
 PROMPT_FIELD = "prompt"
-BLOCK_FIELD = "block"
 CID_PREFIX = "f01701220"
 HTTP = "http://"
 HTTPS = HTTP[:4] + "s" + HTTP[4:]
@@ -89,7 +91,6 @@ STATUS_FORCELIST = [404, 500, 502, 503, 504]
 DEFAULT_FILENAME = "tools.parquet"
 RE_RPC_FILTER_ERROR = r"Filter with id: '\d+' does not exist."
 ABI_ERROR = "The event signature did not match the provided ABI"
-SLEEP = 0.5
 HTTP_TIMEOUT = 10
 N_IPFS_RETRIES = 1
 N_RPC_RETRIES = 100
@@ -109,13 +110,12 @@ IRRELEVANT_TOOLS = [
     "deepmind-optimization",
 ]
 # this is how frequently we will keep a snapshot of the progress so far in terms of blocks' batches
-# for example, the value 1 means that for every `BLOCKS_CHUNK_SIZE` blocks that we search,
+# for example, the value 1 means that for every `BLOCKS_CHUNK_SIZE` blocks that we search,
+# we also store the snapshot
 SNAPSHOT_RATE = 10
 NUM_WORKERS = 10
 GET_CONTENTS_BATCH_SIZE = 1000
-
-ROOT_DIR = SCRIPTS_DIR.parent
-DATA_DIR = ROOT_DIR / "data"
+

 class MechEventName(Enum):
     """The mech's event names."""
@@ -289,31 +289,6 @@ EVENT_TO_MECH_STRUCT = {
 }


-def parse_args() -> str:
-    """Parse the arguments and return the RPC."""
-    if len(sys.argv) != 2:
-        raise ValueError("Expected the RPC as a positional argument.")
-    return sys.argv[1]
-
-
-def read_abi(abi_path: str) -> str:
-    """Read and return the wxDAI contract's ABI."""
-    with open(abi_path) as abi_file:
-        return abi_file.read()
-
-
-def reduce_window(contract_instance, event, from_block, batch_size, latest_block):
-    """Dynamically reduce the batch size window."""
-    keep_fraction = 1 - REDUCE_FACTOR
-    events_filter = contract_instance.events[event].build_filter()
-    events_filter.fromBlock = from_block
-    batch_size = int(batch_size * keep_fraction)
-    events_filter.toBlock = min(from_block + batch_size, latest_block)
-    tqdm.write(f"RPC timed out! Resizing batch size to {batch_size}.")
-    time.sleep(SLEEP)
-    return events_filter, batch_size
-
-
 def get_events(
     w3: Web3,
     event: str,
@@ -442,13 +417,6 @@ def request(
     return None


-def limit_text(text: str, limit: int = 200) -> str:
-    """Limit the given text"""
-    if len(text) > limit:
-        return f"{text[:limit]}..."
-    return text
-
-
 def parse_ipfs_response(
     session: requests.Session,
     url: str,
@@ -523,38 +491,12 @@ def get_contents(
     return pd.DataFrame(contents)


-def check_for_dicts(df: pd.DataFrame) -> List[str]:
-    """Check for columns that contain dictionaries."""
-    dict_columns = []
-    for column in df.columns:
-        if df[column].apply(lambda x: isinstance(x, dict)).any():
-            dict_columns.append(column)
-    return dict_columns
-
-
-def drop_dict_rows(df: pd.DataFrame,
-                   dict_columns: List[str]) -> pd.DataFrame:
-    """Drop rows that contain dictionaries."""
-    for column in dict_columns:
-        df = df[~df[column].apply(lambda x: isinstance(x, dict))]
-    return df
-
-
-def clean(df: pd.DataFrame) -> pd.DataFrame:
-    """Clean the dataframe."""
-    dict_columns = check_for_dicts(df)
-    df = drop_dict_rows(df, dict_columns)
-    cleaned = df.drop_duplicates()
-    cleaned[REQUEST_ID_FIELD] = cleaned[REQUEST_ID_FIELD].astype("str")
-    return cleaned
-
-
 def transform_request(contents: pd.DataFrame) -> pd.DataFrame:
     """Transform the requests dataframe."""
     return clean(contents)


-def transform_deliver(contents: pd.DataFrame, full_contents=False) -> pd.DataFrame:
+def transform_deliver(contents: pd.DataFrame) -> pd.DataFrame:
     """Transform the delivers dataframe."""
     unpacked_result = pd.json_normalize(contents.result)
     # # drop result column if it exists
@@ -578,55 +520,27 @@ def transform_deliver(contents: pd.DataFrame, full_contents=False) -> pd.DataFrame:
     return clean(contents)


-def gen_event_filename(event_name: MechEventName) -> str:
-    """Generate the filename of an event."""
-    return f"{event_name.value.lower()}s.parquet"
-
-
-def read_n_last_lines(filename: str, n: int = 1) -> str:
-    """Return the `n` last lines' content of a file."""
-    num_newlines = 0
-    with open(filename, "rb") as f:
-        try:
-            f.seek(-2, os.SEEK_END)
-            while num_newlines < n:
-                f.seek(-2, os.SEEK_CUR)
-                if f.read(1) == b"\n":
-                    num_newlines += 1
-        except OSError:
-            f.seek(0)
-        last_line = f.readline().decode()
-    return last_line
-
-
-def get_earliest_block(event_name: MechEventName) -> int:
-    """Get the earliest block number to use when filtering for events."""
-    filename = gen_event_filename(event_name)
-    if not os.path.exists(DATA_DIR / filename):
-        return 0
-
-    df = pd.read_parquet(DATA_DIR / filename)
-    block_field = f"{event_name.value.lower()}_{BLOCK_FIELD}"
-    return int(df[block_field].max())
-
-
 def store_progress(
     filename: str,
     event_to_contents: Dict[str, pd.DataFrame],
     tools: pd.DataFrame,
 ) -> None:
     """Store the given progress."""
-    print("
+    print("storing given progress")
     if filename:
         DATA_DIR.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists
         for event_name, content in event_to_contents.items():
-            event_filename = gen_event_filename(
+            event_filename = gen_event_filename(
+                event_name
+            )  # Ensure this function returns a valid filename string
             try:
                 if "result" in content.columns:
-                    content = content.drop(
+                    content = content.drop(
+                        columns=["result"]
+                    )  # Avoid in-place modification
                 content.to_parquet(DATA_DIR / event_filename, index=False)
             except Exception as e:
-                print(f"Failed to write {event_name}: {e}")
+                print(f"Failed to write {event_name} data: {e}")
     # Drop result and error columns for tools DataFrame
     try:
         if "result" in tools.columns:
@@ -637,7 +551,8 @@ def store_progress(


 def etl(
-    rpcs: List[str],
+    rpcs: List[str],
+    filename: Optional[str] = None,
 ) -> pd.DataFrame:
     """Fetch from on-chain events, process, store and return the tools' results on all the questions as a Dataframe."""
     w3s = [Web3(HTTPProvider(r)) for r in rpcs]
@@ -646,13 +561,15 @@ def etl(
         MechEventName.REQUEST: transform_request,
         MechEventName.DELIVER: transform_deliver,
     }
+
     mech_to_info = {
         to_checksum_address(address): (
             os.path.join(CONTRACTS_PATH, filename),
             earliest_block,
         )
-        for address, (filename, earliest_block) in
+        for address, (filename, earliest_block) in get_mech_info_last_60_days().items()
     }
+
     event_to_contents = {}

     latest_block = LATEST_BLOCK
@@ -663,17 +580,13 @@ def etl(

     # Loop through events in event_to_transformer
     for event_name, transformer in event_to_transformer.items():
-        if next_start_block is None:
+        # if next_start_block is None:
+        #     next_start_block_base = get_earliest_block(event_name)

         # Loop through mech addresses in mech_to_info
         events = []
         for address, (abi, earliest_block) in mech_to_info.items():
-                next_start_block = earliest_block
-            else:
-                next_start_block = next_start_block_base
-
+            next_start_block = earliest_block
             print(
                 f"Searching for {event_name.value} events for mech {address} from block {next_start_block} to {latest_block}."
             )
@@ -704,6 +617,7 @@ def etl(
             current_mech_events = future.result()
             events.extend(current_mech_events)

+        print("Parsing events")
         parsed = parse_events(events)

         contents = []
@@ -729,31 +643,28 @@ def etl(

         contents = pd.concat(contents, ignore_index=True)

-        if event_name == MechEventName.REQUEST:
-            transformed = transformer(contents)
-        elif event_name == MechEventName.DELIVER:
-            transformed = transformer(contents, full_contents=full_contents)
-
-        events_filename = gen_event_filename(event_name)
+        transformed = transformer(contents)

+        # Remove appending data, always new files
+        # if os.path.exists(DATA_DIR / events_filename):
+        #     old = pd.read_parquet(DATA_DIR / events_filename)

+        #     # Reset index to avoid index conflicts
+        #     old.reset_index(drop=True, inplace=True)
+        #     transformed.reset_index(drop=True, inplace=True)

+        #     # Concatenate DataFrames
+        #     transformed = pd.concat([old, transformed], ignore_index=True)

+        #     # Drop duplicates if necessary
+        #     transformed.drop_duplicates(subset=REQUEST_ID_FIELD, inplace=True)

         event_to_contents[event_name] = transformed.copy()

     # Store progress
     tools = pd.merge(*event_to_contents.values(), on=REQUEST_ID_FIELD)
+    print(tools.info())
+
     store_progress(filename, event_to_contents, tools)

     return tools
@@ -763,5 +674,6 @@ if __name__ == "__main__":
     RPCs = [
         "https://lb.nodies.app/v1/406d8dcc043f4cb3959ed7d6673d311a",
     ]
+    filename = DEFAULT_FILENAME

-    tools = etl(rpcs=RPCs, filename=
+    tools = etl(rpcs=RPCs, filename=filename)
scripts/utils.py
ADDED
@@ -0,0 +1,110 @@
import sys
import os
import time
from tqdm import tqdm
from tools import MechEventName
from typing import List
import pandas as pd
import gc
from pathlib import Path

REDUCE_FACTOR = 0.25
SLEEP = 0.5
REQUEST_ID_FIELD = "request_id"
SCRIPTS_DIR = Path(__file__).parent
ROOT_DIR = SCRIPTS_DIR.parent
DATA_DIR = ROOT_DIR / "data"
BLOCK_FIELD = "block"


def parse_args() -> str:
    """Parse the arguments and return the RPC."""
    if len(sys.argv) != 2:
        raise ValueError("Expected the RPC as a positional argument.")
    return sys.argv[1]


def read_abi(abi_path: str) -> str:
    """Read and return the wxDAI contract's ABI."""
    with open(abi_path) as abi_file:
        return abi_file.read()


def reduce_window(contract_instance, event, from_block, batch_size, latest_block):
    """Dynamically reduce the batch size window."""
    keep_fraction = 1 - REDUCE_FACTOR
    events_filter = contract_instance.events[event].build_filter()
    events_filter.fromBlock = from_block
    batch_size = int(batch_size * keep_fraction)
    events_filter.toBlock = min(from_block + batch_size, latest_block)
    tqdm.write(f"RPC timed out! Resizing batch size to {batch_size}.")
    time.sleep(SLEEP)
    return events_filter, batch_size


def limit_text(text: str, limit: int = 200) -> str:
    """Limit the given text"""
    if len(text) > limit:
        return f"{text[:limit]}..."
    return text


def check_for_dicts(df: pd.DataFrame) -> List[str]:
    """Check for columns that contain dictionaries."""
    dict_columns = []
    for column in df.columns:
        if df[column].apply(lambda x: isinstance(x, dict)).any():
            dict_columns.append(column)
    return dict_columns


def drop_dict_rows(df: pd.DataFrame, dict_columns: List[str]) -> pd.DataFrame:
    """Drop rows that contain dictionaries."""
    for column in dict_columns:
        df = df[~df[column].apply(lambda x: isinstance(x, dict))]
    return df


def clean(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the dataframe."""
    dict_columns = check_for_dicts(df)
    df = drop_dict_rows(df, dict_columns)
    cleaned = df.drop_duplicates()
    cleaned[REQUEST_ID_FIELD] = cleaned[REQUEST_ID_FIELD].astype("str")
    return cleaned


def gen_event_filename(event_name: MechEventName) -> str:
    """Generate the filename of an event."""
    return f"{event_name.value.lower()}s.parquet"


def read_n_last_lines(filename: str, n: int = 1) -> str:
    """Return the `n` last lines' content of a file."""
    num_newlines = 0
    with open(filename, "rb") as f:
        try:
            f.seek(-2, os.SEEK_END)
            while num_newlines < n:
                f.seek(-2, os.SEEK_CUR)
                if f.read(1) == b"\n":
                    num_newlines += 1
        except OSError:
            f.seek(0)
        last_line = f.readline().decode()
    return last_line


def get_earliest_block(event_name: MechEventName) -> int:
    """Get the earliest block number to use when filtering for events."""
    filename = gen_event_filename(event_name)
    if not os.path.exists(DATA_DIR / filename):
        return 0

    df = pd.read_parquet(DATA_DIR / filename)
    block_field = f"{event_name.value.lower()}_{BLOCK_FIELD}"
    earliest_block = int(df[block_field].max())
    # clean and release all memory
    del df
    gc.collect()
    return earliest_block
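To see what clean() does without pulling in the repo (utils.py imports MechEventName from tools.py), here is a standalone reproduction of the same three steps on a toy frame: drop rows holding dicts, drop duplicates, and cast the request id column to string. The column values are made up; only the logic mirrors check_for_dicts, drop_dict_rows, and clean above.

import pandas as pd

REQUEST_ID_FIELD = "request_id"  # same constant as in utils.py

df = pd.DataFrame(
    {
        REQUEST_ID_FIELD: [1, 2, 2, 3],
        "result": [{"bad": "dict"}, "ok", "ok", "ok"],
    }
)

# columns that contain at least one dict (check_for_dicts)
dict_columns = [c for c in df.columns if df[c].apply(lambda x: isinstance(x, dict)).any()]

# drop the offending rows (drop_dict_rows)
for column in dict_columns:
    df = df[~df[column].apply(lambda x: isinstance(x, dict))]

# deduplicate and cast the request id to string (clean)
cleaned = df.drop_duplicates()
cleaned[REQUEST_ID_FIELD] = cleaned[REQUEST_ID_FIELD].astype("str")
print(cleaned)  # two rows remain: request_id "2" and "3"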