{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Backfill missing `request_time` values in `tools.csv`\n",
    "\n",
    "Rows of `tools.csv` without a `request_time` get one by looking up their `request_block` over RPC. The derived month/week columns are then recomputed, `tools.csv` is written back, and the `request_block -> request_time` map stored in `t_map.pkl` is refreshed.\n",
    "\n",
    "**Setup:** export the RPC endpoint URL in the `RPC` environment variable before running. The URL carries an API key, so it must not be committed with the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from datetime import datetime, timezone\n",
    "from functools import partial\n",
    "from pathlib import Path\n",
    "from typing import Callable\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from web3 import Web3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locations shared by every cell below.\n",
    "DATA_DIR = Path('../data')\n",
    "T_MAP_PATH = DATA_DIR / 't_map.pkl'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Make t_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tools = pd.read_csv(DATA_DIR / 'tools.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tools.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows without a request_time are dropped so that NaN never ends up in the map;\n",
    "# they are filled in further down and merged into t_map in the last cell.\n",
    "t_map = (\n",
    "    tools[['request_block', 'request_time']]\n",
    "    .dropna()\n",
    "    .set_index('request_block')['request_time']\n",
    "    .to_dict()\n",
    ")\n",
    "\n",
    "with open(T_MAP_PATH, 'wb') as f:\n",
    "    pickle.dump(t_map, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# t_map.pkl is produced by this notebook; never unpickle a file from an untrusted source.\n",
    "with open(T_MAP_PATH, 'rb') as f:\n",
    "    t_map = pickle.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Markets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'currentAnswer', 'title'], dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fpmms = pd.read_csv(DATA_DIR / 'fpmms.csv')\n",
    "fpmms.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(263613, 12)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# low_memory=False reads the file in one pass, which avoids the mixed-type DtypeWarning.\n",
    "delivers = pd.read_csv(DATA_DIR / 'delivers.csv', low_memory=False)\n",
    "delivers.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(245092, 6)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "requests = pd.read_csv(DATA_DIR / 'requests.csv')\n",
    "requests.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['request_id', 'request_block', 'prompt_request', 'tool', 'nonce',\n",
       " 'trader_address', 'deliver_block', 'error', 'error_message',\n",
       " 'prompt_response', 'mech_address', 'p_yes', 'p_no', 'confidence',\n",
       " 'info_utility', 'vote', 'win_probability', 'title', 'currentAnswer',\n",
       " 'request_time', 'request_month_year', 'request_month_year_week'],\n",
       " dtype='object')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# low_memory=False reads the file in one pass, which avoids the mixed-type DtypeWarning.\n",
    "tools = pd.read_csv(DATA_DIR / 'tools.csv', low_memory=False)\n",
    "tools.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fill missing request timestamps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "841"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tools['request_time'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def block_number_to_timestamp(block_number: int, web3: Web3) -> str:\n",
    "    \"\"\"Convert a block number to a UTC timestamp string.\n",
    "\n",
    "    Args:\n",
    "        block_number: Number of the block to look up.\n",
    "        web3: Web3 client used to fetch the block.\n",
    "\n",
    "    Returns:\n",
    "        The block's timestamp formatted as '%Y-%m-%d %H:%M:%S', in UTC.\n",
    "    \"\"\"\n",
    "    block = web3.eth.get_block(block_number)\n",
    "    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware UTC\n",
    "    # datetime formats to exactly the same string.\n",
    "    timestamp = datetime.fromtimestamp(block['timestamp'], tz=timezone.utc)\n",
    "    return timestamp.strftime('%Y-%m-%d %H:%M:%S')\n",
    "\n",
    "\n",
    "def parallelize_timestamp_conversion(\n",
    "    df: pd.DataFrame,\n",
    "    function: Callable[[int], str],\n",
    "    max_workers: int = 10,\n",
    ") -> list:\n",
    "    \"\"\"Apply `function` to every value of df['request_block'] using a thread pool.\n",
    "\n",
    "    Args:\n",
    "        df: Frame holding a 'request_block' column.\n",
    "        function: Callable invoked once per block number.\n",
    "        max_workers: Size of the thread pool.\n",
    "\n",
    "    Returns:\n",
    "        The results as a list, in the same order as the rows of `df`.\n",
    "    \"\"\"\n",
    "    block_numbers = df['request_block'].tolist()\n",
    "    with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
    "        # executor.map yields results in input order, so tqdm only tracks progress.\n",
    "        results = list(tqdm(executor.map(function, block_numbers), total=len(block_numbers)))\n",
    "    return results\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The endpoint URL carries an API key, so it is read from the environment rather\n",
    "# than being hard-coded in the notebook.\n",
    "rpc = os.environ.get('RPC')\n",
    "if not rpc:\n",
    "    raise RuntimeError('Set the RPC environment variable to the RPC endpoint URL before running this notebook.')\n",
    "web3 = Web3(Web3.HTTPProvider(rpc))\n",
    "\n",
    "partial_block_number_to_timestamp = partial(block_number_to_timestamp, web3=web3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 841/841 [00:25<00:00, 33.18it/s]\n"
     ]
    }
   ],
   "source": [
    "missing_time_indices = tools[tools['request_time'].isna()].index\n",
    "if not missing_time_indices.empty:\n",
    "    missing_timestamps = parallelize_timestamp_conversion(\n",
    "        tools.loc[missing_time_indices], partial_block_number_to_timestamp\n",
    "    )\n",
    "\n",
    "    # The results come back in row order, so they can be written in one assignment.\n",
    "    tools.loc[missing_time_indices, 'request_time'] = missing_timestamps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tools['request_time'].isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse once and derive both period columns from the same parsed series.\n",
    "request_time = pd.to_datetime(tools['request_time'])\n",
    "tools['request_month_year'] = request_time.dt.strftime('%Y-%m')\n",
    "tools['request_month_year_week'] = request_time.dt.to_period('W').astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tools['request_month_year_week'].isna().sum()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save `tools.csv` and refresh t_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites the input file with the completed timestamps.\n",
    "tools.to_csv(DATA_DIR / 'tools.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# t_map.pkl is produced by this notebook; never unpickle a file from an untrusted source.\n",
    "with open(T_MAP_PATH, 'rb') as f:\n",
    "    t_map = pickle.load(f)\n",
    "\n",
    "new_timestamps = (\n",
    "    tools[['request_block', 'request_time']]\n",
    "    .dropna()\n",
    "    .set_index('request_block')['request_time']\n",
    "    .to_dict()\n",
    ")\n",
    "t_map.update(new_timestamps)\n",
    "\n",
    "with open(T_MAP_PATH, 'wb') as f:\n",
    "    pickle.dump(t_map, f)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "autogen",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}