{ "cells": [ { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import csv\n", "import requests\n", "hash = \"QmR8etyW3TPFadNtNrW54vfnFqmh8vBrMARWV76EmxCZyk\"\n", "ipfs_address = \"https://gateway.autonolas.tech/ipfs/\"\n", "\n", "accuracy_link= ipfs_address + hash\n", "response = requests.get(accuracy_link)\n", "print(response)\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "headers = ['tool', 'tool_accuracy', 'total_requests', 'min', 'max']\n" ] } ], "source": [ "from io import StringIO\n", "accuracy_store = {}\n", "data = StringIO(response.text)\n", "csv_reader = csv.reader(data, delimiter=',')\n", "for row in csv_reader:\n", " if row[0] == \"tool\":\n", " print(f\"headers = {row}\")\n", " continue\n", " accuracy_store[row[0]] = [\n", " row[2],\n", " row[1],\n", " ]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'claude-prediction-offline': ['481', '57.380457380457386'], 'claude-prediction-online': ['1055', '61.137440758293835'], 'prediction-offline': ['4465', '67.41321388577828'], 'prediction-offline-sme': ['61', '70.49180327868852'], 'prediction-online': ['9490', '66.00632244467862'], 'prediction-online-sme': ['14642', '65.67408823931157'], 'prediction-request-rag': ['2691', '63.58231140839836'], 'prediction-request-rag-claude': ['7428', '65.64351103931072'], 'prediction-request-reasoning': ['17372', '67.11374625834677'], 'prediction-request-reasoning-claude': ['2470', '66.72064777327935'], 'prediction-url-cot-claude': ['1596', '61.904761904761905']}\n" ] } ], "source": [ "print(accuracy_store)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "fpmms = pd.read_parquet('../data/fpmms.parquet')\n", "tools = pd.read_parquet('../data/tools.parquet')\n", "trades = pd.read_parquet('../data/all_trades_profitability.parquet')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "INC_TOOLS = [\n", " \"prediction-online\",\n", " \"prediction-offline\",\n", " \"claude-prediction-online\",\n", " \"claude-prediction-offline\",\n", " \"prediction-offline-sme\",\n", " \"prediction-online-sme\",\n", " \"prediction-request-rag\",\n", " \"prediction-request-reasoning\",\n", " \"prediction-url-cot-claude\",\n", " \"prediction-request-rag-claude\",\n", " \"prediction-request-reasoning-claude\",\n", "]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wintooltool_accuracytotal_requests
0claude-prediction-offline66.308244279
1claude-prediction-online58.9140271105
2prediction-offline67.7179152283
3prediction-offline-sme55.55555618
4prediction-online65.4590665631
5prediction-online-sme67.4176568167
6prediction-request-rag64.2170721769
7prediction-request-rag-claude69.5545664490
8prediction-request-reasoning68.8135949828
9prediction-request-reasoning-claude68.9102562184
10prediction-url-cot-claude64.5849801265
\n", "
" ], "text/plain": [ "win tool tool_accuracy total_requests\n", "0 claude-prediction-offline 66.308244 279\n", "1 claude-prediction-online 58.914027 1105\n", "2 prediction-offline 67.717915 2283\n", "3 prediction-offline-sme 55.555556 18\n", "4 prediction-online 65.459066 5631\n", "5 prediction-online-sme 67.417656 8167\n", "6 prediction-request-rag 64.217072 1769\n", "7 prediction-request-rag-claude 69.554566 4490\n", "8 prediction-request-reasoning 68.813594 9828\n", "9 prediction-request-reasoning-claude 68.910256 2184\n", "10 prediction-url-cot-claude 64.584980 1265" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tools_inc = tools[tools['tool'].isin(INC_TOOLS)]\n", "# filtering errors\n", "tools_non_error = tools_inc[tools_inc['error'] != 1]\n", "tools_non_error.loc[:, 'currentAnswer'] = tools_non_error['currentAnswer'].replace({'no': 'No', 'yes': 'Yes'})\n", "tools_non_error = tools_non_error[tools_non_error['currentAnswer'].isin(['Yes', 'No'])]\n", "tools_non_error = tools_non_error[tools_non_error['vote'].isin(['Yes', 'No'])]\n", "tools_non_error['win'] = (tools_non_error['currentAnswer'] == tools_non_error['vote']).astype(int)\n", "tools_non_error.columns = tools_non_error.columns.astype(str)\n", "wins = tools_non_error.groupby(['tool', 'win']).size().unstack().fillna(0)\n", "wins['tool_accuracy'] = (wins[1] / (wins[0] + wins[1])) * 100\n", "wins.reset_index(inplace=True)\n", "wins['total_requests'] = wins[0] + wins[1]\n", "wins.columns = wins.columns.astype(str)\n", "wins = wins[[\"tool\", \"tool_accuracy\", \"total_requests\"]]\n", "wins" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
minmax
tool
claude-prediction-offline2024-04-23 13:09:302024-06-10 00:31:30
claude-prediction-online2024-04-12 12:24:202024-06-09 21:41:20
prediction-offline2024-04-12 12:20:102024-06-08 23:45:00
prediction-offline-sme2024-04-16 07:58:452024-04-29 20:45:15
prediction-online2024-04-16 05:52:402024-06-09 21:47:20
prediction-online-sme2024-04-12 11:51:302024-06-10 00:06:00
prediction-request-rag2024-04-12 11:39:402024-06-09 21:17:45
prediction-request-rag-claude2024-04-12 11:14:302024-06-07 11:42:30
prediction-request-reasoning2024-04-12 11:57:052024-06-09 21:50:45
prediction-request-reasoning-claude2024-04-12 11:53:552024-06-05 05:00:10
prediction-url-cot-claude2024-04-12 11:37:152024-06-05 05:21:10
\n", "
" ], "text/plain": [ " min max\n", "tool \n", "claude-prediction-offline 2024-04-23 13:09:30 2024-06-10 00:31:30\n", "claude-prediction-online 2024-04-12 12:24:20 2024-06-09 21:41:20\n", "prediction-offline 2024-04-12 12:20:10 2024-06-08 23:45:00\n", "prediction-offline-sme 2024-04-16 07:58:45 2024-04-29 20:45:15\n", "prediction-online 2024-04-16 05:52:40 2024-06-09 21:47:20\n", "prediction-online-sme 2024-04-12 11:51:30 2024-06-10 00:06:00\n", "prediction-request-rag 2024-04-12 11:39:40 2024-06-09 21:17:45\n", "prediction-request-rag-claude 2024-04-12 11:14:30 2024-06-07 11:42:30\n", "prediction-request-reasoning 2024-04-12 11:57:05 2024-06-09 21:50:45\n", "prediction-request-reasoning-claude 2024-04-12 11:53:55 2024-06-05 05:00:10\n", "prediction-url-cot-claude 2024-04-12 11:37:15 2024-06-05 05:21:10" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tools_inc = tools[tools['tool'].isin(INC_TOOLS)]\n", "# filtering errors\n", "tools_non_error = tools_inc[tools_inc['error'] != 1]\n", "tools_non_error.loc[:, 'currentAnswer'] = tools_non_error['currentAnswer'].replace({'no': 'No', 'yes': 'Yes'})\n", "tools_non_error = tools_non_error[tools_non_error['currentAnswer'].isin(['Yes', 'No'])]\n", "tools_non_error = tools_non_error[tools_non_error['vote'].isin(['Yes', 'No'])]\n", "tools_non_error['win'] = (tools_non_error['currentAnswer'] == tools_non_error['vote']).astype(int)\n", "tools_non_error.columns = tools_non_error.columns.astype(str)\n", "timeline = tools_non_error.groupby(['tool'])[\"request_time\"].agg([\"min\",\"max\"])\n", "timeline" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tooltool_accuracytotal_requestsminmax
0claude-prediction-offline66.3082442792024-04-23 13:09:302024-06-10 00:31:30
1claude-prediction-online58.91402711052024-04-12 12:24:202024-06-09 21:41:20
2prediction-offline67.71791522832024-04-12 12:20:102024-06-08 23:45:00
3prediction-offline-sme55.555556182024-04-16 07:58:452024-04-29 20:45:15
4prediction-online65.45906656312024-04-16 05:52:402024-06-09 21:47:20
5prediction-online-sme67.41765681672024-04-12 11:51:302024-06-10 00:06:00
6prediction-request-rag64.21707217692024-04-12 11:39:402024-06-09 21:17:45
7prediction-request-rag-claude69.55456644902024-04-12 11:14:302024-06-07 11:42:30
8prediction-request-reasoning68.81359498282024-04-12 11:57:052024-06-09 21:50:45
9prediction-request-reasoning-claude68.91025621842024-04-12 11:53:552024-06-05 05:00:10
10prediction-url-cot-claude64.58498012652024-04-12 11:37:152024-06-05 05:21:10
\n", "
" ], "text/plain": [ " tool tool_accuracy total_requests \\\n", "0 claude-prediction-offline 66.308244 279 \n", "1 claude-prediction-online 58.914027 1105 \n", "2 prediction-offline 67.717915 2283 \n", "3 prediction-offline-sme 55.555556 18 \n", "4 prediction-online 65.459066 5631 \n", "5 prediction-online-sme 67.417656 8167 \n", "6 prediction-request-rag 64.217072 1769 \n", "7 prediction-request-rag-claude 69.554566 4490 \n", "8 prediction-request-reasoning 68.813594 9828 \n", "9 prediction-request-reasoning-claude 68.910256 2184 \n", "10 prediction-url-cot-claude 64.584980 1265 \n", "\n", " min max \n", "0 2024-04-23 13:09:30 2024-06-10 00:31:30 \n", "1 2024-04-12 12:24:20 2024-06-09 21:41:20 \n", "2 2024-04-12 12:20:10 2024-06-08 23:45:00 \n", "3 2024-04-16 07:58:45 2024-04-29 20:45:15 \n", "4 2024-04-16 05:52:40 2024-06-09 21:47:20 \n", "5 2024-04-12 11:51:30 2024-06-10 00:06:00 \n", "6 2024-04-12 11:39:40 2024-06-09 21:17:45 \n", "7 2024-04-12 11:14:30 2024-06-07 11:42:30 \n", "8 2024-04-12 11:57:05 2024-06-09 21:50:45 \n", "9 2024-04-12 11:53:55 2024-06-05 05:00:10 \n", "10 2024-04-12 11:37:15 2024-06-05 05:21:10 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "total = wins.merge(timeline,how=\"left\", on=\"tool\")\n", "total" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "total.to_csv(\"accuracy_info.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def extract_question(text):\n", " pattern = r'\"([^\"]+\\?)\"'\n", " match = re.search(pattern, text)\n", " if match:\n", " return match.group(1)\n", " return text" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_current_answer(q):\n", " return trades[trades['title'] == q]['current_answer'].unique()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# only select trades in May 2024\n", "trades['creation_timestamp'] = pd.to_datetime(trades['creation_timestamp'])\n", "trades = trades[trades['creation_timestamp'].dt.month == 5]\n", "trades = trades[trades['creation_timestamp'].dt.year == 2024]\n", "\n", "# make a column for winning_vote\n", "tools['winning_vote'] = (tools['vote'] == tools['currentAnswer'])\n", "tools = tools[tools['tool']!= 'resolve-market-reasoning-gpt-4'].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "tools['prompt_request'] = tools['prompt_request'].apply(extract_question)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "trades_grouped = trades.groupby(['title', 'winning_trade']).size().unstack().fillna(0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "winning_trades_percentage = trades_grouped[True] / trades_grouped.sum(axis=1)\n", "winning_trades_percentage = winning_trades_percentage.reset_index()\n", "winning_trades_percentage.columns = ['title', 'winning_trade_percentage']\n", "winning_trades_percentage['num_trades'] = list(trades_grouped.sum(axis=1).values)\n", "winning_trades_percentage_bottom_50 = winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False)[-50:].reset_index(drop=True)\n", "winning_trades_percentage_top_50 = winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False)[:50].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# winning_trades_percentage.sort_values(by='winning_trade_percentage', ascending=False).reset_index(drop=True).to_csv('winning_trades_percentage.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Will Kylian Mbappe leave Paris St-Germain at the end of the season by 16 May 2024?',\n", " 'Will BlizzCon be reinstated on or by 1 May 2024 after its cancellation in 2024?',\n", " 'Will Joe Biden approve more weapons for Ukraine by 4 May 2024?',\n", " \"Will FiiO's new custom in-ear monitors become the top-selling wireless earbuds by 9 May 2024?\",\n", " 'Will Mohamed Salah leave Liverpool on 7 May 2024?',\n", " \"Will Ryan Gosling accept a 'dark' role in a film by 14 May 2024?\",\n", " 'Will the Philadelphia 76ers win the NBA play-offs on 7 May 2024?',\n", " 'Will the Panamanian presidential election result in a clear victor by 12 May 2024?',\n", " 'Will the Museum of Old and New Art in Tasmania be allowed to keep its exhibit women-only by 14 May 2024?',\n", " \"Will Diego Maradona's 'Stolen' Golden Ball be auctioned off on 14 May 2024?\",\n", " 'Will the Mercedes G-Wagen release an electric version on 1 May 2024?',\n", " 'Will the Israeli government lift the broadcast ban on Al Jazeera on or before 13 May 2024?',\n", " 'Will Intel release its Core Ultra 200 Arrow Lake CPUs by 16 May 2024?',\n", " 'Will the Atlanta City Council pay $3.8 million to settle a lawsuit by the family of a church deacon who died in a struggle with a city police officer by 13 May 2024?',\n", " 'Will Voyager-1 continue to send readable data until 1 May 2024?',\n", " 'Will the Amber Alert issued in New Mexico result in the discovery of the missing 10-month-old baby by 13 May 2024?',\n", " \"Will Florida's ban on lab-grown meat be overturned by 12 May 2024?\",\n", " \"Will the US government successfully distribute the $138.7 million payout to Larry Nassar's victims by 1 May 2024?\",\n", " 'Will a new sport be officially added to the Olympics programme on 16 May 2024?',\n", " \"Will Kristi Noem be announced as Donald Trump's vice presidential running mate by 6 May 2024?\",\n", " 'Will the United Auto Workers union strike against Daimler Truck on or by 7 May 2024?',\n", " 'Will the World Snooker Championship 2024 conclude with Judd Trump or Tom Ford as the winner by May 5, 2024?',\n", " \"Will Maria Georgas be announced as the next 'Bachelorette' lead on 9 May 2024?\",\n", " 'Will Apple release new iPads at their event on May 7, 2024?',\n", " 'Will Joe Biden still be the President of the United States on 11 May 2024?',\n", " \"Will the world's biggest 3D printer be used to make parts of houses by 2 May 2024?\",\n", " \"Will Anthony Edwards be named NBA's MVP on 11 May 2024?\",\n", " 'Will a winner be declared in the Eurovision 2024 grand final by 19 May 2024?',\n", " \"Will a new mission be launched to explore the moon's 'hidden side' by 12 May 2024?\",\n", " 'Will Mike Tyson win his bout against Jake Paul on 7 May 2024?',\n", " 'Will the bird flu outbreak be declared a global pandemic by 12 May 2024?',\n", " 'Will the new Apple Pencil Pro be revealed by 15 May 2024?',\n", " \"Will the amateur angler who landed UK's 'biggest fish' in Essex catch another record-breaking fish by 7 May 2024?\",\n", " \"Will Saul 'Canelo' Alvarez successfully defend his WBA, WBC, WBO, and IBF titles again by 13 May 2024?\",\n", " \"Will Taylor Swift's 'The Tortured Poets Department' album reach number 1 on Billboard 200 on 3 May 2024?\",\n", " 'Will Joe Biden attend the White House Correspondents Dinner on 5 May 2024?',\n", " 'Will King Charles perform public duties on 5 May 2024, after his progress in cancer treatment?',\n", " \"Will LinkedIn's new puzzle games Pinpoint, Queens, and Crossclimb be successful on their platform by 9 May 2024?\",\n", " 'Will South Dakota Governor Kristi Noem resign over the puppy killing controversy by 15 May 2024?',\n", " 'Will Apple announce the release of a new M4 chip by 13 May 2024?',\n", " 'Will Eric Adams still be the mayor of New York City on 10 May 2024?',\n", " \"Will the livestream video 'portals' connecting New York City and Dublin still be operational on 19 May 2024?\",\n", " 'Will there be more pro-Palestinian protests on US university campuses on 6 May 2024?',\n", " 'Will Google Pixel 8a be released at Google I/O 2024 on 14 May?',\n", " 'Will Apple announce more than just a spec bump at the May 2024 iPad event?',\n", " \"Will Apple's new Magic Keyboard for the iPad Pro M4 be released by 15 May 2024?\",\n", " 'Will the UEFA Champions League final be between PSG and Borussia Dortmund on 13 May 2024?',\n", " 'Will the FBI report an increase in scams targeting Americans older than 60 in 2024?',\n", " 'Will Erik ten Hag remain as Manchester United manager on 17 May 2024?',\n", " 'Will Jofra Archer be a part of the England squad for T20 World Cup in June 2024?']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "winning_trades_percentage_top_50['title'].tolist()\n", "\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?\",\n", " 'Will Fiona Harvey officially file a lawsuit against Netflix and Richard Gadd by 17 May 2024?',\n", " 'Will the final report on the Baltimore bridge collapse be released by 20 May 2024?',\n", " 'Will the Autonomous Racing League successfully hold their second race by May 3, 2024?',\n", " 'Will Trent Staggs win the Senatorial race to replace Sen. Mitt Romney (R-UT) on 5 May 2024?',\n", " 'Will the Houston area experience flooding conditions on 11 May 2024?',\n", " \"Will 'Wednesday' season 2 be released on Netflix by 1 May 2024?\",\n", " 'Will Arsenal win against Bournemouth in the Premier League match on 12 May 2024?',\n", " 'Will Qualcomm release its Snapdragon X Plus laptop chip by 1 May 2024?',\n", " \"Will Feyenoord's Arne Slot become the new manager of Liverpool by 1 May 2024?\",\n", " 'Will the FCC receive additional funding for replacing Huawei gear by 10 May 2024?',\n", " 'Will there be any major cyber attack on an organization using AI before 2 May 2024?',\n", " 'Will Sony complete the takeover of Paramount by 11 May 2024?',\n", " \"Will 'Hell's Kitchen' win the Tony Awards for Best Musical on 7 May 2024?\",\n", " 'Will Tesla announce reinstating any laid off supercharger workers by 11 May 2024?',\n", " 'Will there be another tornado in Nebraska and Iowa on 6 May 2024?',\n", " 'Will the DJI drones be officially banned in the United States by 4 May 2024?',\n", " 'Will OpenAI debut a multimodal AI digital assistant by 19 May 2024?',\n", " 'Will TikTok be purchased by a Wall Street or Tech billionaire by 2 May 2024?',\n", " \"Will the 'Lost' Gustav Klimt painting be sold at the auction in Vienna on 3 May 2024?\",\n", " \"Will the Federal Communications Commission levy fines against AT&T, Sprint, T-Mobile, and Verizon for illegally sharing customers' location data by 9 May 2024?\",\n", " 'Will the Manchester City win the WSL title on 14 May 2024?',\n", " 'Will Meta start making profit from generative AI by 3 May 2024?',\n", " 'Will Apple launch an AI-powered iOS 18 on or by 1 May 2024?',\n", " 'Will iOS 18 receive a major AI overhaul by 6 May 2024?',\n", " 'Will Ippei Mizuhara be sentenced for bank fraud by 15 May 2024?',\n", " 'Will Tesla lay off nearly 2,700 workers at its Austin, Texas factory by 1 May 2024?',\n", " 'Will Manchester City win the Premier League title on 11 May 2024?',\n", " 'Will there be another deadly pandemic by 8 May 2024?',\n", " 'Will China successfully collect samples from the far side of the Moon on 10 May 2024?',\n", " \"Will the American Airlines correct their system's error of mistaking 101-year-old passenger for a baby by 7 May 2024?\",\n", " 'Will the Boeing Starliner capsule successfully complete its first astronaut-crewed flight to the International Space Station by 13 May 2024?',\n", " \"Will the Technics' special-edition turntable in collaboration with Lamborghini be released by 17 May 2024?\",\n", " 'Will the Florida Panthers win against the Boston Bruins in the Game 3 on 17 May 2024?',\n", " 'Will Harvard Yard be free from Anti-Israel protests by 2 May 2024?',\n", " \"Will Samsung's latest jibe have any impact on Apple's sales by 11 May 2024?\",\n", " \"Will the Miss USA organization respond to the call for 'full transparency' from contestants by 16 May 2024?\",\n", " 'Will Tom Daley win a medal at the Paris Olympics 2024 by 14 May 2024?',\n", " \"Will Liverpool win any more trophies in Jurgen Klopp's final season?\",\n", " 'Will Liverpool win any more trophies by 2 May 2024?',\n", " 'Will Caitlin Clark score more than 20 points in her next NBA game by 10 May 2024?',\n", " 'Will the statues of civil rights leader Daisy Bates and singer Johnny Cash replace the Arkansas statues at the U.S Capitol by 14 May 2024?',\n", " \"Will the season 6 of Netflix's Cobra Kai be released in 3 parts by 12 May 2024?\",\n", " \"Will the 'Don't Say Gay' education restrictions bill be implemented in Alabama on or before 1 May 2024?\",\n", " \"Will the 'lost' Gustav Klimt painting be successfully auctioned by 3 May 2024?\",\n", " 'Will the Kansas City Chiefs win their next game on or before May 15, 2024?',\n", " 'Will Lando Norris win another F1 race by 15 May 2024?',\n", " 'Will Pennsylvania be a red state by 6 May 2024?',\n", " 'Will Tesla face significant financial troubles by 11 May 2024?',\n", " 'Will the BattlerGC Pro be released for the GameCube on or by 3 May 2024?']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "winning_trades_percentage_bottom_50['title'].tolist()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "def losing_percentage(q):\n", " print(f\"Losing percentage for: {q}\")\n", " q_losing = tools[tools['prompt_request'].str.contains(q)].groupby(['tool', 'winning_vote']).size().unstack().fillna(0)\n", " q_losing_perc = q_losing[False] / (q_losing[False] + q_losing[True])\n", " q_losing_perc = q_losing_perc.reset_index()\n", " q_losing_perc.columns = ['tool', 'losing_percentage']\n", " q_losing_perc['num_calls'] = list(q_losing.sum(axis=1).values)\n", " q_losing_perc = q_losing_perc.sort_values(by='losing_percentage', ascending=False)\n", " return q_losing_perc" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0prediction-offline1.00000040.0
4prediction-request-rag-claude1.00000017.0
7prediction-url-cot-claude1.0000002.0
2prediction-online-sme0.65671667.0
6prediction-request-reasoning-claude0.5714297.0
5prediction-request-reasoning0.53846252.0
3prediction-request-rag0.2500004.0
1prediction-online0.18518527.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 prediction-offline 1.000000 40.0\n", "4 prediction-request-rag-claude 1.000000 17.0\n", "7 prediction-url-cot-claude 1.000000 2.0\n", "2 prediction-online-sme 0.656716 67.0\n", "6 prediction-request-reasoning-claude 0.571429 7.0\n", "5 prediction-request-reasoning 0.538462 52.0\n", "3 prediction-request-rag 0.250000 4.0\n", "1 prediction-online 0.185185 27.0" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# have confirmed market resolution was correct\n", "losing_percentage(winning_trades_percentage_bottom_50.loc[0, 'title'])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will 'Scavengers Reign' be renewed for a second season on Netflix by 19 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0prediction-offline1.00000040.0
4prediction-request-rag-claude1.00000017.0
7prediction-url-cot-claude1.0000002.0
2prediction-online-sme0.65671667.0
6prediction-request-reasoning-claude0.5714297.0
5prediction-request-reasoning0.53846252.0
3prediction-request-rag0.2500004.0
1prediction-online0.18518527.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 prediction-offline 1.000000 40.0\n", "4 prediction-request-rag-claude 1.000000 17.0\n", "7 prediction-url-cot-claude 1.000000 2.0\n", "2 prediction-online-sme 0.656716 67.0\n", "6 prediction-request-reasoning-claude 0.571429 7.0\n", "5 prediction-request-reasoning 0.538462 52.0\n", "3 prediction-request-rag 0.250000 4.0\n", "1 prediction-online 0.185185 27.0" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# have confirmed currentAnswer\n", "losing_percentage(winning_trades_percentage_bottom_50.loc[0, 'title'])" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will Fiona Harvey officially file a lawsuit against Netflix and Richard Gadd by 17 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
7prediction-url-cot-claude1.0000001.0
2prediction-online-sme0.97727344.0
1prediction-online0.97500040.0
0prediction-offline0.67741931.0
5prediction-request-reasoning0.53448358.0
4prediction-request-rag-claude0.22388167.0
6prediction-request-reasoning-claude0.2000005.0
3prediction-request-rag0.0000008.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "7 prediction-url-cot-claude 1.000000 1.0\n", "2 prediction-online-sme 0.977273 44.0\n", "1 prediction-online 0.975000 40.0\n", "0 prediction-offline 0.677419 31.0\n", "5 prediction-request-reasoning 0.534483 58.0\n", "4 prediction-request-rag-claude 0.223881 67.0\n", "6 prediction-request-reasoning-claude 0.200000 5.0\n", "3 prediction-request-rag 0.000000 8.0" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# have confirmed currentAnswer\n", "losing_percentage(winning_trades_percentage_bottom_50.loc[1, 'title'])" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will the final report on the Baltimore bridge collapse be released by 20 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0claude-prediction-offline1.0000005.0
1claude-prediction-online1.0000001.0
2prediction-offline1.00000087.0
6prediction-request-rag-claude1.00000025.0
9prediction-url-cot-claude1.0000001.0
3prediction-online0.95122041.0
8prediction-request-reasoning-claude0.8333336.0
5prediction-request-rag0.7142867.0
7prediction-request-reasoning0.43750048.0
4prediction-online-sme0.39436671.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 claude-prediction-offline 1.000000 5.0\n", "1 claude-prediction-online 1.000000 1.0\n", "2 prediction-offline 1.000000 87.0\n", "6 prediction-request-rag-claude 1.000000 25.0\n", "9 prediction-url-cot-claude 1.000000 1.0\n", "3 prediction-online 0.951220 41.0\n", "8 prediction-request-reasoning-claude 0.833333 6.0\n", "5 prediction-request-rag 0.714286 7.0\n", "7 prediction-request-reasoning 0.437500 48.0\n", "4 prediction-online-sme 0.394366 71.0" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# have confirmed currentAnswer\n", "losing_percentage(winning_trades_percentage_bottom_50.loc[2, 'title'])" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will the Autonomous Racing League successfully hold their second race by May 3, 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0claude-prediction-offline1.02.0
1prediction-offline1.023.0
2prediction-online1.014.0
3prediction-online-sme1.018.0
4prediction-request-rag1.05.0
5prediction-request-rag-claude1.08.0
8prediction-url-cot-claude1.06.0
6prediction-request-reasoning0.018.0
7prediction-request-reasoning-claude0.03.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 claude-prediction-offline 1.0 2.0\n", "1 prediction-offline 1.0 23.0\n", "2 prediction-online 1.0 14.0\n", "3 prediction-online-sme 1.0 18.0\n", "4 prediction-request-rag 1.0 5.0\n", "5 prediction-request-rag-claude 1.0 8.0\n", "8 prediction-url-cot-claude 1.0 6.0\n", "6 prediction-request-reasoning 0.0 18.0\n", "7 prediction-request-reasoning-claude 0.0 3.0" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# have confirmed currentAnswer\n", "losing_percentage(winning_trades_percentage_bottom_50.loc[3, 'title'])" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will the Houston area experience flooding conditions on 11 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0claude-prediction-offline1.0000002.0
1claude-prediction-online1.0000006.0
2prediction-offline1.00000058.0
4prediction-online-sme1.00000039.0
5prediction-request-rag1.0000004.0
8prediction-request-reasoning-claude1.0000008.0
9prediction-url-cot-claude1.0000005.0
6prediction-request-rag-claude0.75471753.0
7prediction-request-reasoning0.36904884.0
3prediction-online0.16666772.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 claude-prediction-offline 1.000000 2.0\n", "1 claude-prediction-online 1.000000 6.0\n", "2 prediction-offline 1.000000 58.0\n", "4 prediction-online-sme 1.000000 39.0\n", "5 prediction-request-rag 1.000000 4.0\n", "8 prediction-request-reasoning-claude 1.000000 8.0\n", "9 prediction-url-cot-claude 1.000000 5.0\n", "6 prediction-request-rag-claude 0.754717 53.0\n", "7 prediction-request-reasoning 0.369048 84.0\n", "3 prediction-online 0.166667 72.0" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "losing_percentage(winning_trades_percentage_bottom_50.loc[5, 'title'])" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will 'Wednesday' season 2 be released on Netflix by 1 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
1prediction-online-sme0.7500004.0
5prediction-request-reasoning-claude0.7500004.0
2prediction-request-rag0.6666676.0
3prediction-request-rag-claude0.5000002.0
4prediction-request-reasoning0.4000005.0
0claude-prediction-online0.0000001.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "1 prediction-online-sme 0.750000 4.0\n", "5 prediction-request-reasoning-claude 0.750000 4.0\n", "2 prediction-request-rag 0.666667 6.0\n", "3 prediction-request-rag-claude 0.500000 2.0\n", "4 prediction-request-reasoning 0.400000 5.0\n", "0 claude-prediction-online 0.000000 1.0" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "losing_percentage(winning_trades_percentage_bottom_50.loc[6, 'title'])" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will Arsenal win against Bournemouth in the Premier League match on 12 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0prediction-offline1.00000011.0
1prediction-online1.00000017.0
2prediction-online-sme1.00000030.0
4prediction-request-rag-claude1.00000045.0
5prediction-request-reasoning0.874016127.0
3prediction-request-rag0.2500004.0
6prediction-request-reasoning-claude0.0000002.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 prediction-offline 1.000000 11.0\n", "1 prediction-online 1.000000 17.0\n", "2 prediction-online-sme 1.000000 30.0\n", "4 prediction-request-rag-claude 1.000000 45.0\n", "5 prediction-request-reasoning 0.874016 127.0\n", "3 prediction-request-rag 0.250000 4.0\n", "6 prediction-request-reasoning-claude 0.000000 2.0" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "losing_percentage(winning_trades_percentage_bottom_50.loc[7, 'title'])" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will Qualcomm release its Snapdragon X Plus laptop chip by 1 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0claude-prediction-offline1.0000007.0
1prediction-offline1.0000001.0
3prediction-online-sme1.00000019.0
5prediction-request-rag-claude1.00000015.0
4prediction-request-rag0.94117617.0
2prediction-online0.8000005.0
7prediction-request-reasoning-claude0.66666715.0
6prediction-request-reasoning0.65217423.0
8prediction-url-cot-claude0.3333333.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 claude-prediction-offline 1.000000 7.0\n", "1 prediction-offline 1.000000 1.0\n", "3 prediction-online-sme 1.000000 19.0\n", "5 prediction-request-rag-claude 1.000000 15.0\n", "4 prediction-request-rag 0.941176 17.0\n", "2 prediction-online 0.800000 5.0\n", "7 prediction-request-reasoning-claude 0.666667 15.0\n", "6 prediction-request-reasoning 0.652174 23.0\n", "8 prediction-url-cot-claude 0.333333 3.0" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "losing_percentage(winning_trades_percentage_bottom_50.loc[8, 'title'])" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will Feyenoord's Arne Slot become the new manager of Liverpool by 1 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0claude-prediction-offline1.0000004.0
1prediction-offline1.0000002.0
8prediction-url-cot-claude1.0000002.0
6prediction-request-reasoning0.91666712.0
7prediction-request-reasoning-claude0.90000010.0
4prediction-request-rag0.71428614.0
3prediction-online-sme0.6666679.0
2prediction-online0.5000002.0
5prediction-request-rag-claude0.45454511.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 claude-prediction-offline 1.000000 4.0\n", "1 prediction-offline 1.000000 2.0\n", "8 prediction-url-cot-claude 1.000000 2.0\n", "6 prediction-request-reasoning 0.916667 12.0\n", "7 prediction-request-reasoning-claude 0.900000 10.0\n", "4 prediction-request-rag 0.714286 14.0\n", "3 prediction-online-sme 0.666667 9.0\n", "2 prediction-online 0.500000 2.0\n", "5 prediction-request-rag-claude 0.454545 11.0" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "losing_percentage(winning_trades_percentage_bottom_50.loc[9, 'title'])" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Losing percentage for: Will the FCC receive additional funding for replacing Huawei gear by 10 May 2024?\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
0claude-prediction-offline1.0000006.0
1claude-prediction-online1.0000003.0
2prediction-offline1.00000036.0
6prediction-request-rag-claude1.00000050.0
4prediction-online-sme0.98648674.0
5prediction-request-rag0.94736819.0
3prediction-online0.91071456.0
9prediction-url-cot-claude0.7777789.0
7prediction-request-reasoning0.46575373.0
8prediction-request-reasoning-claude0.07142914.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "0 claude-prediction-offline 1.000000 6.0\n", "1 claude-prediction-online 1.000000 3.0\n", "2 prediction-offline 1.000000 36.0\n", "6 prediction-request-rag-claude 1.000000 50.0\n", "4 prediction-online-sme 0.986486 74.0\n", "5 prediction-request-rag 0.947368 19.0\n", "3 prediction-online 0.910714 56.0\n", "9 prediction-url-cot-claude 0.777778 9.0\n", "7 prediction-request-reasoning 0.465753 73.0\n", "8 prediction-request-reasoning-claude 0.071429 14.0" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "losing_percentage(winning_trades_percentage_bottom_50.loc[10, 'title'])" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "all_q = winning_trades_percentage_bottom_50['title'].unique().tolist()\n", "q_losing = tools[tools['prompt_request'].isin(all_q)]\n", "q_losing = q_losing.groupby(['tool'])['winning_vote'].value_counts().unstack().fillna(0)\n", "q_losing_perc = q_losing[False] / (q_losing[False] + q_losing[True])\n", "q_losing_perc = q_losing_perc.reset_index()\n", "q_losing_perc.columns = ['tool', 'losing_percentage']\n", "q_losing_perc['num_calls'] = list(q_losing.sum(axis=1).values)\n", "q_losing_perc = q_losing_perc.sort_values(by='losing_percentage', ascending=False)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toollosing_percentagenum_calls
3prediction-offline-sme1.0000002.0
7prediction-request-rag-claude0.9130071184.0
2prediction-offline0.8932811012.0
6prediction-request-rag0.889881336.0
5prediction-online-sme0.8571431722.0
4prediction-online0.8535531154.0
8prediction-request-reasoning0.8474512727.0
10prediction-url-cot-claude0.846154130.0
1claude-prediction-online0.73584953.0
9prediction-request-reasoning-claude0.659664238.0
0claude-prediction-offline0.591549142.0
\n", "
" ], "text/plain": [ " tool losing_percentage num_calls\n", "3 prediction-offline-sme 1.000000 2.0\n", "7 prediction-request-rag-claude 0.913007 1184.0\n", "2 prediction-offline 0.893281 1012.0\n", "6 prediction-request-rag 0.889881 336.0\n", "5 prediction-online-sme 0.857143 1722.0\n", "4 prediction-online 0.853553 1154.0\n", "8 prediction-request-reasoning 0.847451 2727.0\n", "10 prediction-url-cot-claude 0.846154 130.0\n", "1 claude-prediction-online 0.735849 53.0\n", "9 prediction-request-reasoning-claude 0.659664 238.0\n", "0 claude-prediction-offline 0.591549 142.0" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "q_losing_perc" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
confidence0.000.100.200.300.400.500.550.600.650.700.750.800.850.900.950.991.00
tool
claude-prediction-offline0.00.05.046.04.00.00.087.00.00.00.00.00.00.00.00.00.0
claude-prediction-online0.00.02.010.07.03.00.030.00.00.00.00.00.01.00.00.00.0
prediction-offline0.0267.02.013.0302.0189.00.0231.03.00.00.00.01.02.00.00.01.0
prediction-offline-sme0.00.00.00.00.00.00.00.00.00.02.00.00.00.00.00.00.0
prediction-online0.022.04.05.043.023.08.0670.099.02.076.028.055.025.011.00.020.0
prediction-online-sme1.027.010.00.071.02.00.0679.0234.039.0149.076.0109.080.06.00.039.0
prediction-request-rag0.03.02.00.04.04.00.025.05.048.011.036.057.016.011.01.020.0
prediction-request-rag-claude0.00.01.032.00.00.00.0175.00.0513.00.0209.03.040.03.00.00.0
prediction-request-reasoning0.03.0103.01.058.097.00.0315.0176.0441.0317.0339.0159.044.058.00.097.0
prediction-request-reasoning-claude0.00.00.03.04.00.00.027.00.038.04.076.00.08.01.00.02.0
prediction-url-cot-claude0.02.01.02.00.00.00.040.00.060.00.022.00.03.00.00.00.0
\n", "
" ], "text/plain": [ "confidence 0.00 0.10 0.20 0.30 0.40 0.50 \\\n", "tool \n", "claude-prediction-offline 0.0 0.0 5.0 46.0 4.0 0.0 \n", "claude-prediction-online 0.0 0.0 2.0 10.0 7.0 3.0 \n", "prediction-offline 0.0 267.0 2.0 13.0 302.0 189.0 \n", "prediction-offline-sme 0.0 0.0 0.0 0.0 0.0 0.0 \n", "prediction-online 0.0 22.0 4.0 5.0 43.0 23.0 \n", "prediction-online-sme 1.0 27.0 10.0 0.0 71.0 2.0 \n", "prediction-request-rag 0.0 3.0 2.0 0.0 4.0 4.0 \n", "prediction-request-rag-claude 0.0 0.0 1.0 32.0 0.0 0.0 \n", "prediction-request-reasoning 0.0 3.0 103.0 1.0 58.0 97.0 \n", "prediction-request-reasoning-claude 0.0 0.0 0.0 3.0 4.0 0.0 \n", "prediction-url-cot-claude 0.0 2.0 1.0 2.0 0.0 0.0 \n", "\n", "confidence 0.55 0.60 0.65 0.70 0.75 0.80 \\\n", "tool \n", "claude-prediction-offline 0.0 87.0 0.0 0.0 0.0 0.0 \n", "claude-prediction-online 0.0 30.0 0.0 0.0 0.0 0.0 \n", "prediction-offline 0.0 231.0 3.0 0.0 0.0 0.0 \n", "prediction-offline-sme 0.0 0.0 0.0 0.0 2.0 0.0 \n", "prediction-online 8.0 670.0 99.0 2.0 76.0 28.0 \n", "prediction-online-sme 0.0 679.0 234.0 39.0 149.0 76.0 \n", "prediction-request-rag 0.0 25.0 5.0 48.0 11.0 36.0 \n", "prediction-request-rag-claude 0.0 175.0 0.0 513.0 0.0 209.0 \n", "prediction-request-reasoning 0.0 315.0 176.0 441.0 317.0 339.0 \n", "prediction-request-reasoning-claude 0.0 27.0 0.0 38.0 4.0 76.0 \n", "prediction-url-cot-claude 0.0 40.0 0.0 60.0 0.0 22.0 \n", "\n", "confidence 0.85 0.90 0.95 0.99 1.00 \n", "tool \n", "claude-prediction-offline 0.0 0.0 0.0 0.0 0.0 \n", "claude-prediction-online 0.0 1.0 0.0 0.0 0.0 \n", "prediction-offline 1.0 2.0 0.0 0.0 1.0 \n", "prediction-offline-sme 0.0 0.0 0.0 0.0 0.0 \n", "prediction-online 55.0 25.0 11.0 0.0 20.0 \n", "prediction-online-sme 109.0 80.0 6.0 0.0 39.0 \n", "prediction-request-rag 57.0 16.0 11.0 1.0 20.0 \n", "prediction-request-rag-claude 3.0 40.0 3.0 0.0 0.0 \n", "prediction-request-reasoning 159.0 44.0 58.0 0.0 97.0 \n", "prediction-request-reasoning-claude 0.0 8.0 1.0 0.0 2.0 \n", "prediction-url-cot-claude 0.0 3.0 0.0 0.0 0.0 " ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_q = winning_trades_percentage_bottom_50['title'].unique().tolist()\n", "q_losing = tools[tools['prompt_request'].isin(all_q)]\n", "q_losing.groupby(['tool'])['confidence'].value_counts().unstack().fillna(0)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "def get_question(text: str) -> str:\n", " \"\"\"Get the question from a text.\"\"\"\n", " # Regex to find text within double quotes\n", " pattern = r'\"([^\"]*)\"'\n", "\n", " # Find all occurrences\n", " questions = re.findall(pattern, text)\n", "\n", " # Assuming you want the first question if there are multiple\n", " question = questions[0] if questions else None\n", "\n", " return question" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "from web3 import Web3\n", "from typing import Optional\n", "import re\n", "import pickle\n", "\n", "def block_number_to_timestamp(block_number: int, web3: Web3) -> str:\n", " \"\"\"Convert a block number to a timestamp.\"\"\"\n", " block = web3.eth.get_block(block_number)\n", " timestamp = datetime.utcfromtimestamp(block[\"timestamp\"])\n", " return timestamp.strftime(\"%Y-%m-%d %H:%M:%S\")\n", "\n", "\n", "def parallelize_timestamp_conversion(df: pd.DataFrame, function: callable) -> list:\n", " \"\"\"Parallelize the timestamp conversion.\"\"\"\n", " block_numbers = df[\"request_block\"].tolist()\n", " with ThreadPoolExecutor(max_workers=10) as executor:\n", " results = list(\n", " tqdm(executor.map(function, block_numbers), total=len(block_numbers))\n", " )\n", " return results" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "def current_answer(text: str, fpmms: pd.DataFrame) -> Optional[str]:\n", " \"\"\"Get the current answer for a question.\"\"\"\n", " row = fpmms[fpmms[\"title\"] == text]\n", " if row.shape[0] == 0:\n", " return None\n", " return row[\"currentAnswer\"].values[0]" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", "from concurrent.futures import ThreadPoolExecutor\n", "def weekly_analysis():\n", " rpc = \"https://lb.nodies.app/v1/406d8dcc043f4cb3959ed7d6673d311a\"\n", " web3 = Web3(Web3.HTTPProvider(rpc))\n", " # Get currentAnswer from FPMMS\n", " fpmms = pd.read_parquet('../data/fpmms.parquet')\n", " tools = pd.read_parquet('../data/tools.parquet')\n", "\n", " # Get the question from the tools\n", " print(\"Getting the question and current answer for the tools\")\n", " tools[\"title\"] = tools[\"prompt_request\"].apply(lambda x: get_question(x))\n", " tools[\"currentAnswer\"] = tools[\"title\"].apply(lambda x: current_answer(x, fpmms))\n", "\n", " tools[\"currentAnswer\"] = tools[\"currentAnswer\"].str.replace(\"yes\", \"Yes\")\n", " tools[\"currentAnswer\"] = tools[\"currentAnswer\"].str.replace(\"no\", \"No\")\n", "\n", " # Convert block number to timestamp\n", " print(\"Converting block number to timestamp\")\n", " t_map = pickle.load(open(\"../data/t_map.pkl\", \"rb\"))\n", " tools[\"request_time\"] = tools[\"request_block\"].map(t_map)\n", "\n", " # Identify tools with missing request_time and fill them\n", " missing_time_indices = tools[tools[\"request_time\"].isna()].index\n", " if not missing_time_indices.empty:\n", " partial_block_number_to_timestamp = partial(\n", " block_number_to_timestamp, web3=web3\n", " )\n", " missing_timestamps = parallelize_timestamp_conversion(\n", " tools.loc[missing_time_indices], partial_block_number_to_timestamp\n", " )\n", "\n", " # Update the original DataFrame with the missing timestamps\n", " for i, timestamp in zip(missing_time_indices, missing_timestamps):\n", " tools.at[i, \"request_time\"] = timestamp\n", "\n", " tools[\"request_month_year\"] = pd.to_datetime(tools[\"request_time\"]).dt.strftime(\n", " \"%Y-%m\"\n", " )\n", " tools[\"request_month_year_week\"] = (\n", " pd.to_datetime(tools[\"request_time\"]).dt.to_period(\"W\").astype(str)\n", " )\n", "\n", " # Save the tools data after the updates on the content\n", " tools.to_parquet('../data/tools.parquet', index=False)\n", "\n", " # Update t_map with new timestamps\n", " new_timestamps = (\n", " tools[[\"request_block\", \"request_time\"]]\n", " .dropna()\n", " .set_index(\"request_block\")\n", " .to_dict()[\"request_time\"]\n", " )\n", " t_map.update(new_timestamps)\n", "\n", " with open(\"../data/t_map.pkl\", \"wb\") as f:\n", " pickle.dump(t_map, f)\n", "\n", " # clean and release all memory\n", " del tools\n", " del fpmms\n", " del t_map\n", " gc.collect()\n", "\n", " print(\"Weekly analysis files generated and saved\")\n" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Getting the question and current answer for the tools\n", "Converting block number to timestamp\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%| | 0/11690 [00:00 10\u001b[0m \u001b[43mweekly_analysis\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[48], line 63\u001b[0m, in \u001b[0;36mweekly_analysis\u001b[0;34m()\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m fpmms\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m t_map\n\u001b[0;32m---> 63\u001b[0m \u001b[43mgc\u001b[49m\u001b[38;5;241m.\u001b[39mcollect()\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWeekly analysis files generated and saved\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mNameError\u001b[0m: name 'gc' is not defined" ] } ], "source": [ "import os\n", "import pickle\n", "from datetime import datetime\n", "from concurrent.futures import ThreadPoolExecutor\n", "from tqdm import tqdm\n", "from web3 import Web3\n", "import pandas as pd\n", "from pathlib import Path\n", "from functools import partial\n", "weekly_analysis()" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1187" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import gc\n", "gc.collect()" ] } ], "metadata": { "kernelspec": { "display_name": "akash", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 2 }