{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Vp5YVvaTpIiX", "outputId": "3c5b2a63-1fb4-430d-a986-4092ee8d4891" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "--2023-11-28 19:52:23-- https://huggingface.co/spaces/lmsys/mt-bench/resolve/main/data/mt_bench/model_judgment/gpt-4_single.jsonl\n", "Resolving huggingface.co (huggingface.co)... 18.164.174.55, 18.164.174.23, 18.164.174.118, ...\n", "Connecting to huggingface.co (huggingface.co)|18.164.174.55|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://cdn-lfs.huggingface.co/repos/12/2b/122bd8e9eccbb3acc98acf73e0ecef3c96f24dcdb5f6639074ed304eb19f9cd4/76c55033c6b2b1cc3f62513458f84748a23352495fd42b1062a7401de5ff9bd9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27gpt-4_single.jsonl%3B+filename%3D%22gpt-4_single.jsonl%22%3B&Expires=1701460343&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMTQ2MDM0M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8xMi8yYi8xMjJiZDhlOWVjY2JiM2FjYzk4YWNmNzNlMGVjZWYzYzk2ZjI0ZGNkYjVmNjYzOTA3NGVkMzA0ZWIxOWY5Y2Q0Lzc2YzU1MDMzYzZiMmIxY2MzZjYyNTEzNDU4Zjg0NzQ4YTIzMzUyNDk1ZmQ0MmIxMDYyYTc0MDFkZTVmZjliZDk%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=hwTqBVlLz755xHaQaN6cSDP2FxoPBAXFcOE2uvFAYzg0Y90kGkY3A74Fj2wAkToA-dN1WJeMc%7Ef2XarD%7EbAw%7E4v2JCw9kphUxL-pcRF1uNBI2pzS-3Joff-m%7Ee3GVq5%7E8QabDfK60nWuA10CodvlaRDqVpuYEAvF2n5tY3Adf6-V-YdcaxE2DTlHXm65oJsJwWJTGiQYzTtn4rEVWKgQHVYp7CqX0IdyaILr966agOZvdUGDUZfkZtG6E9A6zKOgOBfdpJn1tjmMKEkDscDvLJvg8r9QJY7yttPHOMNVruzVtoLjpg1lFb-tXco3h%7EFZVKiOIZL%7E597WbaDu8hdZOQ__&Key-Pair-Id=KVTP0A1DKRTAX [following]\n", "--2023-11-28 19:52:23-- https://cdn-lfs.huggingface.co/repos/12/2b/122bd8e9eccbb3acc98acf73e0ecef3c96f24dcdb5f6639074ed304eb19f9cd4/76c55033c6b2b1cc3f62513458f84748a23352495fd42b1062a7401de5ff9bd9?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27gpt-4_single.jsonl%3B+filename%3D%22gpt-4_single.jsonl%22%3B&Expires=1701460343&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMTQ2MDM0M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8xMi8yYi8xMjJiZDhlOWVjY2JiM2FjYzk4YWNmNzNlMGVjZWYzYzk2ZjI0ZGNkYjVmNjYzOTA3NGVkMzA0ZWIxOWY5Y2Q0Lzc2YzU1MDMzYzZiMmIxY2MzZjYyNTEzNDU4Zjg0NzQ4YTIzMzUyNDk1ZmQ0MmIxMDYyYTc0MDFkZTVmZjliZDk%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=hwTqBVlLz755xHaQaN6cSDP2FxoPBAXFcOE2uvFAYzg0Y90kGkY3A74Fj2wAkToA-dN1WJeMc%7Ef2XarD%7EbAw%7E4v2JCw9kphUxL-pcRF1uNBI2pzS-3Joff-m%7Ee3GVq5%7E8QabDfK60nWuA10CodvlaRDqVpuYEAvF2n5tY3Adf6-V-YdcaxE2DTlHXm65oJsJwWJTGiQYzTtn4rEVWKgQHVYp7CqX0IdyaILr966agOZvdUGDUZfkZtG6E9A6zKOgOBfdpJn1tjmMKEkDscDvLJvg8r9QJY7yttPHOMNVruzVtoLjpg1lFb-tXco3h%7EFZVKiOIZL%7E597WbaDu8hdZOQ__&Key-Pair-Id=KVTP0A1DKRTAX\n", "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 18.65.25.40, 18.65.25.122, 18.65.25.124, ...\n", "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.65.25.40|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 20113128 (19M) [text/plain]\n", "Saving to: ‘gpt-4_single.jsonl’\n", "\n", "gpt-4_single.jsonl 100%[===================>] 19.18M 25.8MB/s in 0.7s \n", "\n", "2023-11-28 19:52:25 (25.8 MB/s) - ‘gpt-4_single.jsonl’ saved [20113128/20113128]\n", "\n", "--2023-11-28 19:52:25-- https://huggingface.co/spaces/lmsys/mt-bench/resolve/main/data/mt_bench/model_judgment/gpt-4_pair.jsonl\n", "Resolving huggingface.co (huggingface.co)... 18.164.174.55, 18.164.174.23, 18.164.174.118, ...\n", "Connecting to huggingface.co (huggingface.co)|18.164.174.55|:443... connected.\n", "HTTP request sent, awaiting response... 302 Found\n", "Location: https://cdn-lfs.huggingface.co/repos/12/2b/122bd8e9eccbb3acc98acf73e0ecef3c96f24dcdb5f6639074ed304eb19f9cd4/d662c0b7d1d297f0494fcb4cc09fe8f054fa22d75deb4754a483a921984bc585?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27gpt-4_pair.jsonl%3B+filename%3D%22gpt-4_pair.jsonl%22%3B&Expires=1701460345&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMTQ2MDM0NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8xMi8yYi8xMjJiZDhlOWVjY2JiM2FjYzk4YWNmNzNlMGVjZWYzYzk2ZjI0ZGNkYjVmNjYzOTA3NGVkMzA0ZWIxOWY5Y2Q0L2Q2NjJjMGI3ZDFkMjk3ZjA0OTRmY2I0Y2MwOWZlOGYwNTRmYTIyZDc1ZGViNDc1NGE0ODNhOTIxOTg0YmM1ODU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=RcHQsWboSyCegZB6o-k6-9fsGpTmhArmdubGyrc7VTT2cc9FKMoPc4vHW0RtMgS%7EkYWm2eA9sfex%7EWN%7E5A0i1CBBWP3EDq365Jt52BdOw4BbOtezicyT2eLPzNkgrw3RuLMZTApHUr6md1TVm0W15rmSaUpoQT5sKcVwq%7EvmmLXr6AFOV6vWho6vEHSadzT8GJkK%7El9xOtBGhCE-pWOsEU6siX9sw0HwZBmg1mcXJzMj2du%7Em5AmG3lXsJm2fFY0ZmhSZjm7FH%7EBxF38wTuuf3gBUeJUU%7Ecx0Lv935FSAmmdzqrXO4CiGq%7EQSTp7uga8mUJikosX6DlfLMZudAIVzg__&Key-Pair-Id=KVTP0A1DKRTAX [following]\n", "--2023-11-28 19:52:25-- https://cdn-lfs.huggingface.co/repos/12/2b/122bd8e9eccbb3acc98acf73e0ecef3c96f24dcdb5f6639074ed304eb19f9cd4/d662c0b7d1d297f0494fcb4cc09fe8f054fa22d75deb4754a483a921984bc585?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27gpt-4_pair.jsonl%3B+filename%3D%22gpt-4_pair.jsonl%22%3B&Expires=1701460345&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMTQ2MDM0NX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8xMi8yYi8xMjJiZDhlOWVjY2JiM2FjYzk4YWNmNzNlMGVjZWYzYzk2ZjI0ZGNkYjVmNjYzOTA3NGVkMzA0ZWIxOWY5Y2Q0L2Q2NjJjMGI3ZDFkMjk3ZjA0OTRmY2I0Y2MwOWZlOGYwNTRmYTIyZDc1ZGViNDc1NGE0ODNhOTIxOTg0YmM1ODU%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=RcHQsWboSyCegZB6o-k6-9fsGpTmhArmdubGyrc7VTT2cc9FKMoPc4vHW0RtMgS%7EkYWm2eA9sfex%7EWN%7E5A0i1CBBWP3EDq365Jt52BdOw4BbOtezicyT2eLPzNkgrw3RuLMZTApHUr6md1TVm0W15rmSaUpoQT5sKcVwq%7EvmmLXr6AFOV6vWho6vEHSadzT8GJkK%7El9xOtBGhCE-pWOsEU6siX9sw0HwZBmg1mcXJzMj2du%7Em5AmG3lXsJm2fFY0ZmhSZjm7FH%7EBxF38wTuuf3gBUeJUU%7Ecx0Lv935FSAmmdzqrXO4CiGq%7EQSTp7uga8mUJikosX6DlfLMZudAIVzg__&Key-Pair-Id=KVTP0A1DKRTAX\n", "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 18.65.25.40, 18.65.25.122, 18.65.25.124, ...\n", "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.65.25.40|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 48043462 (46M) [binary/octet-stream]\n", "Saving to: ‘gpt-4_pair.jsonl’\n", "\n", "gpt-4_pair.jsonl 100%[===================>] 45.82M 36.0MB/s in 1.3s \n", "\n", "2023-11-28 19:52:27 (36.0 MB/s) - ‘gpt-4_pair.jsonl’ saved [48043462/48043462]\n", "\n" ] } ], "source": [ "!wget https://huggingface.co/spaces/lmsys/mt-bench/resolve/main/data/mt_bench/model_judgment/gpt-4_single.jsonl\n", "!wget https://huggingface.co/spaces/lmsys/mt-bench/resolve/main/data/mt_bench/model_judgment/gpt-4_pair.jsonl" ] }, { "cell_type": "code", "source": [ "!pip install -U plotly kaleido" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4eYlKr9RrPu2", "outputId": "b957d1f9-0024-4c5c-eb07-dcb1a0071081" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (5.15.0)\n", "Collecting plotly\n", " Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.6/15.6 MB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting kaleido\n", " Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.9/79.9 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly) (8.2.3)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from plotly) (23.2)\n", "Installing collected packages: kaleido, plotly\n", " Attempting uninstall: plotly\n", " Found existing installation: plotly 5.15.0\n", " Uninstalling plotly-5.15.0:\n", " Successfully uninstalled plotly-5.15.0\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "lida 0.0.10 requires fastapi, which is not installed.\n", "lida 0.0.10 requires python-multipart, which is not installed.\n", "lida 0.0.10 requires uvicorn, which is not installed.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed kaleido-0.2.1 plotly-5.18.0\n" ] } ] }, { "cell_type": "code", "source": [ "import json\n", "import pandas as pd\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "\n", "\n", "CATEGORIES = [\"Writing\", \"Roleplay\", \"Reasoning\", \"Math\", \"Coding\", \"Extraction\", \"STEM\", \"Humanities\"]\n", "\n", "\n", "def get_model_df():\n", " cnt = 0\n", " q2result = []\n", " fin = open(\"gpt-4_single.jsonl\", \"r\")\n", " for line in fin:\n", " obj = json.loads(line)\n", " obj[\"category\"] = CATEGORIES[(obj[\"question_id\"]-81)//10]\n", " q2result.append(obj)\n", " df = pd.DataFrame(q2result)\n", " return df\n", "\n", "def toggle(res_str):\n", " if res_str == \"win\":\n", " return \"loss\"\n", " elif res_str == \"loss\":\n", " return \"win\"\n", " return \"tie\"\n", "\n", "def get_model_df_pair():\n", " fin = open(\"gpt-4_pair.jsonl\", \"r\")\n", " cnt = 0\n", " q2result = []\n", " for line in fin:\n", " obj = json.loads(line)\n", "\n", " result = {}\n", " result[\"qid\"] = str(obj[\"question_id\"])\n", " result[\"turn\"] = str(obj[\"turn\"])\n", " if obj[\"g1_winner\"] == \"model_1\" and obj[\"g2_winner\"] == \"model_1\":\n", " result[\"result\"] = \"win\"\n", " elif obj[\"g1_winner\"] == \"model_2\" and obj[\"g2_winner\"] == \"model_2\":\n", " result[\"result\"] = \"loss\"\n", " else:\n", " result[\"result\"] = \"tie\"\n", " result[\"category\"] = CATEGORIES[(obj[\"question_id\"]-81)//10]\n", " result[\"model\"] = obj[\"model_1\"]\n", " q2result.append(result)\n", "\n", " df = pd.DataFrame(q2result)\n", "\n", " return df\n", "\n", "df = get_model_df()\n", "df_pair = get_model_df_pair()" ], "metadata": { "id": "m2tG_vDyqWZw" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "df_pair" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "wUw1sxfmaGuK", "outputId": "21365f64-c2fa-47c7-9ad4-ca114eac6533" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " qid turn result category model\n", "0 81 1 loss Writing alpaca-13b\n", "1 81 2 loss Writing alpaca-13b\n", "2 82 1 loss Writing alpaca-13b\n", "3 82 2 loss Writing alpaca-13b\n", "4 83 1 loss Writing alpaca-13b\n", "... ... ... ... ... ...\n", "4795 158 2 tie Humanities wizardlm-30b\n", "4796 159 1 loss Humanities wizardlm-30b\n", "4797 159 2 win Humanities wizardlm-30b\n", "4798 160 1 loss Humanities wizardlm-30b\n", "4799 160 2 tie Humanities wizardlm-30b\n", "\n", "[4800 rows x 5 columns]" ], "text/html": [ "\n", "
\n", " | qid | \n", "turn | \n", "result | \n", "category | \n", "model | \n", "
---|---|---|---|---|---|
0 | \n", "81 | \n", "1 | \n", "loss | \n", "Writing | \n", "alpaca-13b | \n", "
1 | \n", "81 | \n", "2 | \n", "loss | \n", "Writing | \n", "alpaca-13b | \n", "
2 | \n", "82 | \n", "1 | \n", "loss | \n", "Writing | \n", "alpaca-13b | \n", "
3 | \n", "82 | \n", "2 | \n", "loss | \n", "Writing | \n", "alpaca-13b | \n", "
4 | \n", "83 | \n", "1 | \n", "loss | \n", "Writing | \n", "alpaca-13b | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4795 | \n", "158 | \n", "2 | \n", "tie | \n", "Humanities | \n", "wizardlm-30b | \n", "
4796 | \n", "159 | \n", "1 | \n", "loss | \n", "Humanities | \n", "wizardlm-30b | \n", "
4797 | \n", "159 | \n", "2 | \n", "win | \n", "Humanities | \n", "wizardlm-30b | \n", "
4798 | \n", "160 | \n", "1 | \n", "loss | \n", "Humanities | \n", "wizardlm-30b | \n", "
4799 | \n", "160 | \n", "2 | \n", "tie | \n", "Humanities | \n", "wizardlm-30b | \n", "
4800 rows × 5 columns
\n", "