diff --git "a/notebooks/06b_Open-Source-Models_analysis.ipynb" "b/notebooks/06b_Open-Source-Models_analysis.ipynb"
--- "a/notebooks/06b_Open-Source-Models_analysis.ipynb"
+++ "b/notebooks/06b_Open-Source-Models_analysis.ipynb"
@@ -1 +1,3 @@
-{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"0ea8b46b-839b-445b-8043-ccdf4e920ace","showTitle":false,"title":""},"id":"YLH80COBzi_F"},"outputs":[],"source":["%load_ext autoreload\n","%autoreload 2"]},{"cell_type":"code","execution_count":2,"metadata":{"id":"63B5exAuzq4M"},"outputs":[],"source":["from pathlib import Path\n","\n","if \"workding_dir\" not in locals():\n"," try:\n"," from google.colab import drive\n"," drive.mount('/content/drive')\n"," workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n"," except ModuleNotFoundError:\n"," workding_dir = str(Path.cwd().parent)"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":368,"status":"ok","timestamp":1719461634865,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"zFulf0bg0H-9","outputId":"debdd535-c828-40b9-efc0-8a180e5830dd"},"outputs":[{"name":"stdout","output_type":"stream","text":["workding dir: /Users/inflaton/code/engd/projects/logical-reasoning\n"]}],"source":["import os\n","import sys\n","\n","os.chdir(workding_dir)\n","sys.path.append(workding_dir)\n","print(\"workding dir:\", workding_dir)"]},{"cell_type":"code","execution_count":4,"metadata":{"application/vnd.databricks.v1+cell":{"cellMetadata":{},"inputWidgets":{},"nuid":"9f67ec60-2f24-411c-84eb-0dd664b44775","showTitle":false,"title":""},"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":589,"status":"ok","timestamp":1719462011879,"user":{"displayName":"Donghao Huang","userId":"00463591218503521679"},"user_tz":-480},"id":"DIUiweYYzi_I","outputId":"e16e9247-9077-4b0c-f8ea-17059f05a1c4"},"outputs":[{"name":"stdout","output_type":"stream","text":["loading env vars from: /Users/inflaton/code/engd/projects/logical-reasoning/.env\n"]},{"data":{"text/plain":["True"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["from dotenv import find_dotenv, load_dotenv\n","\n","found_dotenv = find_dotenv(\".env\")\n","\n","if len(found_dotenv) == 0:\n"," found_dotenv = find_dotenv(\".env.example\")\n","print(f\"loading env vars from: {found_dotenv}\")\n","load_dotenv(found_dotenv, override=True)"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat datasets/mgtv data/open_source_models_few_shots_results.csv 2048\n"]}],"source":["import os\n","\n","model_name = os.getenv(\"MODEL_NAME\")\n","data_path = os.getenv(\"LOGICAL_REASONING_DATA_PATH\")\n","results_path = os.getenv(\"LOGICAL_REASONING_RESULTS_PATH\")\n","max_new_tokens = int(os.getenv(\"MAX_NEW_TOKENS\", 2048))\n","\n","print(model_name, data_path, results_path, max_new_tokens)"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading /Users/inflaton/code/engd/projects/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n"]}],"source":["from llm_toolkit.logical_reasoning_utils import *"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["model_names = {\n"," \"gpt-4o-mini\": \"gpt-4o-mini\",\n"," \"gpt-4o\": \"gpt-4o\",\n"," \"o1-mini\": \"o1-mini\",\n"," \"o1-preview\": \"o1-preview\",\n"," \"Llama3.1-8B-Chinese-Chat\": \"Llama3.1-8B\",\n"," \"Llama3.1-70B-Chinese-Chat\": \"Llama3.1-70B\",\n"," \"Mistral-7B-v0.3-Chinese-Chat\": \"Mistral-7B\",\n"," \"internlm2_5-7b-chat\": \"InternLM2.5-7B\",\n"," \"internlm2_5-7b-chat-1m\": \"InternLM2.5-7B-1M\",\n"," \"internlm2_5-20b-chat\": \"InternLM2.5-20B\",\n"," \"Qwen2.5-0.5B-Instruct\": \"Qwen2.5-0.5B\",\n"," \"Qwen2.5-1.5B-Instruct\": \"Qwen2.5-1.5B\",\n"," \"Qwen2.5-3B-Instruct\": \"Qwen2.5-3B\",\n"," \"Qwen2.5-7B-Instruct\": \"Qwen2.5-7B\",\n"," \"Qwen2.5-72B-Instruct\": \"Qwen2.5-72B\",\n","}"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"data":{"text/plain":["{'Llama3.1-8B-Chinese-Chat': 4,\n"," 'Llama3.1-70B-Chinese-Chat': 5,\n"," 'Mistral-7B-v0.3-Chinese-Chat': 6,\n"," 'internlm2_5-7b-chat': 7,\n"," 'internlm2_5-7b-chat-1m': 8,\n"," 'internlm2_5-20b-chat': 9,\n"," 'Qwen2.5-0.5B-Instruct': 10,\n"," 'Qwen2.5-1.5B-Instruct': 11,\n"," 'Qwen2.5-3B-Instruct': 12,\n"," 'Qwen2.5-7B-Instruct': 13,\n"," 'Qwen2.5-72B-Instruct': 14}"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["model_orders = {k: i for i, k in enumerate(model_names.keys()) if i > 3}\n","model_orders"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["markers = [\n"," \"o\",\n"," \"x\",\n"," \"^\",\n"," \"s\",\n"," \"d\",\n"," \"P\",\n"," \"X\",\n"," \"*\",\n"," \"v\",\n"," \">\",\n"," \"<\",\n"," \"p\",\n"," \"h\",\n"," \"H\",\n"," \"+\",\n"," \"|\",\n"," \"_\",\n","]\n","model_markers = {k: markers[i] for i, k in enumerate(model_orders.keys())}"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n"," \n"," | \n"," shots | \n"," model | \n"," run | \n"," accuracy | \n"," precision | \n"," recall | \n"," f1 | \n"," ratio_valid_classifications | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 0 | \n"," Llama3.1-8B-Chinese-Chat | \n"," shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00 | \n"," 0.734333 | \n"," 0.737575 | \n"," 0.734333 | \n"," 0.727028 | \n"," 0.803333 | \n","
\n"," \n"," 1 | \n"," 5 | \n"," Llama3.1-8B-Chinese-Chat | \n"," shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-05 | \n"," 0.705667 | \n"," 0.750852 | \n"," 0.705667 | \n"," 0.723057 | \n"," 0.988667 | \n","
\n"," \n"," 2 | \n"," 10 | \n"," Llama3.1-8B-Chinese-Chat | \n"," shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10 | \n"," 0.673667 | \n"," 0.777600 | \n"," 0.673667 | \n"," 0.709410 | \n"," 0.962333 | \n","
\n"," \n"," 3 | \n"," 20 | \n"," Llama3.1-8B-Chinese-Chat | \n"," shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-20 | \n"," 0.767000 | \n"," 0.764983 | \n"," 0.767000 | \n"," 0.763847 | \n"," 0.979000 | \n","
\n"," \n"," 4 | \n"," 30 | \n"," Llama3.1-8B-Chinese-Chat | \n"," shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-30 | \n"," 0.771333 | \n"," 0.772569 | \n"," 0.771333 | \n"," 0.769269 | \n"," 0.732667 | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 4 | \n"," 30 | \n"," Qwen2.5-7B-Instruct | \n"," Qwen/Qwen2.5-7B-Instruct/shots-30 | \n"," 0.764667 | \n"," 0.778792 | \n"," 0.764667 | \n"," 0.752765 | \n"," 0.805000 | \n","
\n"," \n"," 5 | \n"," 40 | \n"," Qwen2.5-7B-Instruct | \n"," Qwen/Qwen2.5-7B-Instruct/shots-40 | \n"," 0.759000 | \n"," 0.773685 | \n"," 0.759000 | \n"," 0.747225 | \n"," 0.854667 | \n","
\n"," \n"," 6 | \n"," 50 | \n"," Qwen2.5-7B-Instruct | \n"," Qwen/Qwen2.5-7B-Instruct/shots-50 | \n"," 0.758667 | \n"," 0.764043 | \n"," 0.758667 | \n"," 0.741433 | \n"," 0.756333 | \n","
\n"," \n"," 0 | \n"," 0 | \n"," Qwen2.5-72B-Instruct | \n"," Qwen/Qwen2.5-72B-Instruct/shots-00 | \n"," 0.795667 | \n"," 0.809807 | \n"," 0.795667 | \n"," 0.777132 | \n"," 0.994000 | \n","
\n"," \n"," 1 | \n"," 5 | \n"," Qwen2.5-72B-Instruct | \n"," Qwen/Qwen2.5-72B-Instruct/shots-05 | \n"," 0.819000 | \n"," 0.818232 | \n"," 0.819000 | \n"," 0.809537 | \n"," 0.941667 | \n","
\n"," \n","
\n","
62 rows × 8 columns
\n","
"],"text/plain":[" shots model \\\n","0 0 Llama3.1-8B-Chinese-Chat \n","1 5 Llama3.1-8B-Chinese-Chat \n","2 10 Llama3.1-8B-Chinese-Chat \n","3 20 Llama3.1-8B-Chinese-Chat \n","4 30 Llama3.1-8B-Chinese-Chat \n",".. ... ... \n","4 30 Qwen2.5-7B-Instruct \n","5 40 Qwen2.5-7B-Instruct \n","6 50 Qwen2.5-7B-Instruct \n","0 0 Qwen2.5-72B-Instruct \n","1 5 Qwen2.5-72B-Instruct \n","\n"," run accuracy precision \\\n","0 shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00 0.734333 0.737575 \n","1 shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-05 0.705667 0.750852 \n","2 shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10 0.673667 0.777600 \n","3 shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-20 0.767000 0.764983 \n","4 shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-30 0.771333 0.772569 \n",".. ... ... ... \n","4 Qwen/Qwen2.5-7B-Instruct/shots-30 0.764667 0.778792 \n","5 Qwen/Qwen2.5-7B-Instruct/shots-40 0.759000 0.773685 \n","6 Qwen/Qwen2.5-7B-Instruct/shots-50 0.758667 0.764043 \n","0 Qwen/Qwen2.5-72B-Instruct/shots-00 0.795667 0.809807 \n","1 Qwen/Qwen2.5-72B-Instruct/shots-05 0.819000 0.818232 \n","\n"," recall f1 ratio_valid_classifications \n","0 0.734333 0.727028 0.803333 \n","1 0.705667 0.723057 0.988667 \n","2 0.673667 0.709410 0.962333 \n","3 0.767000 0.763847 0.979000 \n","4 0.771333 0.769269 0.732667 \n",".. ... ... ... \n","4 0.764667 0.752765 0.805000 \n","5 0.759000 0.747225 0.854667 \n","6 0.758667 0.741433 0.756333 \n","0 0.795667 0.777132 0.994000 \n","1 0.819000 0.809537 0.941667 \n","\n","[62 rows x 8 columns]"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["import pandas as pd\n","\n","perf_df = None\n","model_perf_dfs = {}\n","for model_name in model_orders.keys():\n"," metrics_csv = f\"data/{model_name}_shots_metrics.csv\"\n"," if not Path(metrics_csv).exists():\n"," print(f\"metrics file not found: {metrics_csv}\")\n"," continue\n"," df = pd.read_csv(metrics_csv)\n"," model_perf_dfs[model_name] = df\n"," perf_df = df if perf_df is None else pd.concat([perf_df, df])\n","\n","perf_df"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["import matplotlib.pyplot as plt\n","from matplotlib.ticker import MultipleLocator\n","\n","\n","def plot_perf(\n"," model_perf_dfs,\n"," model_markers,\n"," x_major_locator=5,\n"," y_offset=0.005,\n"," variant=\"shots\"\n","):\n"," fig, ax = plt.subplots(1, 1, figsize=(12, 6))\n","\n"," for model_name, perf_df in model_perf_dfs.items():\n"," # Ensure the lengths of perf_df[\"epoch\"], perf_df[\"accuracy\"], and perf_df[\"f1\"] are the same\n"," min_length = min(\n"," len(perf_df[variant]), len(perf_df[\"accuracy\"]), len(perf_df[\"f1\"])\n"," )\n"," perf_df = perf_df.iloc[:min_length]\n","\n"," (line,) = ax.plot(\n"," perf_df[variant],\n"," perf_df[\"f1\"],\n"," marker=model_markers[model_name],\n"," label=model_name,\n"," )\n","\n"," line_color = line.get_color()\n","\n"," best_f1 = perf_df[\"f1\"].idxmax()\n"," print(\n"," f\"Best F1 for {model_name} @ {perf_df[variant].iloc[best_f1]:.2f} {variant}: {perf_df['f1'].iloc[best_f1]}\"\n"," )\n"," ax.annotate(\n"," f\"{perf_df['f1'].iloc[best_f1]*100:.2f}%\",\n"," (perf_df[variant].iloc[best_f1], perf_df[\"f1\"].iloc[best_f1]),\n"," ha=\"center\",\n"," va=\"bottom\",\n"," xytext=(0, 0),\n"," textcoords=\"offset points\",\n"," fontsize=10,\n"," color=line_color,\n"," )\n","\n"," # Set y-axis limit\n"," y_scales = ax.get_ylim()\n"," ax.set_ylim(y_scales[0], y_scales[1] + y_offset)\n","\n"," # Add title and labels\n"," ax.set_xlabel(\"Number of Shots\")\n"," ax.set_ylabel(\"F1 Score\")\n","\n"," # Set x-axis grid spacing to 0.2\n"," ax.xaxis.set_major_locator(MultipleLocator(x_major_locator))\n"," ax.set_title(\"Performance Analysis Across Shots for Models\")\n","\n"," # Rotate x labels\n"," plt.xticks(rotation=0)\n"," plt.grid(True)\n"," # plt.tight_layout()\n","\n"," # Set legend at the right to avoid overlapping with lines\n"," plt.legend(loc=\"center left\", bbox_to_anchor=(1.0, 0.5))\n","\n"," plt.show()"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Best F1 for Llama3.1-8B-Chinese-Chat @ 30.00 shots: 0.7692692690410152\n","Best F1 for Llama3.1-70B-Chinese-Chat @ 30.00 shots: 0.7570501796584528\n","Best F1 for Mistral-7B-v0.3-Chinese-Chat @ 30.00 shots: 0.6872462947319797\n","Best F1 for internlm2_5-7b-chat @ 5.00 shots: 0.7232456014841266\n","Best F1 for internlm2_5-7b-chat-1m @ 5.00 shots: 0.7665405919258307\n","Best F1 for internlm2_5-20b-chat @ 0.00 shots: 0.6416875854199033\n","Best F1 for Qwen2.5-0.5B-Instruct @ 50.00 shots: 0.5069942984615308\n","Best F1 for Qwen2.5-1.5B-Instruct @ 10.00 shots: 0.459589777544246\n","Best F1 for Qwen2.5-3B-Instruct @ 50.00 shots: 0.6451959368825358\n","Best F1 for Qwen2.5-7B-Instruct @ 30.00 shots: 0.7527649874769439\n","Best F1 for Qwen2.5-72B-Instruct @ 5.00 shots: 0.8095367865845521\n"]},{"data":{"image/png":"","text/plain":[""]},"metadata":{},"output_type":"display_data"}],"source":["plot_perf(model_perf_dfs, model_markers)"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 0\n","count 3000.000000\n","mean 571.091000\n","std 9.115687\n","min 512.000000\n","25% 570.000000\n","50% 571.000000\n","75% 574.000000\n","max 652.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 0\n","count 3000.000000\n","mean 571.091000\n","std 9.115687\n","min 512.000000\n","25% 570.000000\n","50% 571.000000\n","75% 574.000000\n","max 652.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 0\n","count 3000.000000\n","mean 799.354000\n","std 15.567385\n","min 694.000000\n","25% 798.000000\n","50% 802.000000\n","75% 806.000000\n","max 928.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 0\n","count 3000.000000\n","mean 461.917667\n","std 7.767732\n","min 426.000000\n","25% 459.000000\n","50% 461.000000\n","75% 463.000000\n","max 511.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 0\n","count 3000.000000\n","mean 461.917667\n","std 7.767732\n","min 426.000000\n","25% 459.000000\n","50% 461.000000\n","75% 463.000000\n","max 511.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 0\n","count 3000.000000\n","mean 461.917667\n","std 7.767732\n","min 426.000000\n","25% 459.000000\n","50% 461.000000\n","75% 463.000000\n","max 511.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 0\n","count 3000.000000\n","mean 465.338667\n","std 8.617118\n","min 426.000000\n","25% 462.000000\n","50% 464.000000\n","75% 467.000000\n","max 517.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 0\n","count 3000.000000\n","mean 465.338667\n","std 8.617118\n","min 426.000000\n","25% 462.000000\n","50% 464.000000\n","75% 467.000000\n","max 517.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 0\n","count 3000.000000\n","mean 465.338667\n","std 8.617118\n","min 426.000000\n","25% 462.000000\n","50% 464.000000\n","75% 467.000000\n","max 517.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 0\n","count 3000.000000\n","mean 465.338667\n","std 8.617118\n","min 426.000000\n","25% 462.000000\n","50% 464.000000\n","75% 467.000000\n","max 517.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 0\n","count 3000.000000\n","mean 465.338667\n","std 8.617118\n","min 426.000000\n","25% 462.000000\n","50% 464.000000\n","75% 467.000000\n","max 517.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 5\n","count 3000.000000\n","mean 1737.091000\n","std 9.115687\n","min 1678.000000\n","25% 1736.000000\n","50% 1737.000000\n","75% 1740.000000\n","max 1818.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 5\n","count 3000.000000\n","mean 1737.091000\n","std 9.115687\n","min 1678.000000\n","25% 1736.000000\n","50% 1737.000000\n","75% 1740.000000\n","max 1818.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 5\n","count 3000.000000\n","mean 2444.354000\n","std 15.567385\n","min 2339.000000\n","25% 2443.000000\n","50% 2447.000000\n","75% 2451.000000\n","max 2573.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 5\n","count 3000.000000\n","mean 1301.917667\n","std 7.767732\n","min 1266.000000\n","25% 1299.000000\n","50% 1301.000000\n","75% 1303.000000\n","max 1351.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 5\n","count 3000.000000\n","mean 1301.917667\n","std 7.767732\n","min 1266.000000\n","25% 1299.000000\n","50% 1301.000000\n","75% 1303.000000\n","max 1351.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 5\n","count 3000.000000\n","mean 1301.917667\n","std 7.767732\n","min 1266.000000\n","25% 1299.000000\n","50% 1301.000000\n","75% 1303.000000\n","max 1351.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 5\n","count 3000.000000\n","mean 1329.338667\n","std 8.617118\n","min 1290.000000\n","25% 1326.000000\n","50% 1328.000000\n","75% 1331.000000\n","max 1381.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 5\n","count 3000.000000\n","mean 1329.338667\n","std 8.617118\n","min 1290.000000\n","25% 1326.000000\n","50% 1328.000000\n","75% 1331.000000\n","max 1381.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 5\n","count 3000.000000\n","mean 1329.338667\n","std 8.617118\n","min 1290.000000\n","25% 1326.000000\n","50% 1328.000000\n","75% 1331.000000\n","max 1381.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 5\n","count 3000.000000\n","mean 1329.338667\n","std 8.617118\n","min 1290.000000\n","25% 1326.000000\n","50% 1328.000000\n","75% 1331.000000\n","max 1381.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 5\n","count 3000.000000\n","mean 1329.338667\n","std 8.617118\n","min 1290.000000\n","25% 1326.000000\n","50% 1328.000000\n","75% 1331.000000\n","max 1381.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 10\n","count 3000.000000\n","mean 2833.091000\n","std 9.115687\n","min 2774.000000\n","25% 2832.000000\n","50% 2833.000000\n","75% 2836.000000\n","max 2914.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 10\n","count 3000.000000\n","mean 2833.091000\n","std 9.115687\n","min 2774.000000\n","25% 2832.000000\n","50% 2833.000000\n","75% 2836.000000\n","max 2914.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 10\n","count 3000.000000\n","mean 3990.354000\n","std 15.567385\n","min 3885.000000\n","25% 3989.000000\n","50% 3993.000000\n","75% 3997.000000\n","max 4119.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 10\n","count 3000.000000\n","mean 2195.917667\n","std 7.767732\n","min 2160.000000\n","25% 2193.000000\n","50% 2195.000000\n","75% 2197.000000\n","max 2245.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 10\n","count 3000.000000\n","mean 2195.917667\n","std 7.767732\n","min 2160.000000\n","25% 2193.000000\n","50% 2195.000000\n","75% 2197.000000\n","max 2245.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 10\n","count 3000.000000\n","mean 2195.917667\n","std 7.767732\n","min 2160.000000\n","25% 2193.000000\n","50% 2195.000000\n","75% 2197.000000\n","max 2245.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"4a5d93c643ba401ba00c3d9099e4be3c","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"9bf533ad0e064d768e215dfc6dac4c8c","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 10\n","count 3000.000000\n","mean 2237.338667\n","std 8.617118\n","min 2198.000000\n","25% 2234.000000\n","50% 2236.000000\n","75% 2239.000000\n","max 2289.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"c966d4370e7e404e87abc210685cb80e","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"53bfd02c002d4737a16905d7c0b4aa90","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 10\n","count 3000.000000\n","mean 2237.338667\n","std 8.617118\n","min 2198.000000\n","25% 2234.000000\n","50% 2236.000000\n","75% 2239.000000\n","max 2289.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"36c65b6b931b4c8ab915d3b8b95d8d77","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8ac092b4189b4e8fa19ea4ede0b6d559","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 10\n","count 3000.000000\n","mean 2237.338667\n","std 8.617118\n","min 2198.000000\n","25% 2234.000000\n","50% 2236.000000\n","75% 2239.000000\n","max 2289.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 10\n","count 3000.000000\n","mean 2237.338667\n","std 8.617118\n","min 2198.000000\n","25% 2234.000000\n","50% 2236.000000\n","75% 2239.000000\n","max 2289.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 10\n","count 3000.000000\n","mean 2237.338667\n","std 8.617118\n","min 2198.000000\n","25% 2234.000000\n","50% 2236.000000\n","75% 2239.000000\n","max 2289.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 20\n","count 3000.000000\n","mean 5202.091000\n","std 9.115687\n","min 5143.000000\n","25% 5201.000000\n","50% 5202.000000\n","75% 5205.000000\n","max 5283.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 20\n","count 3000.000000\n","mean 5202.091000\n","std 9.115687\n","min 5143.000000\n","25% 5201.000000\n","50% 5202.000000\n","75% 5205.000000\n","max 5283.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 20\n","count 3000.000000\n","mean 7263.354000\n","std 15.567385\n","min 7158.000000\n","25% 7262.000000\n","50% 7266.000000\n","75% 7270.000000\n","max 7392.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 20\n","count 3000.000000\n","mean 4015.917667\n","std 7.767732\n","min 3980.000000\n","25% 4013.000000\n","50% 4015.000000\n","75% 4017.000000\n","max 4065.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 20\n","count 3000.000000\n","mean 4015.917667\n","std 7.767732\n","min 3980.000000\n","25% 4013.000000\n","50% 4015.000000\n","75% 4017.000000\n","max 4065.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 20\n","count 3000.000000\n","mean 4015.917667\n","std 7.767732\n","min 3980.000000\n","25% 4013.000000\n","50% 4015.000000\n","75% 4017.000000\n","max 4065.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f41a7aacc870496eaae4e67d3796826a","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"1f405d1f86bf494599ce97758bee3abe","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 20\n","count 3000.000000\n","mean 4124.338667\n","std 8.617118\n","min 4085.000000\n","25% 4121.000000\n","50% 4123.000000\n","75% 4126.000000\n","max 4176.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d97b558254eb4596b3754ab2b1b8eadb","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2d1a46fcef47448d91a263645e46e056","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 20\n","count 3000.000000\n","mean 4124.338667\n","std 8.617118\n","min 4085.000000\n","25% 4121.000000\n","50% 4123.000000\n","75% 4126.000000\n","max 4176.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"06cd22b5216944dcb45b8ad44a7451c4","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3ba05b7053444f0a9189d753f91baed3","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 20\n","count 3000.000000\n","mean 4124.338667\n","std 8.617118\n","min 4085.000000\n","25% 4121.000000\n","50% 4123.000000\n","75% 4126.000000\n","max 4176.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 20\n","count 3000.000000\n","mean 4124.338667\n","std 8.617118\n","min 4085.000000\n","25% 4121.000000\n","50% 4123.000000\n","75% 4126.000000\n","max 4176.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 20\n","count 3000.000000\n","mean 4124.338667\n","std 8.617118\n","min 4085.000000\n","25% 4121.000000\n","50% 4123.000000\n","75% 4126.000000\n","max 4176.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 30\n","count 3000.000000\n","mean 7687.091000\n","std 9.115687\n","min 7628.000000\n","25% 7686.000000\n","50% 7687.000000\n","75% 7690.000000\n","max 7768.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 30\n","count 3000.000000\n","mean 7687.091000\n","std 9.115687\n","min 7628.000000\n","25% 7686.000000\n","50% 7687.000000\n","75% 7690.000000\n","max 7768.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 30\n","count 3000.000000\n","mean 10675.354000\n","std 15.567385\n","min 10570.000000\n","25% 10674.000000\n","50% 10678.000000\n","75% 10682.000000\n","max 10804.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 30\n","count 3000.000000\n","mean 5853.917667\n","std 7.767732\n","min 5818.000000\n","25% 5851.000000\n","50% 5853.000000\n","75% 5855.000000\n","max 5903.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 30\n","count 3000.000000\n","mean 5853.917667\n","std 7.767732\n","min 5818.000000\n","25% 5851.000000\n","50% 5853.000000\n","75% 5855.000000\n","max 5903.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 30\n","count 3000.000000\n","mean 5853.917667\n","std 7.767732\n","min 5818.000000\n","25% 5851.000000\n","50% 5853.000000\n","75% 5855.000000\n","max 5903.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3653e64e34334d5fa27ef82076f9e172","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"63f88697fbc3406784f6b588275d1ad4","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 30\n","count 3000.000000\n","mean 6055.338667\n","std 8.617118\n","min 6016.000000\n","25% 6052.000000\n","50% 6054.000000\n","75% 6057.000000\n","max 6107.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d137cdc92b7a4b88a9b47da5f70d0a38","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"af1a32214fd840bfb27a89e5aa4022ef","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 30\n","count 3000.000000\n","mean 6055.338667\n","std 8.617118\n","min 6016.000000\n","25% 6052.000000\n","50% 6054.000000\n","75% 6057.000000\n","max 6107.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"dfab7500c59e414d99fe9a7756bdcead","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3e17d62403664c0a8100b7b68d51dd83","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 30\n","count 3000.000000\n","mean 6055.338667\n","std 8.617118\n","min 6016.000000\n","25% 6052.000000\n","50% 6054.000000\n","75% 6057.000000\n","max 6107.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 30\n","count 3000.000000\n","mean 6055.338667\n","std 8.617118\n","min 6016.000000\n","25% 6052.000000\n","50% 6054.000000\n","75% 6057.000000\n","max 6107.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 30\n","count 3000.000000\n","mean 6055.338667\n","std 8.617118\n","min 6016.000000\n","25% 6052.000000\n","50% 6054.000000\n","75% 6057.000000\n","max 6107.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 40\n","count 3000.000000\n","mean 10136.091000\n","std 9.115687\n","min 10077.000000\n","25% 10135.000000\n","50% 10136.000000\n","75% 10139.000000\n","max 10217.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 40\n","count 3000.000000\n","mean 10136.091000\n","std 9.115687\n","min 10077.000000\n","25% 10135.000000\n","50% 10136.000000\n","75% 10139.000000\n","max 10217.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 40\n","count 3000.000000\n","mean 14023.354000\n","std 15.567385\n","min 13918.000000\n","25% 14022.000000\n","50% 14026.000000\n","75% 14030.000000\n","max 14152.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 40\n","count 3000.000000\n","mean 7659.917667\n","std 7.767732\n","min 7624.000000\n","25% 7657.000000\n","50% 7659.000000\n","75% 7661.000000\n","max 7709.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 40\n","count 3000.000000\n","mean 7659.917667\n","std 7.767732\n","min 7624.000000\n","25% 7657.000000\n","50% 7659.000000\n","75% 7661.000000\n","max 7709.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 40\n","count 3000.000000\n","mean 7659.917667\n","std 7.767732\n","min 7624.000000\n","25% 7657.000000\n","50% 7659.000000\n","75% 7661.000000\n","max 7709.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"74cad7e27d1747a9b922b21ca4455205","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8458ed58792e4c8ca6d2aedd731ad21a","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 40\n","count 3000.000000\n","mean 7958.338667\n","std 8.617118\n","min 7919.000000\n","25% 7955.000000\n","50% 7957.000000\n","75% 7960.000000\n","max 8010.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a84ce4d14b8942daad8a0146700250cf","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"80a7a5b379434e0f9677905d7967c40e","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 40\n","count 3000.000000\n","mean 7958.338667\n","std 8.617118\n","min 7919.000000\n","25% 7955.000000\n","50% 7957.000000\n","75% 7960.000000\n","max 8010.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8fdd0391cb034eb28ae6ea6c77a75492","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d5c0356e6c9e4d28982b23f08699af6f","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 40\n","count 3000.000000\n","mean 7958.338667\n","std 8.617118\n","min 7919.000000\n","25% 7955.000000\n","50% 7957.000000\n","75% 7960.000000\n","max 8010.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 40\n","count 3000.000000\n","mean 7958.338667\n","std 8.617118\n","min 7919.000000\n","25% 7955.000000\n","50% 7957.000000\n","75% 7960.000000\n","max 8010.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 40\n","count 3000.000000\n","mean 7958.338667\n","std 8.617118\n","min 7919.000000\n","25% 7955.000000\n","50% 7957.000000\n","75% 7960.000000\n","max 8010.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-8B-Chinese-Chat, Shots: 50\n","count 3000.000000\n","mean 12638.091000\n","std 9.115687\n","min 12579.000000\n","25% 12637.000000\n","50% 12638.000000\n","75% 12641.000000\n","max 12719.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Llama3.1-70B-Chinese-Chat, Shots: 50\n","count 3000.000000\n","mean 12638.091000\n","std 9.115687\n","min 12579.000000\n","25% 12637.000000\n","50% 12638.000000\n","75% 12641.000000\n","max 12719.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Mistral-7B-v0.3-Chinese-Chat, Shots: 50\n","count 3000.000000\n","mean 17459.354000\n","std 15.567385\n","min 17354.000000\n","25% 17458.000000\n","50% 17462.000000\n","75% 17466.000000\n","max 17588.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat, Shots: 50\n","count 3000.000000\n","mean 9511.917667\n","std 7.767732\n","min 9476.000000\n","25% 9509.000000\n","50% 9511.000000\n","75% 9513.000000\n","max 9561.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-7b-chat-1m, Shots: 50\n","count 3000.000000\n","mean 9511.917667\n","std 7.767732\n","min 9476.000000\n","25% 9509.000000\n","50% 9511.000000\n","75% 9513.000000\n","max 9561.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: internlm2_5-20b-chat, Shots: 50\n","count 3000.000000\n","mean 9511.917667\n","std 7.767732\n","min 9476.000000\n","25% 9509.000000\n","50% 9511.000000\n","75% 9513.000000\n","max 9561.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"1ed79f4e6862426ab1739eb15e1505f5","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"e63f59a07b7a42d6a143c53e04028316","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-0.5B-Instruct, Shots: 50\n","count 3000.000000\n","mean 9909.338667\n","std 8.617118\n","min 9870.000000\n","25% 9906.000000\n","50% 9908.000000\n","75% 9911.000000\n","max 9961.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3b44e26442bc4f9dbac30f96d0d81a78","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"e1211bd4a3e44a77860eb4617a51b530","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-1.5B-Instruct, Shots: 50\n","count 3000.000000\n","mean 9909.338667\n","std 8.617118\n","min 9870.000000\n","25% 9906.000000\n","50% 9908.000000\n","75% 9911.000000\n","max 9961.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"03720de4193c4f90af36ad30d078f16a","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/25000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2ed60e4d9e944775a93932aac80e1cb5","version_major":2,"version_minor":0},"text/plain":["Map: 0%| | 0/3000 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-3B-Instruct, Shots: 50\n","count 3000.000000\n","mean 9909.338667\n","std 8.617118\n","min 9870.000000\n","25% 9906.000000\n","50% 9908.000000\n","75% 9911.000000\n","max 9961.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-7B-Instruct, Shots: 50\n","count 3000.000000\n","mean 9909.338667\n","std 8.617118\n","min 9870.000000\n","25% 9906.000000\n","50% 9908.000000\n","75% 9911.000000\n","max 9961.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth', 'train_text', 'prompt'],\n"," num_rows: 3000\n"," })\n","})\n","Model: Qwen2.5-72B-Instruct, Shots: 50\n","count 3000.000000\n","mean 9909.338667\n","std 8.617118\n","min 9870.000000\n","25% 9906.000000\n","50% 9908.000000\n","75% 9911.000000\n","max 9961.000000\n","Name: num_tokens, dtype: float64\n"]}],"source":["from transformers import (\n"," AutoTokenizer,\n",")\n","\n","from llm_toolkit.llm_utils import print_row_details\n","\n","model_test_dfs = {}\n","\n","for num_shots in [0, 5, 10, 20, 30, 40, 50]:\n"," for model_name in model_orders.keys():\n"," model_id = (\n"," model_perf_dfs[model_name][\"run\"].unique()[0].split(model_name)[0]\n"," + model_name\n"," )\n"," tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n","\n"," datasets = load_logical_reasoning_dataset(\n"," data_path,\n"," tokenizer=tokenizer,\n"," chinese_prompt=True,\n"," using_p1=False,\n"," num_shots=num_shots,\n"," )\n"," print(f\"Model: {model_name}, Shots: {num_shots}\")\n"," test_df = datasets[\"test\"].to_pandas()\n"," test_df[\"num_tokens\"] = test_df[\"prompt\"].apply(\n"," lambda x: len(tokenizer(x)[\"input_ids\"])\n"," )\n"," \n"," print(test_df[\"num_tokens\"].describe())\n","\n"," model_test_dfs[(model_name, num_shots)] = test_df"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["import tiktoken\n","\n","\n","def num_tokens_from_text(text, model=\"gpt-4o\"):\n"," \"\"\"Return the number of tokens used by a list of messages.\"\"\"\n"," try:\n"," encoding = tiktoken.encoding_for_model(model)\n"," except KeyError:\n"," # print(\"Warning: model not found. Using cl100k_base encoding.\")\n"," encoding = tiktoken.get_encoding(\"cl100k_base\")\n","\n"," return len(encoding.encode(text))\n","\n","\n","def num_tokens_from_row(row, num_shots, train_dataset, model=\"gpt-4o\"):\n"," prompt = get_few_shot_prompt_template(\n"," num_shots, train_dataset\n"," )\n"," text = prompt.format(row[\"puzzle\"], row[\"truth\"], row[\"text\"])\n"," return num_tokens_from_text(text, model=model)"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 0\n","count 3000.000000\n","mean 524.806333\n","std 10.057595\n","min 464.000000\n","25% 522.000000\n","50% 525.000000\n","75% 528.250000\n","max 606.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 0\n","count 3000.000000\n","mean 524.806333\n","std 10.057595\n","min 464.000000\n","25% 522.000000\n","50% 525.000000\n","75% 528.250000\n","max 606.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 0\n","count 3000.000000\n","mean 797.595333\n","std 16.417250\n","min 682.000000\n","25% 797.000000\n","50% 799.000000\n","75% 803.000000\n","max 925.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 0\n","count 3000.000000\n","mean 797.595333\n","std 16.417250\n","min 682.000000\n","25% 797.000000\n","50% 799.000000\n","75% 803.000000\n","max 925.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 5\n","count 3000.000000\n","mean 1629.806333\n","std 10.057595\n","min 1569.000000\n","25% 1627.000000\n","50% 1630.000000\n","75% 1633.250000\n","max 1711.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 5\n","count 3000.000000\n","mean 1629.806333\n","std 10.057595\n","min 1569.000000\n","25% 1627.000000\n","50% 1630.000000\n","75% 1633.250000\n","max 1711.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 5\n","count 3000.000000\n","mean 2521.595333\n","std 16.417250\n","min 2406.000000\n","25% 2521.000000\n","50% 2523.000000\n","75% 2527.000000\n","max 2649.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 5\n","count 3000.000000\n","mean 2521.595333\n","std 16.417250\n","min 2406.000000\n","25% 2521.000000\n","50% 2523.000000\n","75% 2527.000000\n","max 2649.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 10\n","count 3000.000000\n","mean 2699.806333\n","std 10.057595\n","min 2639.000000\n","25% 2697.000000\n","50% 2700.000000\n","75% 2703.250000\n","max 2781.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 10\n","count 3000.000000\n","mean 2699.806333\n","std 10.057595\n","min 2639.000000\n","25% 2697.000000\n","50% 2700.000000\n","75% 2703.250000\n","max 2781.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 10\n","count 3000.000000\n","mean 4154.595333\n","std 16.417250\n","min 4039.000000\n","25% 4154.000000\n","50% 4156.000000\n","75% 4160.000000\n","max 4282.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 10\n","count 3000.000000\n","mean 4154.595333\n","std 16.417250\n","min 4039.000000\n","25% 4154.000000\n","50% 4156.000000\n","75% 4160.000000\n","max 4282.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 20\n","count 3000.000000\n","mean 5028.806333\n","std 10.057595\n","min 4968.000000\n","25% 5026.000000\n","50% 5029.000000\n","75% 5032.250000\n","max 5110.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 20\n","count 3000.000000\n","mean 5028.806333\n","std 10.057595\n","min 4968.000000\n","25% 5026.000000\n","50% 5029.000000\n","75% 5032.250000\n","max 5110.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 20\n","count 3000.000000\n","mean 7653.595333\n","std 16.417250\n","min 7538.000000\n","25% 7653.000000\n","50% 7655.000000\n","75% 7659.000000\n","max 7781.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 20\n","count 3000.000000\n","mean 7653.595333\n","std 16.417250\n","min 7538.000000\n","25% 7653.000000\n","50% 7655.000000\n","75% 7659.000000\n","max 7781.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 30\n","count 3000.000000\n","mean 7490.806333\n","std 10.057595\n","min 7430.000000\n","25% 7488.000000\n","50% 7491.000000\n","75% 7494.250000\n","max 7572.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 30\n","count 3000.000000\n","mean 7490.806333\n","std 10.057595\n","min 7430.000000\n","25% 7488.000000\n","50% 7491.000000\n","75% 7494.250000\n","max 7572.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 30\n","count 3000.000000\n","mean 11331.595333\n","std 16.417250\n","min 11216.000000\n","25% 11331.000000\n","50% 11333.000000\n","75% 11337.000000\n","max 11459.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 30\n","count 3000.000000\n","mean 11331.595333\n","std 16.417250\n","min 11216.000000\n","25% 11331.000000\n","50% 11333.000000\n","75% 11337.000000\n","max 11459.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 40\n","count 3000.000000\n","mean 9916.806333\n","std 10.057595\n","min 9856.000000\n","25% 9914.000000\n","50% 9917.000000\n","75% 9920.250000\n","max 9998.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 40\n","count 3000.000000\n","mean 9916.806333\n","std 10.057595\n","min 9856.000000\n","25% 9914.000000\n","50% 9917.000000\n","75% 9920.250000\n","max 9998.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 40\n","count 3000.000000\n","mean 14933.595333\n","std 16.417250\n","min 14818.000000\n","25% 14933.000000\n","50% 14935.000000\n","75% 14939.000000\n","max 15061.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 40\n","count 3000.000000\n","mean 14933.595333\n","std 16.417250\n","min 14818.000000\n","25% 14933.000000\n","50% 14935.000000\n","75% 14939.000000\n","max 15061.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o, Shots: 50\n","count 3000.000000\n","mean 12394.806333\n","std 10.057595\n","min 12334.000000\n","25% 12392.000000\n","50% 12395.000000\n","75% 12398.250000\n","max 12476.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: gpt-4o-mini, Shots: 50\n","count 3000.000000\n","mean 12394.806333\n","std 10.057595\n","min 12334.000000\n","25% 12392.000000\n","50% 12395.000000\n","75% 12398.250000\n","max 12476.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-preview, Shots: 50\n","count 3000.000000\n","mean 18632.595333\n","std 16.417250\n","min 18517.000000\n","25% 18632.000000\n","50% 18634.000000\n","75% 18638.000000\n","max 18760.000000\n","Name: num_tokens, dtype: float64\n","loading train/test data files\n","DatasetDict({\n"," train: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 25000\n"," })\n"," test: Dataset({\n"," features: ['text', 'label', 'title', 'puzzle', 'truth'],\n"," num_rows: 3000\n"," })\n","})\n","Model: o1-mini, Shots: 50\n","count 3000.000000\n","mean 18632.595333\n","std 16.417250\n","min 18517.000000\n","25% 18632.000000\n","50% 18634.000000\n","75% 18638.000000\n","max 18760.000000\n","Name: num_tokens, dtype: float64\n"]}],"source":["for num_shots in [0, 5, 10, 20, 30, 40, 50]:\n"," for model_name in [\"gpt-4o\", \"gpt-4o-mini\", \"o1-preview\", \"o1-mini\"]:\n","\n"," datasets = load_logical_reasoning_dataset(\n"," data_path,\n"," )\n"," print(f\"Model: {model_name}, Shots: {num_shots}\")\n"," test_df = datasets[\"test\"].to_pandas()\n"," # print_row_details(test_df)\n"," test_df[\"num_tokens\"] = test_df.apply(\n"," lambda x: num_tokens_from_row(x, num_shots, datasets[\"train\"].to_pandas(), model=model_name), axis=1\n"," )\n"," print(test_df[\"num_tokens\"].describe())\n","\n"," model_test_dfs[(model_name, num_shots)] = test_df"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," model_name | \n"," num_shots | \n"," max | \n"," min | \n"," mean | \n"," std | \n","
\n"," \n"," \n"," \n"," 0 | \n"," Llama3.1-8B-Chinese-Chat | \n"," 0 | \n"," 652 | \n"," 512 | \n"," 571.091000 | \n"," 9.115687 | \n","
\n"," \n"," 1 | \n"," Llama3.1-70B-Chinese-Chat | \n"," 0 | \n"," 652 | \n"," 512 | \n"," 571.091000 | \n"," 9.115687 | \n","
\n"," \n"," 2 | \n"," Mistral-7B-v0.3-Chinese-Chat | \n"," 0 | \n"," 928 | \n"," 694 | \n"," 799.354000 | \n"," 15.567385 | \n","
\n"," \n"," 3 | \n"," internlm2_5-7b-chat | \n"," 0 | \n"," 511 | \n"," 426 | \n"," 461.917667 | \n"," 7.767732 | \n","
\n"," \n"," 4 | \n"," internlm2_5-7b-chat-1m | \n"," 0 | \n"," 511 | \n"," 426 | \n"," 461.917667 | \n"," 7.767732 | \n","
\n"," \n"," 5 | \n"," internlm2_5-20b-chat | \n"," 0 | \n"," 511 | \n"," 426 | \n"," 461.917667 | \n"," 7.767732 | \n","
\n"," \n"," 6 | \n"," Qwen2.5-0.5B-Instruct | \n"," 0 | \n"," 517 | \n"," 426 | \n"," 465.338667 | \n"," 8.617118 | \n","
\n"," \n"," 7 | \n"," Qwen2.5-1.5B-Instruct | \n"," 0 | \n"," 517 | \n"," 426 | \n"," 465.338667 | \n"," 8.617118 | \n","
\n"," \n"," 8 | \n"," Qwen2.5-3B-Instruct | \n"," 0 | \n"," 517 | \n"," 426 | \n"," 465.338667 | \n"," 8.617118 | \n","
\n"," \n"," 9 | \n"," Qwen2.5-7B-Instruct | \n"," 0 | \n"," 517 | \n"," 426 | \n"," 465.338667 | \n"," 8.617118 | \n","
\n"," \n","
\n","
"],"text/plain":[" model_name num_shots max min mean std\n","0 Llama3.1-8B-Chinese-Chat 0 652 512 571.091000 9.115687\n","1 Llama3.1-70B-Chinese-Chat 0 652 512 571.091000 9.115687\n","2 Mistral-7B-v0.3-Chinese-Chat 0 928 694 799.354000 15.567385\n","3 internlm2_5-7b-chat 0 511 426 461.917667 7.767732\n","4 internlm2_5-7b-chat-1m 0 511 426 461.917667 7.767732\n","5 internlm2_5-20b-chat 0 511 426 461.917667 7.767732\n","6 Qwen2.5-0.5B-Instruct 0 517 426 465.338667 8.617118\n","7 Qwen2.5-1.5B-Instruct 0 517 426 465.338667 8.617118\n","8 Qwen2.5-3B-Instruct 0 517 426 465.338667 8.617118\n","9 Qwen2.5-7B-Instruct 0 517 426 465.338667 8.617118"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["df_token_counts = pd.DataFrame(\n"," model_test_dfs.keys(), columns=[\"model_name\", \"num_shots\"]\n",")\n","\n","max = []\n","min = []\n","mean = []\n","std = []\n","\n","for model_name, num_shots in model_test_dfs.keys():\n"," test_df = model_test_dfs[(model_name, num_shots)]\n"," max.append(test_df[\"num_tokens\"].max())\n"," min.append(test_df[\"num_tokens\"].min())\n"," mean.append(test_df[\"num_tokens\"].mean())\n"," std.append(test_df[\"num_tokens\"].std())\n","\n","df_token_counts[\"max\"] = max\n","df_token_counts[\"min\"] = min\n","df_token_counts[\"mean\"] = mean\n","df_token_counts[\"std\"] = std\n","\n","df_token_counts.head(10)"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," model_name | \n"," num_shots | \n"," max | \n"," min | \n"," mean | \n"," std | \n","
\n"," \n"," \n"," \n"," 0 | \n"," Llama3.1-8B-Chinese-Chat | \n"," 0 | \n"," 652 | \n"," 512 | \n"," 571.091000 | \n"," 9.115687 | \n","
\n"," \n"," 1 | \n"," Llama3.1-70B-Chinese-Chat | \n"," 0 | \n"," 652 | \n"," 512 | \n"," 571.091000 | \n"," 9.115687 | \n","
\n"," \n"," 2 | \n"," Mistral-7B-v0.3-Chinese-Chat | \n"," 0 | \n"," 928 | \n"," 694 | \n"," 799.354000 | \n"," 15.567385 | \n","
\n"," \n"," 3 | \n"," internlm2_5-7b-chat | \n"," 0 | \n"," 511 | \n"," 426 | \n"," 461.917667 | \n"," 7.767732 | \n","
\n"," \n"," 4 | \n"," internlm2_5-7b-chat-1m | \n"," 0 | \n"," 511 | \n"," 426 | \n"," 461.917667 | \n"," 7.767732 | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 100 | \n"," o1-mini | \n"," 40 | \n"," 15061 | \n"," 14818 | \n"," 14933.595333 | \n"," 16.417250 | \n","
\n"," \n"," 101 | \n"," gpt-4o | \n"," 50 | \n"," 12476 | \n"," 12334 | \n"," 12394.806333 | \n"," 10.057595 | \n","
\n"," \n"," 102 | \n"," gpt-4o-mini | \n"," 50 | \n"," 12476 | \n"," 12334 | \n"," 12394.806333 | \n"," 10.057595 | \n","
\n"," \n"," 103 | \n"," o1-preview | \n"," 50 | \n"," 18760 | \n"," 18517 | \n"," 18632.595333 | \n"," 16.417250 | \n","
\n"," \n"," 104 | \n"," o1-mini | \n"," 50 | \n"," 18760 | \n"," 18517 | \n"," 18632.595333 | \n"," 16.417250 | \n","
\n"," \n","
\n","
105 rows × 6 columns
\n","
"],"text/plain":[" model_name num_shots max min mean \\\n","0 Llama3.1-8B-Chinese-Chat 0 652 512 571.091000 \n","1 Llama3.1-70B-Chinese-Chat 0 652 512 571.091000 \n","2 Mistral-7B-v0.3-Chinese-Chat 0 928 694 799.354000 \n","3 internlm2_5-7b-chat 0 511 426 461.917667 \n","4 internlm2_5-7b-chat-1m 0 511 426 461.917667 \n",".. ... ... ... ... ... \n","100 o1-mini 40 15061 14818 14933.595333 \n","101 gpt-4o 50 12476 12334 12394.806333 \n","102 gpt-4o-mini 50 12476 12334 12394.806333 \n","103 o1-preview 50 18760 18517 18632.595333 \n","104 o1-mini 50 18760 18517 18632.595333 \n","\n"," std \n","0 9.115687 \n","1 9.115687 \n","2 15.567385 \n","3 7.767732 \n","4 7.767732 \n",".. ... \n","100 16.417250 \n","101 10.057595 \n","102 10.057595 \n","103 16.417250 \n","104 16.417250 \n","\n","[105 rows x 6 columns]"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["df_token_counts"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[{"data":{"image/png":"","text/plain":["