{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from tokenization_qwen import QWenTokenizer\n", "from tokenization_qwen_sub import QWenTokenizer as QWenTokenizer_SUB\n", "\n", "tokenizer = QWenTokenizer(vocab_file=\"./qwen.tiktoken\")\n", "tokenizer_sub = QWenTokenizer_SUB(vocab_file=\"./modified_qwen_sub.tiktoken\")\n", "# tokenizer = QWenTokenizer(vocab_file=\"./modified_qwen.tiktoken\")\n", "# tokenizer = QWenTokenizer(vocab_file=\"./modified_qwen_sub.tiktoken\")\n", "\n", "# print(tokenizer.tokenizer.encode(\"Hello World\", allowed_special=set(\"<|extra_40|>\")))\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "151860\n", "151860\n" ] } ], "source": [ "print(len(tokenizer))\n", "print(len(tokenizer_sub))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "layout_generation_exmaple = \"\"\"<loc_100><loc_563><loc_709><loc_19><table><loc_97><loc_370><loc_817><loc_115><table><loc_99><loc_607><loc_809><loc_86>\"\"\"\n", "# layout_generation_exmaple = \"<|extra_40|>\"" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[151649, 27, 1074, 62, 22, 17, 1784, 1074, 62, 20, 20, 1784, 1074, 62, 18, 22, 17, 1784, 1074, 62, 17, 15, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 16, 16, 23, 1784, 1074, 62, 22, 23, 24, 1784, 1074, 62, 19, 17, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 16, 21, 22, 1784, 1074, 62, 22, 23, 23, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 17, 17, 24, 1784, 1074, 62, 22, 23, 18, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 17, 24, 17, 1784, 1074, 62, 22, 18, 18, 1784, 1074, 62, 19, 17, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 18, 19, 16, 1784, 1074, 62, 22, 20, 22, 1784, 1074, 62, 17, 24, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 20, 15, 15, 1784, 1074, 62, 22, 24, 23, 1784, 1074, 62, 19, 17, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 20, 23, 22, 1784, 1074, 62, 20, 17, 18, 1784, 1074, 62, 16, 20, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 22, 15, 20, 1784, 1074, 62, 23, 15, 24, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 22, 21, 23, 1784, 1074, 62, 22, 15, 17, 1784, 1074, 62, 17, 24, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 23, 15, 18, 1784, 1074, 62, 23, 15, 24, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 24, 19, 22, 1784, 1074, 62, 24, 23, 22, 1784, 1074, 62, 17, 19, 1784, 1074, 62, 16, 22, 29, 151651, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 20, 21, 18, 1784, 1074, 62, 22, 15, 24, 1784, 1074, 62, 16, 24, 29, 151652, 27, 1074, 62, 24, 22, 1784, 1074, 62, 18, 22, 15, 1784, 1074, 62, 23, 16, 22, 1784, 1074, 62, 16, 16, 20, 29, 151652, 27, 1074, 62, 24, 24, 1784, 1074, 62, 21, 15, 22, 1784, 1074, 62, 23, 15, 24, 1784, 1074, 62, 23, 21, 29]\n", "[151649, 150715, 150698, 151015, 150663, 151649, 150743, 150761, 151432, 150685, 151649, 150743, 150810, 151431, 150699, 151649, 150743, 150872, 151426, 150699, 151649, 150743, 150935, 151376, 150685, 151649, 150743, 150984, 151400, 150672, 151649, 150743, 151143, 151441, 150685, 151649, 150743, 151230, 151166, 150658, 151649, 150743, 151348, 151452, 150699, 151649, 150743, 151411, 151345, 150672, 151649, 150743, 151446, 151452, 150699, 151649, 151590, 151630, 150667, 150660, 151651, 
150743, 151206, 151352, 150662, 151652, 150740, 151013, 151460, 150758, 151652, 150742, 151250, 151452, 150729]\n" ] } ], "source": [ "tokens = tokenizer.tokenizer.encode(str(layout_generation_exmaple), allowed_special=\"all\")\n", "print(tokens)\n", "tokens_sub = tokenizer_sub.tokenizer.encode(str(layout_generation_exmaple), allowed_special=\"all\")\n", "print(tokens_sub)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# decoded = tokenizer.tokenizer.decode(tokens)\n", "# new_decoded = tokenizer.tokenizer.decode(new_tokens)\n", "# print(decoded)\n", "# print(new_decoded)\n", "for i in tokens:\n", " decoded_token = tokenizer.tokenizer.decode([i])\n", " print(f\"{i},{decoded_token}\\n\")\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(len(tokenizer))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# map the token id to the token\n", "for i in tokens:\n", " decoded_token = tokenizer.tokenizer.decode([i])\n", " print(f\"{i},{decoded_token}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "\n", "tokenizer = tiktoken.model.load_tiktoken_bpe(\"./qwen.tiktoken\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "import os\n", "\n", "def modify_tiktoken_file(input_file, output_file):\n", " # Read the file contents directly\n", " with open(input_file, 'rb') as f:\n", " original_content = f.readlines()\n", " \n", " # Generate the new location unique tokens\n", " LOCATION_UNIQUE_TOKENS = tuple(f\"<loc_{i}>\" for i in range(1, 1001))\n", " \n", " # Prepare the new content\n", " new_content = []\n", " \n", " # Keep the version header if it exists\n", " if original_content and original_content[0].startswith(b'version:'):\n", " new_content.append(original_content[0])\n", " original_content = original_content[1:]\n", " \n", " # Keep all existing tokens except the last 1000\n", " existing_tokens = original_content[:-1000] if len(original_content) > 1000 else []\n", " new_content.extend(existing_tokens)\n", " \n", " # Add the new location tokens\n", " for token in LOCATION_UNIQUE_TOKENS:\n", " # Encode the token and create a rank (you might want to adjust the rank strategy)\n", " encoded_token = token.encode('utf-8')\n", " # Use a high rank to ensure these are at the end\n", " new_content.append(encoded_token + b' ' + str(len(existing_tokens) + LOCATION_UNIQUE_TOKENS.index(token)).encode('utf-8') + b'\\n')\n", " \n", " # Save the modified tokenizer\n", " with open(output_file, 'wb') as f:\n", " f.writelines(new_content)\n", " \n", " print(f\"Modified tokenizer saved to {output_file}\")\n", " print(f\"Total tokens in new file: {len(new_content)}\")\n", "\n", "# Example usage\n", "input_tiktoken_file = './qwen.tiktoken'\n", "output_tiktoken_file = './modified_qwen.tiktoken'\n", "\n", "modify_tiktoken_file(input_tiktoken_file, output_tiktoken_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "from typing import Dict\n", "\n", "def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:\n", " with open(tiktoken_bpe_file, \"rb\") as f:\n", " contents = f.read()\n", " return {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", "\n", "# Path to your 
.tiktoken file\n", "tiktoken_bpe_file = \"./qwen.tiktoken\"\n", "\n", "# Load the BPE encoding\n", "bpe_data = _load_tiktoken_bpe(tiktoken_bpe_file)\n", "\n", "# Example usage\n", "# print(\"Loaded BPE Data:\", bpe_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"./qwen.tiktoken\", 'rb') as f:\n", " contents = f.read()\n", "\n", "# Parse the original vocabulary\n", "vocab = {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", "}\n", "\n", "base_vocab_size = len(vocab)\n", "print(base_vocab_size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "location_tokens = [f\"<loc_{i}>\" for i in range(1, 1001)]\n", "print(location_tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens_to_remove = sorted(vocab.items(), key=lambda x: x[1])[-1000:]\n", "print(tokens_to_remove)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "import tiktoken\n", "\n", "def modify_tokenizer(input_file='./qwen.tiktoken', output_file='./modified_qwen.tiktoken'):\n", " # Read the original tokenizer file\n", " with open(input_file, 'rb') as f:\n", " contents = f.read()\n", " \n", " # Parse the original vocabulary\n", " vocab = {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", " \n", " # Get the base vocabulary size (excluding special tokens)\n", " base_vocab_size = len(vocab)\n", " \n", " # Create location tokens\n", " location_tokens = [f\"<loc_{i}>\" for i in range(1, 1001)]\n", " \n", " # Remove the last 1000 tokens from the vocabulary\n", " tokens_to_remove = sorted(vocab.items(), key=lambda x: x[1])[-1000:]\n", " for token, _ in tokens_to_remove:\n", " del vocab[token]\n", " \n", " # Add location tokens\n", " for i, token in enumerate(location_tokens):\n", " vocab[token.encode('utf-8')] = base_vocab_size - 1000 + i\n", " \n", " # Write the modified vocabulary to the new file\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " for token, rank in sorted(vocab.items(), key=lambda x: x[1]):\n", " # Encode the token in base64\n", " token_b64 = base64.b64encode(token).decode('utf-8')\n", " f.write(f\"{token_b64} {rank}\\n\")\n", " \n", " print(f\"Modified tokenizer saved to {output_file}\")\n", " print(f\"Added {len(location_tokens)} location tokens\")\n", " print(f\"Final vocabulary size: {len(vocab)}\")\n", "\n", "\n", "modify_tokenizer()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "import tiktoken\n", "from typing import Dict\n", "from tabulate import tabulate\n", "\n", "# Define special tokens from the original tokenizer\n", "ENDOFTEXT = \"<|endoftext|>\"\n", "IMSTART = \"<|im_start|>\"\n", "IMEND = \"<|im_end|>\"\n", "DOCUMENT_UNIQUE_TOKENS = tuple([\"<caption>\", \"<formula>\", \"<list>\", \"<text>\", \"<image>\", \"<title>\", \"<table>\", \"<LD>\", \"<TE>\", \"<MF>\", \"<IC>\", \"<OCR>\", \"<POCR>\", \"<VQA>\", \"<DVQA>\"])\n", "LOCATION_UNIQUE_TOKENS = tuple([f\"<loc_{i}>\" for i in range(0, 1001)])\n", "EXTRAS = tuple((f\"<|extra_{i}|>\" for i in range(len(DOCUMENT_UNIQUE_TOKENS), 205)))\n", "\n", "# Include location tokens in special tokens\n", "SPECIAL_TOKENS = (ENDOFTEXT, IMSTART, IMEND) + DOCUMENT_UNIQUE_TOKENS + EXTRAS + 
LOCATION_UNIQUE_TOKENS\n", "\n", "def load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:\n", " with open(tiktoken_bpe_file, \"rb\") as f:\n", " contents = f.read()\n", " return {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", "\n", "def decode_token(token_bytes):\n", " \"\"\"Attempt to decode bytes to string, fallback to base64 if not UTF-8 decodable\"\"\"\n", " try:\n", " return token_bytes.decode('utf-8')\n", " except UnicodeDecodeError:\n", " return f\"<bytes>{base64.b64encode(token_bytes).decode('utf-8')}\"\n", "\n", "def compare_tokenizers(old_file='./qwen.tiktoken', new_file='./modified_qwen.tiktoken'):\n", " # Load both tokenizers\n", " old_vocab = load_tiktoken_bpe(old_file)\n", " new_vocab = load_tiktoken_bpe(new_file)\n", " \n", " # Create special tokens dictionary including location tokens\n", " special_tokens = {\n", " token: index\n", " for index, token in enumerate(SPECIAL_TOKENS, start=len(new_vocab))\n", " }\n", " \n", " # Add location tokens to the special tokens dictionary with their proper ranks\n", " for i, token in enumerate(LOCATION_UNIQUE_TOKENS):\n", " special_tokens[token] = len(new_vocab) - 1000 + i\n", " \n", " print(f\"Old vocabulary size: {len(old_vocab)}\")\n", " print(f\"New vocabulary size: {len(new_vocab)}\")\n", " print(f\"Difference in size: {len(new_vocab) - len(old_vocab)}\")\n", " print(f\"Number of special tokens: {len(special_tokens)}\")\n", " print(\"\\n\")\n", " \n", " # Find tokens that were removed\n", " removed_tokens = set(old_vocab.keys()) - set(new_vocab.keys())\n", " print(f\"Number of removed tokens: {len(removed_tokens)}\")\n", " \n", " # Find new tokens that were added\n", " added_tokens = set(new_vocab.keys()) - set(old_vocab.keys())\n", " print(f\"Number of added tokens: {len(added_tokens)}\")\n", " print(\"\\n\")\n", " \n", " # Create comparison tables\n", " print(\"Sample of removed tokens (last 10):\")\n", " removed_data = []\n", " for token in sorted(removed_tokens, key=lambda x: old_vocab[x])[-10:]:\n", " removed_data.append([\n", " decode_token(token),\n", " old_vocab[token]\n", " ])\n", " print(tabulate(removed_data, headers=['Token', 'Rank'], tablefmt='grid'))\n", " print(\"\\n\")\n", " \n", " print(\"Sample of added tokens (first 10):\")\n", " added_data = []\n", " for token in sorted(added_tokens, key=lambda x: new_vocab[x])[:10]:\n", " added_data.append([\n", " decode_token(token),\n", " new_vocab[token]\n", " ])\n", " print(tabulate(added_data, headers=['Token', 'Rank'], tablefmt='grid'))\n", " \n", " # Test tokenization of a sample text\n", " print(\"\\nTokenization comparison for sample text:\")\n", " # sample_text = \"This is a test sentence with location markers <loc_1> and <loc_999>\"\n", " sample_text = layout_generation_exmaple\n", " \n", " # Create encodings for both tokenizers\n", " old_enc = tiktoken.Encoding(\n", " \"old_qwen\",\n", " pat_str=r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\",\n", " mergeable_ranks=old_vocab,\n", " special_tokens=special_tokens\n", " )\n", " \n", " new_enc = tiktoken.Encoding(\n", " \"new_qwen\",\n", " pat_str=r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\",\n", " mergeable_ranks=new_vocab,\n", " special_tokens=special_tokens\n", " )\n", " \n", " try:\n", " old_tokens = old_enc.encode(sample_text , 
allowed_special=\"all\")\n", " print(\"\\nOld tokenizer:\")\n", " print(f\"Token IDs: {old_tokens}\")\n", " print(f\"Decoded: {old_enc.decode(old_tokens)}\")\n", " except Exception as e:\n", " print(\"\\nError with old tokenizer:\", str(e))\n", " \n", " try:\n", " new_tokens = new_enc.encode(sample_text , allowed_special=\"all\")\n", " print(\"\\nNew tokenizer:\")\n", " print(f\"Token IDs: {new_tokens}\")\n", " print(f\"Decoded: {new_enc.decode(new_tokens)}\")\n", " except Exception as e:\n", " print(\"\\nError with new tokenizer:\", str(e))\n", "\n", "# if __name__ == \"__main__\":\n", "compare_tokenizers()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original vocab size: 151643\n", "Truncated vocab size: 150643\n", "Modified tokenizer saved to ./modified_qwen_sub.tiktoken\n", "Added 1000 location tokens\n" ] } ], "source": [ "import base64\n", "\n", "def modify_qwen_tokenizer_small(input_file='./qwen.tiktoken', output_file='./modified_qwen_sub.tiktoken'):\n", " # Read the original tokenizer file\n", " with open(input_file, 'rb') as f:\n", " contents = f.read()\n", " \n", " # Parse the original vocabulary\n", " vocab = {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", " \n", " # Sort tokens by rank and remove last 1001 tokens\n", " sorted_tokens = sorted(vocab.items(), key=lambda x: x[1])\n", " truncated_vocab = dict(sorted_tokens[:-1000])\n", " base_vocab_size = len(truncated_vocab)\n", " \n", " print(f\"Original vocab size: {len(vocab)}\")\n", " print(f\"Truncated vocab size: {len(truncated_vocab)}\")\n", " \n", " # Write the modified vocabulary with location tokens first\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " # First write the truncated base vocabulary\n", " for token, rank in sorted(truncated_vocab.items(), key=lambda x: x[1]):\n", " token_b64 = base64.b64encode(token).decode('utf-8')\n", " f.write(f\"{token_b64} {rank}\\n\")\n", " \n", " print(f\"Modified tokenizer saved to {output_file}\")\n", " print(f\"Added {1000} location tokens\")\n", "\n", "# if __name__ == \"__main__\":\n", "modify_qwen_tokenizer_small()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Comparison saved to tokenizer_comparison.txt\n", "\n", "Statistics:\n", "Old tokenizer size: 151860\n", "New tokenizer size: 151860\n", "\n", "Sample of differences:\n", "Token 150643: '∉' -> '<loc_0>'\n", "Token 150644: '∊' -> '<loc_1>'\n", "Token 150645: '∖' -> '<loc_2>'\n", "Token 150646: '∜' -> '<loc_3>'\n", "Token 150647: '∾' -> '<loc_4>'\n", "Token 150648: '≀' -> '<loc_5>'\n", "Token 150649: '≋' -> '<loc_6>'\n", "Token 150650: '≌' -> '<loc_7>'\n", "Token 150651: '≓' -> '<loc_8>'\n", "Token 150652: '≜' -> '<loc_9>'\n" ] } ], "source": [ "from tokenization_qwen import QWenTokenizer\n", "from tabulate import tabulate\n", "\n", "def compare_tokenizers(old_file=\"./qwen.tiktoken\", new_file=\"./modified_qwen_sub.tiktoken\", output_file=\"tokenizer_comparison.txt\"):\n", " # Initialize both tokenizers\n", " old_tokenizer = QWenTokenizer(vocab_file=old_file)\n", " new_tokenizer = QWenTokenizer_SUB(vocab_file=new_file)\n", " \n", " # Get vocabulary size\n", " vocab_size = max(len(old_tokenizer), len(new_tokenizer))\n", " \n", " # Prepare comparison data\n", " comparison_data = []\n", " for token_id in range(vocab_size):\n", " try:\n", " 
old_token = old_tokenizer.tokenizer.decode([token_id])\n", " except Exception:\n", " old_token = \"N/A\"\n", " \n", " try:\n", " new_token = new_tokenizer.tokenizer.decode([token_id])\n", " except Exception:\n", " new_token = \"N/A\"\n", " \n", " comparison_data.append([token_id, old_token, new_token])\n", " \n", " # Write the comparison data to a comma-separated text file\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " for row in comparison_data:\n", " f.write(f\"{row[0]},{row[1]},{row[2]}\\n\")\n", " \n", "\n", " print(f\"Comparison saved to {output_file}\")\n", " \n", " # Print some statistics\n", " print(\"\\nStatistics:\")\n", " print(f\"Old tokenizer size: {len(old_tokenizer)}\")\n", " print(f\"New tokenizer size: {len(new_tokenizer)}\")\n", " \n", " # Print a sample of differences\n", " print(\"\\nSample of differences:\")\n", " differences = [(token_id, old, new) for token_id, old, new in comparison_data if old != new]\n", " for token_id, old, new in differences[:10]:\n", " print(f\"Token {token_id}: '{old}' -> '{new}'\")\n", "\n", "# Run the comparison\n", "compare_tokenizers()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }