{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from tokenization_qwen import QWenTokenizer\n", "from tokenization_qwen_sub import QWenTokenizer as QWenTokenizer_SUB\n", "\n", "tokenizer = QWenTokenizer(vocab_file=\"./qwen.tiktoken\")\n", "tokenizer_sub = QWenTokenizer_SUB(vocab_file=\"./modified_qwen_sub.tiktoken\")\n", "# tokenizer = QWenTokenizer(vocab_file=\"./modified_qwen.tiktoken\")\n", "# tokenizer = QWenTokenizer(vocab_file=\"./modified_qwen_sub.tiktoken\")\n", "\n", "# print(tokenizer.tokenizer.encode(\"Hello World\", allowed_special=set(\"<|extra_40|>\")))\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "151860\n", "151860\n" ] } ], "source": [ "print(len(tokenizer))\n", "print(len(tokenizer_sub))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "layout_generation_exmaple = \"\"\"<loc_100><loc_563><loc_709><loc_19><table><loc_97><loc_370><loc_817><loc_115><table><loc_99><loc_607><loc_809><loc_86>\"\"\"\n", "# layout_generation_exmaple = \"<|extra_40|>\"" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[151649, 27, 1074, 62, 22, 17, 1784, 1074, 62, 20, 20, 1784, 1074, 62, 18, 22, 17, 1784, 1074, 62, 17, 15, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 16, 16, 23, 1784, 1074, 62, 22, 23, 24, 1784, 1074, 62, 19, 17, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 16, 21, 22, 1784, 1074, 62, 22, 23, 23, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 17, 17, 24, 1784, 1074, 62, 22, 23, 18, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 17, 24, 17, 1784, 1074, 62, 22, 18, 18, 1784, 1074, 62, 19, 17, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 18, 19, 16, 1784, 1074, 62, 22, 20, 22, 1784, 1074, 62, 17, 24, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 20, 15, 15, 1784, 1074, 62, 22, 24, 23, 1784, 1074, 62, 19, 17, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 20, 23, 22, 1784, 1074, 62, 20, 17, 18, 1784, 1074, 62, 16, 20, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 22, 15, 20, 1784, 1074, 62, 23, 15, 24, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 22, 21, 23, 1784, 1074, 62, 22, 15, 17, 1784, 1074, 62, 17, 24, 29, 151649, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 23, 15, 18, 1784, 1074, 62, 23, 15, 24, 1784, 1074, 62, 20, 21, 29, 151649, 27, 1074, 62, 24, 19, 22, 1784, 1074, 62, 24, 23, 22, 1784, 1074, 62, 17, 19, 1784, 1074, 62, 16, 22, 29, 151651, 27, 1074, 62, 16, 15, 15, 1784, 1074, 62, 20, 21, 18, 1784, 1074, 62, 22, 15, 24, 1784, 1074, 62, 16, 24, 29, 151652, 27, 1074, 62, 24, 22, 1784, 1074, 62, 18, 22, 15, 1784, 1074, 62, 23, 16, 22, 1784, 1074, 62, 16, 16, 20, 29, 151652, 27, 1074, 62, 24, 24, 1784, 1074, 62, 21, 15, 22, 1784, 1074, 62, 23, 15, 24, 1784, 1074, 62, 23, 21, 29]\n", "[151649, 150715, 150698, 151015, 150663, 151649, 150743, 150761, 151432, 150685, 151649, 150743, 150810, 151431, 150699, 151649, 150743, 150872, 151426, 150699, 151649, 150743, 150935, 151376, 150685, 151649, 150743, 150984, 151400, 150672, 151649, 150743, 151143, 151441, 150685, 151649, 150743, 151230, 151166, 150658, 151649, 150743, 151348, 151452, 150699, 151649, 150743, 151411, 151345, 150672, 151649, 150743, 151446, 151452, 150699, 151649, 151590, 151630, 150667, 150660, 151651, 
150743, 151206, 151352, 150662, 151652, 150740, 151013, 151460, 150758, 151652, 150742, 151250, 151452, 150729]\n" ] } ], "source": [ "tokens = tokenizer.tokenizer.encode(str(layout_generation_exmaple), allowed_special=\"all\")\n", "print(tokens)\n", "tokens_sub = tokenizer_sub.tokenizer.encode(str(layout_generation_exmaple), allowed_special=\"all\")\n", "print(tokens_sub)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# decoded = tokenizer.tokenizer.decode(tokens)\n", "# new_decoded = tokenizer.tokenizer.decode(new_tokens)\n", "# print(decoded)\n", "# print(new_decoded)\n", "for i in tokens:\n", " decoded_token = tokenizer.tokenizer.decode([i])\n", " print(f\"{i},{decoded_token}\\n\")\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(len(tokenizer))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# map the token id to the token\n", "for i in tokens:\n", " decoded_token = tokenizer.tokenizer.decode([i])\n", " print(f\"{i},{decoded_token}\\n\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "\n", "tokenizer = tiktoken.model.load_tiktoken_bpe(\"./qwen.tiktoken\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tiktoken\n", "import os\n", "\n", "def modify_tiktoken_file(input_file, output_file):\n", " # Read the file contents directly\n", " with open(input_file, 'rb') as f:\n", " original_content = f.readlines()\n", " \n", " # Generate the new location unique tokens\n", " LOCATION_UNIQUE_TOKENS = tuple(f\"<loc_{i}>\" for i in range(1, 1001))\n", " \n", " # Prepare the new content\n", " new_content = []\n", " \n", " # Keep the version header if it exists\n", " if original_content and original_content[0].startswith(b'version:'):\n", " new_content.append(original_content[0])\n", " original_content = original_content[1:]\n", " \n", " # Keep all existing tokens except the last 1000\n", " existing_tokens = original_content[:-1000] if len(original_content) > 1000 else []\n", " new_content.extend(existing_tokens)\n", " \n", " # Add the new location tokens\n", " for token in LOCATION_UNIQUE_TOKENS:\n", " # Encode the token and create a rank (you might want to adjust the rank strategy)\n", " encoded_token = token.encode('utf-8')\n", " # Use a high rank to ensure these are at the end\n", " new_content.append(encoded_token + b' ' + str(len(existing_tokens) + LOCATION_UNIQUE_TOKENS.index(token)).encode('utf-8') + b'\\n')\n", " \n", " # Save the modified tokenizer\n", " with open(output_file, 'wb') as f:\n", " f.writelines(new_content)\n", " \n", " print(f\"Modified tokenizer saved to {output_file}\")\n", " print(f\"Total tokens in new file: {len(new_content)}\")\n", "\n", "# Example usage\n", "input_tiktoken_file = './qwen.tiktoken'\n", "output_tiktoken_file = './modified_qwen.tiktoken'\n", "\n", "modify_tiktoken_file(input_tiktoken_file, output_tiktoken_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "from typing import Dict\n", "\n", "def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:\n", " with open(tiktoken_bpe_file, \"rb\") as f:\n", " contents = f.read()\n", " return {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", "\n", "# Path to your 
.tiktoken file\n", "tiktoken_bpe_file = \"./qwen.tiktoken\"\n", "\n", "# Load the BPE encoding\n", "bpe_data = _load_tiktoken_bpe(tiktoken_bpe_file)\n", "\n", "# Example usage\n", "# print(\"Loaded BPE Data:\", bpe_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with open(\"./qwen.tiktoken\", 'rb') as f:\n", " contents = f.read()\n", "\n", "# Parse the original vocabulary\n", "vocab = {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", "}\n", "\n", "base_vocab_size = len(vocab)\n", "print(base_vocab_size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "location_tokens = [f\"<loc_{i}>\" for i in range(1, 1001)]\n", "print(location_tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens_to_remove = sorted(vocab.items(), key=lambda x: x[1])[-1000:]\n", "print(tokens_to_remove)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "import tiktoken\n", "\n", "def modify_tokenizer(input_file='./qwen.tiktoken', output_file='./modified_qwen.tiktoken'):\n", " # Read the original tokenizer file\n", " with open(input_file, 'rb') as f:\n", " contents = f.read()\n", " \n", " # Parse the original vocabulary\n", " vocab = {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", " \n", " # Get the base vocabulary size (excluding special tokens)\n", " base_vocab_size = len(vocab)\n", " \n", " # Create location tokens\n", " location_tokens = [f\"<loc_{i}>\" for i in range(1, 1001)]\n", " \n", " # Remove the last 1000 tokens from the vocabulary\n", " tokens_to_remove = sorted(vocab.items(), key=lambda x: x[1])[-1000:]\n", " for token, _ in tokens_to_remove:\n", " del vocab[token]\n", " \n", " # Add location tokens\n", " for i, token in enumerate(location_tokens):\n", " vocab[token.encode('utf-8')] = base_vocab_size - 1000 + i\n", " \n", " # Write the modified vocabulary to the new file\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " for token, rank in sorted(vocab.items(), key=lambda x: x[1]):\n", " # Encode the token in base64\n", " token_b64 = base64.b64encode(token).decode('utf-8')\n", " f.write(f\"{token_b64} {rank}\\n\")\n", " \n", " print(f\"Modified tokenizer saved to {output_file}\")\n", " print(f\"Added {len(location_tokens)} location tokens\")\n", " print(f\"Final vocabulary size: {len(vocab)}\")\n", "\n", "\n", "modify_tokenizer()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import base64\n", "import tiktoken\n", "from typing import Dict\n", "from tabulate import tabulate\n", "\n", "# Define special tokens from the original tokenizer\n", "ENDOFTEXT = \"<|endoftext|>\"\n", "IMSTART = \"<|im_start|>\"\n", "IMEND = \"<|im_end|>\"\n", "DOCUMENT_UNIQUE_TOKENS = tuple([\"<caption>\", \"<formula>\", \"<list>\", \"<text>\", \"<image>\", \"<title>\", \"<table>\", \"<LD>\", \"<TE>\", \"<MF>\", \"<IC>\", \"<OCR>\", \"<POCR>\", \"<VQA>\", \"<DVQA>\"])\n", "LOCATION_UNIQUE_TOKENS = tuple([f\"<loc_{i}>\" for i in range(0, 1001)])\n", "EXTRAS = tuple((f\"<|extra_{i}|>\" for i in range(len(DOCUMENT_UNIQUE_TOKENS), 205)))\n", "\n", "# Include location tokens in special tokens\n", "SPECIAL_TOKENS = (ENDOFTEXT, IMSTART, IMEND) + DOCUMENT_UNIQUE_TOKENS + EXTRAS + 
LOCATION_UNIQUE_TOKENS\n", "\n", "def load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:\n", " with open(tiktoken_bpe_file, \"rb\") as f:\n", " contents = f.read()\n", " return {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", "\n", "def decode_token(token_bytes):\n", " \"\"\"Attempt to decode bytes to string, fallback to base64 if not UTF-8 decodable\"\"\"\n", " try:\n", " return token_bytes.decode('utf-8')\n", " except UnicodeDecodeError:\n", " return f\"<bytes>{base64.b64encode(token_bytes).decode('utf-8')}\"\n", "\n", "def compare_tokenizers(old_file='./qwen.tiktoken', new_file='./modified_qwen.tiktoken'):\n", " # Load both tokenizers\n", " old_vocab = load_tiktoken_bpe(old_file)\n", " new_vocab = load_tiktoken_bpe(new_file)\n", " \n", " # Create special tokens dictionary including location tokens\n", " special_tokens = {\n", " token: index\n", " for index, token in enumerate(SPECIAL_TOKENS, start=len(new_vocab))\n", " }\n", " \n", " # Add location tokens to the special tokens dictionary with their proper ranks\n", " for i, token in enumerate(LOCATION_UNIQUE_TOKENS):\n", " special_tokens[token] = len(new_vocab) - 1000 + i\n", " \n", " print(f\"Old vocabulary size: {len(old_vocab)}\")\n", " print(f\"New vocabulary size: {len(new_vocab)}\")\n", " print(f\"Difference in size: {len(new_vocab) - len(old_vocab)}\")\n", " print(f\"Number of special tokens: {len(special_tokens)}\")\n", " print(\"\\n\")\n", " \n", " # Find tokens that were removed\n", " removed_tokens = set(old_vocab.keys()) - set(new_vocab.keys())\n", " print(f\"Number of removed tokens: {len(removed_tokens)}\")\n", " \n", " # Find new tokens that were added\n", " added_tokens = set(new_vocab.keys()) - set(old_vocab.keys())\n", " print(f\"Number of added tokens: {len(added_tokens)}\")\n", " print(\"\\n\")\n", " \n", " # Create comparison tables\n", " print(\"Sample of removed tokens (last 10):\")\n", " removed_data = []\n", " for token in sorted(removed_tokens, key=lambda x: old_vocab[x])[-10:]:\n", " removed_data.append([\n", " decode_token(token),\n", " old_vocab[token]\n", " ])\n", " print(tabulate(removed_data, headers=['Token', 'Rank'], tablefmt='grid'))\n", " print(\"\\n\")\n", " \n", " print(\"Sample of added tokens (first 10):\")\n", " added_data = []\n", " for token in sorted(added_tokens, key=lambda x: new_vocab[x])[:10]:\n", " added_data.append([\n", " decode_token(token),\n", " new_vocab[token]\n", " ])\n", " print(tabulate(added_data, headers=['Token', 'Rank'], tablefmt='grid'))\n", " \n", " # Test tokenization of a sample text\n", " print(\"\\nTokenization comparison for sample text:\")\n", " # sample_text = \"This is a test sentence with location markers <loc_1> and <loc_999>\"\n", " sample_text = layout_generation_exmaple\n", " \n", " # Create encodings for both tokenizers\n", " old_enc = tiktoken.Encoding(\n", " \"old_qwen\",\n", " pat_str=r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\",\n", " mergeable_ranks=old_vocab,\n", " special_tokens=special_tokens\n", " )\n", " \n", " new_enc = tiktoken.Encoding(\n", " \"new_qwen\",\n", " pat_str=r\"\"\"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+\"\"\",\n", " mergeable_ranks=new_vocab,\n", " special_tokens=special_tokens\n", " )\n", " \n", " try:\n", " old_tokens = old_enc.encode(sample_text , 
allowed_special=\"all\")\n", " print(\"\\nOld tokenizer:\")\n", " print(f\"Token IDs: {old_tokens}\")\n", " print(f\"Decoded: {old_enc.decode(old_tokens)}\")\n", " except Exception as e:\n", " print(\"\\nError with old tokenizer:\", str(e))\n", " \n", " try:\n", " new_tokens = new_enc.encode(sample_text , allowed_special=\"all\")\n", " print(\"\\nNew tokenizer:\")\n", " print(f\"Token IDs: {new_tokens}\")\n", " print(f\"Decoded: {new_enc.decode(new_tokens)}\")\n", " except Exception as e:\n", " print(\"\\nError with new tokenizer:\", str(e))\n", "\n", "# if __name__ == \"__main__\":\n", "compare_tokenizers()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original vocab size: 151643\n", "Truncated vocab size: 150643\n", "Modified tokenizer saved to ./modified_qwen_sub.tiktoken\n", "Added 1000 location tokens\n" ] } ], "source": [ "import base64\n", "\n", "def modify_qwen_tokenizer_small(input_file='./qwen.tiktoken', output_file='./modified_qwen_sub.tiktoken'):\n", " # Read the original tokenizer file\n", " with open(input_file, 'rb') as f:\n", " contents = f.read()\n", " \n", " # Parse the original vocabulary\n", " vocab = {\n", " base64.b64decode(token): int(rank)\n", " for token, rank in (line.split() for line in contents.splitlines() if line)\n", " }\n", " \n", " # Sort tokens by rank and remove last 1001 tokens\n", " sorted_tokens = sorted(vocab.items(), key=lambda x: x[1])\n", " truncated_vocab = dict(sorted_tokens[:-1000])\n", " base_vocab_size = len(truncated_vocab)\n", " \n", " print(f\"Original vocab size: {len(vocab)}\")\n", " print(f\"Truncated vocab size: {len(truncated_vocab)}\")\n", " \n", " # Write the modified vocabulary with location tokens first\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " # First write the truncated base vocabulary\n", " for token, rank in sorted(truncated_vocab.items(), key=lambda x: x[1]):\n", " token_b64 = base64.b64encode(token).decode('utf-8')\n", " f.write(f\"{token_b64} {rank}\\n\")\n", " \n", " print(f\"Modified tokenizer saved to {output_file}\")\n", " print(f\"Added {1000} location tokens\")\n", "\n", "# if __name__ == \"__main__\":\n", "modify_qwen_tokenizer_small()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Comparison saved to tokenizer_comparison.txt\n", "\n", "Statistics:\n", "Old tokenizer size: 151860\n", "New tokenizer size: 151860\n", "\n", "Sample of differences:\n", "Token 150643: '∉' -> '<loc_0>'\n", "Token 150644: '∊' -> '<loc_1>'\n", "Token 150645: '∖' -> '<loc_2>'\n", "Token 150646: '∜' -> '<loc_3>'\n", "Token 150647: '∾' -> '<loc_4>'\n", "Token 150648: '≀' -> '<loc_5>'\n", "Token 150649: '≋' -> '<loc_6>'\n", "Token 150650: '≌' -> '<loc_7>'\n", "Token 150651: '≓' -> '<loc_8>'\n", "Token 150652: '≜' -> '<loc_9>'\n" ] } ], "source": [ "from tokenization_qwen import QWenTokenizer\n", "from tabulate import tabulate\n", "\n", "def compare_tokenizers(old_file=\"./qwen.tiktoken\", new_file=\"./modified_qwen_sub.tiktoken\", output_file=\"tokenizer_comparison.txt\"):\n", " # Initialize both tokenizers\n", " old_tokenizer = QWenTokenizer(vocab_file=old_file)\n", " new_tokenizer = QWenTokenizer_SUB(vocab_file=new_file)\n", " \n", " # Get vocabulary size\n", " vocab_size = max(len(old_tokenizer), len(new_tokenizer))\n", " \n", " # Prepare comparison data\n", " comparison_data = []\n", " for token_id in range(vocab_size):\n", " try:\n", " 
old_token = old_tokenizer.tokenizer.decode([token_id])\n", " except Exception:\n", " old_token = \"N/A\"\n", " \n", " try:\n", " new_token = new_tokenizer.tokenizer.decode([token_id])\n", " except Exception:\n", " new_token = \"N/A\"\n", " \n", " comparison_data.append([token_id, old_token, new_token])\n", " \n", " # Write the comparison data to a comma-separated text file\n", " with open(output_file, 'w', encoding='utf-8') as f:\n", " for row in comparison_data:\n", " f.write(f\"{row[0]},{row[1]},{row[2]}\\n\")\n", " \n", "\n", " print(f\"Comparison saved to {output_file}\")\n", " \n", " # Print some statistics\n", " print(\"\\nStatistics:\")\n", " print(f\"Old tokenizer size: {len(old_tokenizer)}\")\n", " print(f\"New tokenizer size: {len(new_tokenizer)}\")\n", " \n", " # Print a sample of differences\n", " print(\"\\nSample of differences:\")\n", " differences = [(token_id, old, new) for token_id, old, new in comparison_data if old != new]\n", " for token_id, old, new in differences[:10]:\n", " print(f\"Token {token_id}: '{old}' -> '{new}'\")\n", "\n", "# Run the comparison\n", "compare_tokenizers()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }