{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_path = \"/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt\"\n",
    "with open(file_path, 'r', encoding='utf-8') as file:\n",
    "    text = file.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "pattern = r\"\"\"'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}।•]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}।•]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+|।|•\"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import regex as re\n",
    "text_chunks = re.findall(pattern, text)\n",
    "\n",
    "        # input text preprocessing\n",
    "tokens = [list(ch.encode(\"utf-8\")) for ch in text_chunks]"
   ]
  },
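  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check (a sketch, not part of the original run): apply the\n",
    "# pattern to a short made-up sample to see how text is chunked before the\n",
    "# chunks are byte-encoded. The sample string is an assumption for illustration.\n",
    "sample = \"I'm learning BPE! It's fun: 12345 tokens.\"\n",
    "print(re.findall(pattern, sample))\n",
    "# -> roughly ['I', \"'m\", ' learning', ' BPE', '!', ' It', \"'s\", ' fun', ...]\n",
    "print([list(ch.encode(\"utf-8\")) for ch in re.findall(pattern, sample)][:3])"
   ]
  },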
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokens = text.encode(\"utf-8\") # raw bytes\n",
    "tokens = list(map(int, tokens)) # convert to a list of integers in range 0..255 for convenience"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tokens length: 179910393\n",
      "ids length: 32798069\n",
      "compression ratio: 5.49X\n"
     ]
    }
   ],
   "source": [
    "def get_stats(ids):\n",
    "    counts = {}\n",
    "    for pair in zip(ids, ids[1:]):\n",
    "        counts[pair] = counts.get(pair, 0) + 1\n",
    "    return counts\n",
    "\n",
    "def merge(ids, pair, idx):\n",
    "  newids = []\n",
    "  i = 0\n",
    "  while i < len(ids):\n",
    "    if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:\n",
    "      newids.append(idx)\n",
    "      i += 2\n",
    "    else:\n",
    "      newids.append(ids[i])\n",
    "      i += 1\n",
    "  return newids\n",
    "\n",
    "# ---\n",
    "vocab_size = 1000 # the desired final vocabulary size\n",
    "num_merges = vocab_size - 256\n",
    "ids = list(tokens) # copy so we don't destroy the original list\n",
    "\n",
    "merges = {} # (int, int) -> int\n",
    "for i in range(num_merges):\n",
    "  stats = get_stats(ids)\n",
    "  pair = max(stats, key=stats.get)\n",
    "  idx = 256 + i\n",
    "  # print(f\"merging {pair} into a new token {idx}\")\n",
    "  ids = merge(ids, pair, idx)\n",
    "  merges[pair] = idx\n",
    "\n",
    "print(\"tokens length:\", len(tokens))\n",
    "print(\"ids length:\", len(ids))\n",
    "print(f\"compression ratio: {len(tokens) / len(ids):.2f}X\")"
   ]
  },
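  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A sketch (not from the original notebook) of how the learned `merges` can be\n",
    "# used: build a vocab mapping each token id to its byte string, then decode a\n",
    "# token sequence back to text and encode new text with the same merge rules.\n",
    "vocab = {idx: bytes([idx]) for idx in range(256)}\n",
    "for (p0, p1), idx in merges.items():\n",
    "    vocab[idx] = vocab[p0] + vocab[p1] # merges were recorded in order, so parts exist\n",
    "\n",
    "def decode(ids):\n",
    "    # map each token id back to its bytes and decode as UTF-8\n",
    "    text_bytes = b\"\".join(vocab[idx] for idx in ids)\n",
    "    return text_bytes.decode(\"utf-8\", errors=\"replace\")\n",
    "\n",
    "def encode(text):\n",
    "    # start from raw bytes, then repeatedly apply the earliest-learned merge\n",
    "    ids = list(text.encode(\"utf-8\"))\n",
    "    while len(ids) >= 2:\n",
    "        stats = get_stats(ids)\n",
    "        pair = min(stats, key=lambda p: merges.get(p, float(\"inf\")))\n",
    "        if pair not in merges:\n",
    "            break # no more merges apply\n",
    "        ids = merge(ids, pair, merges[pair])\n",
    "    return ids\n",
    "\n",
    "print(decode(encode(\"hello world\")))"
   ]
  },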
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}