{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "application/javascript": "IPython.notebook.set_autosave_interval(300000)" }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Autosaving every 300 seconds\n" ] } ], "source": [ "# Notebook housekeeping: autosave every 300 s, auto-reload edited modules, disable jedi completion.\n", "%autosave 300\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%config Completer.use_jedi = False" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers/S20_tokenizer\n" ] } ], "source": [ "import os\n", "\n", "# Default to the original workspace path, but let the S20_TOKENIZER_ROOT environment\n", "# variable override it so the notebook is not tied to one machine's absolute path.\n", "PROJECT_ROOT = os.environ.get(\n", "    \"S20_TOKENIZER_ROOT\",\n", "    \"/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_S20_tokenization\",\n", ")\n", "# Later cells build their data and model paths from os.getcwd(), so switch to the project root first.\n", "os.chdir(PROJECT_ROOT)\n", "print(os.getcwd())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from tokenizer.basic_bpe import BasicTokenizer\n", "from tokenizer.regex_bpe import RegexTokenizer" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Training corpus, read from <cwd>/data.\n", "file_name = \"tiny_shakespeare.txt\"\n", "file_path = os.path.join(os.getcwd(), \"data\", file_name)\n", "\n", "# Decode explicitly as UTF-8 instead of relying on the platform's locale default.\n", "with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", "    text = f.read()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The length of the text is 1115394\n" ] } ], "source": [ "print(f\"The length of the text is {len(text)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "creating the model file for basic tokenizer" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Directory that holds the saved tokenizer files; created on first run, reused afterwards.\n", "model_path = os.path.join(os.getcwd(), \"tokenizer_model\")\n", "os.makedirs(model_path, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ "merge 1/256: (101, 32) -> 256 (b'e ') had 27643 occurrences\n", "merge 2/256: (116, 104) -> 257 (b'th') had 22739 occurrences\n", "merge 3/256: (116, 32) -> 258 (b't ') had 16508 occurrences\n", "merge 4/256: (115, 32) -> 259 (b's ') had 15364 occurrences\n", "merge 5/256: (100, 32) -> 260 (b'd ') had 14165 occurrences\n", "merge 6/256: (44, 32) -> 261 (b', ') had 14098 occurrences\n", "merge 7/256: (111, 117) -> 262 (b'ou') had 12730 occurrences\n", "merge 8/256: (101, 114) -> 263 (b'er') had 11771 occurrences\n", "merge 9/256: (105, 110) -> 264 (b'in') had 10606 occurrences\n", "merge 10/256: (121, 32) -> 265 (b'y ') had 10283 occurrences\n", "merge 11/256: (97, 110) -> 266 (b'an') had 10197 occurrences\n", "merge 12/256: (58, 10) -> 267 (b':\\n') had 8762 occurrences\n", "merge 13/256: (111, 114) -> 268 (b'or') had 8458 occurrences\n", "merge 14/256: (111, 32) -> 269 (b'o ') had 8134 occurrences\n", "merge 15/256: (101, 110) -> 270 (b'en') had 7568 occurrences\n", "merge 16/256: (10, 10) -> 271 (b'\\n\\n') had 7098 occurrences\n", "merge 17/256: (97, 114) -> 272 (b'ar') had 7081 occurrences\n", "merge 18/256: (32, 257) -> 273 (b' th') had 6662 occurrences\n", "merge 19/256: (111, 110) -> 274 (b'on') had 6435 occurrences\n", "merge 20/256: (108, 108) -> 275 (b'll') had 6357 occurrences\n", "merge 21/256: (104, 97) -> 276 (b'ha') had 6055 occurrences\n", "merge 22/256: (44, 10) -> 277 (b',\\n') had 5501 occurrences\n", "merge 23/256: (46, 271) -> 278 (b'.\\n\\n') had 5018 occurrences\n", "merge 24/256: (105, 259) -> 279 (b'is ') had 4913 occurrences\n", "merge 25/256: (101, 115) -> 280 (b'es') had 4878 occurrences\n", "merge 26/256: (121, 262) -> 281 (b'you') had 4649 occurrences\n", "merge 27/256: (32, 115) -> 282 (b' s') had 4207 occurrences\n", "merge 28/256: (116, 269) -> 283 (b'to ') had 4099 occurrences\n", "merge 29/256: (266, 260) -> 284 (b'and ') had 3968 occurrences\n", "merge 30/256: (111, 119) 
-> 285 (b'ow') had 3964 occurrences\n", "merge 31/256: (101, 97) -> 286 (b'ea') had 3943 occurrences\n", "merge 32/256: (32, 109) -> 287 (b' m') had 3906 occurrences\n", "merge 33/256: (32, 119) -> 288 (b' w') had 3854 occurrences\n", "merge 34/256: (111, 102) -> 289 (b'of') had 3836 occurrences\n", "merge 35/256: (32, 104) -> 290 (b' h') had 3668 occurrences\n", "merge 36/256: (264, 103) -> 291 (b'ing') had 3660 occurrences\n", "merge 37/256: (111, 109) -> 292 (b'om') had 3614 occurrences\n", "merge 38/256: (32, 97) -> 293 (b' a') had 3124 occurrences\n", "merge 39/256: (99, 104) -> 294 (b'ch') had 2974 occurrences\n", "merge 40/256: (257, 256) -> 295 (b'the ') had 2967 occurrences\n", "merge 41/256: (115, 116) -> 296 (b'st') had 2961 occurrences\n", "merge 42/256: (32, 98) -> 297 (b' b') had 2855 occurrences\n", "merge 43/256: (110, 111) -> 298 (b'no') had 2756 occurrences\n", "merge 44/256: (105, 114) -> 299 (b'ir') had 2700 occurrences\n", "merge 45/256: (102, 268) -> 300 (b'for') had 2698 occurrences\n", "merge 46/256: (118, 256) -> 301 (b've ') had 2650 occurrences\n", "merge 47/256: (101, 261) -> 302 (b'e, ') had 2591 occurrences\n", "merge 48/256: (105, 257) -> 303 (b'ith') had 2421 occurrences\n", "merge 49/256: (273, 256) -> 304 (b' the ') had 2397 occurrences\n", "merge 50/256: (115, 101) -> 305 (b'se') had 2373 occurrences\n", "merge 51/256: (108, 105) -> 306 (b'li') had 2358 occurrences\n", "merge 52/256: (84, 104) -> 307 (b'Th') had 2356 occurrences\n", "merge 53/256: (275, 32) -> 308 (b'll ') had 2246 occurrences\n", "merge 54/256: (114, 101) -> 309 (b're') had 2164 occurrences\n", "merge 55/256: (115, 258) -> 310 (b'st ') had 2125 occurrences\n", "merge 56/256: (97, 258) -> 311 (b'at ') had 2124 occurrences\n", "merge 57/256: (65, 110) -> 312 (b'An') had 2105 occurrences\n", "merge 58/256: (73, 32) -> 313 (b'I ') had 2092 occurrences\n", "merge 59/256: (101, 272) -> 314 (b'ear') had 2081 occurrences\n", "merge 60/256: (105, 109) -> 315 (b'im') had 
2077 occurrences\n", "merge 61/256: (105, 116) -> 316 (b'it') had 2070 occurrences\n", "merge 62/256: (111, 111) -> 317 (b'oo') had 2025 occurrences\n", "merge 63/256: (103, 104) -> 318 (b'gh') had 1981 occurrences\n", "merge 64/256: (97, 116) -> 319 (b'at') had 1977 occurrences\n", "merge 65/256: (105, 115) -> 320 (b'is') had 1941 occurrences\n", "merge 66/256: (108, 101) -> 321 (b'le') had 1896 occurrences\n", "merge 67/256: (263, 32) -> 322 (b'er ') had 1847 occurrences\n", "merge 68/256: (262, 114) -> 323 (b'our') had 1816 occurrences\n", "merge 69/256: (312, 260) -> 324 (b'And ') had 1801 occurrences\n", "merge 70/256: (39, 259) -> 325 (b\"'s \") had 1767 occurrences\n", "merge 71/256: (101, 101) -> 326 (b'ee') had 1763 occurrences\n", "merge 72/256: (298, 258) -> 327 (b'not ') had 1749 occurrences\n", "merge 73/256: (109, 265) -> 328 (b'my ') had 1725 occurrences\n", "merge 74/256: (59, 10) -> 329 (b';\\n') had 1688 occurrences\n", "merge 75/256: (114, 97) -> 330 (b'ra') had 1667 occurrences\n", "merge 76/256: (46, 10) -> 331 (b'.\\n') had 1658 occurrences\n", "merge 77/256: (281, 114) -> 332 (b'your') had 1634 occurrences\n", "merge 78/256: (117, 114) -> 333 (b'ur') had 1632 occurrences\n", "merge 79/256: (276, 258) -> 334 (b'hat ') had 1562 occurrences\n", "merge 80/256: (114, 105) -> 335 (b'ri') had 1560 occurrences\n", "merge 81/256: (117, 258) -> 336 (b'ut ') had 1555 occurrences\n", "merge 82/256: (108, 260) -> 337 (b'ld ') had 1545 occurrences\n", "merge 83/256: (289, 32) -> 338 (b'of ') had 1494 occurrences\n", "merge 84/256: (79, 267) -> 339 (b'O:\\n') had 1494 occurrences\n", "merge 85/256: (101, 260) -> 340 (b'ed ') had 1479 occurrences\n", "merge 86/256: (108, 97) -> 341 (b'la') had 1460 occurrences\n", "merge 87/256: (105, 258) -> 342 (b'it ') had 1444 occurrences\n", "merge 88/256: (114, 111) -> 343 (b'ro') had 1434 occurrences\n", "merge 89/256: (263, 256) -> 344 (b'ere ') had 1397 occurrences\n", "merge 90/256: (101, 259) -> 345 (b'es ') had 
1385 occurrences\n", "merge 91/256: (100, 261) -> 346 (b'd, ') had 1381 occurrences\n", "merge 92/256: (117, 110) -> 347 (b'un') had 1374 occurrences\n", "merge 93/256: (69, 78) -> 348 (b'EN') had 1373 occurrences\n", "merge 94/256: (107, 256) -> 349 (b'ke ') had 1367 occurrences\n", "merge 95/256: (121, 261) -> 350 (b'y, ') had 1339 occurrences\n", "merge 96/256: (73, 78) -> 351 (b'IN') had 1313 occurrences\n", "merge 97/256: (32, 100) -> 352 (b' d') had 1295 occurrences\n", "merge 98/256: (63, 271) -> 353 (b'?\\n\\n') had 1294 occurrences\n", "merge 99/256: (97, 259) -> 354 (b'as ') had 1294 occurrences\n", "merge 100/256: (102, 97) -> 355 (b'fa') had 1267 occurrences\n", "merge 101/256: (119, 303) -> 356 (b'with') had 1258 occurrences\n", "merge 102/256: (276, 301) -> 357 (b'have ') had 1240 occurrences\n", "merge 103/256: (83, 267) -> 358 (b'S:\\n') had 1230 occurrences\n", "merge 104/256: (32, 99) -> 359 (b' c') had 1228 occurrences\n", "merge 105/256: (87, 104) -> 360 (b'Wh') had 1226 occurrences\n", "merge 106/256: (257, 311) -> 361 (b'that ') had 1222 occurrences\n", "merge 107/256: (270, 116) -> 362 (b'ent') had 1221 occurrences\n", "merge 108/256: (257, 101) -> 363 (b'the') had 1213 occurrences\n", "merge 109/256: (99, 101) -> 364 (b'ce') had 1206 occurrences\n", "merge 110/256: (115, 104) -> 365 (b'sh') had 1195 occurrences\n", "merge 111/256: (109, 97) -> 366 (b'ma') had 1173 occurrences\n", "merge 112/256: (32, 112) -> 367 (b' p') had 1167 occurrences\n", "merge 113/256: (257, 263) -> 368 (b'ther') had 1133 occurrences\n", "merge 114/256: (98, 101) -> 369 (b'be') had 1131 occurrences\n", "merge 115/256: (46, 32) -> 370 (b'. 
') had 1127 occurrences\n", "merge 116/256: (65, 82) -> 371 (b'AR') had 1124 occurrences\n", "merge 117/256: (99, 256) -> 372 (b'ce ') had 1116 occurrences\n", "merge 118/256: (291, 32) -> 373 (b'ing ') had 1113 occurrences\n", "merge 119/256: (97, 108) -> 374 (b'al') had 1098 occurrences\n", "merge 120/256: (59, 32) -> 375 (b'; ') had 1091 occurrences\n", "merge 121/256: (257, 262) -> 376 (b'thou') had 1088 occurrences\n", "merge 122/256: (115, 261) -> 377 (b's, ') had 1086 occurrences\n", "merge 123/256: (109, 256) -> 378 (b'me ') had 1081 occurrences\n", "merge 124/256: (115, 256) -> 379 (b'se ') had 1078 occurrences\n", "merge 125/256: (108, 111) -> 380 (b'lo') had 1077 occurrences\n", "merge 126/256: (99, 107) -> 381 (b'ck') had 1061 occurrences\n", "merge 127/256: (119, 104) -> 382 (b'wh') had 1057 occurrences\n", "merge 128/256: (105, 108) -> 383 (b'il') had 1046 occurrences\n", "merge 129/256: (39, 260) -> 384 (b\"'d \") had 1026 occurrences\n", "merge 130/256: (73, 339) -> 385 (b'IO:\\n') had 1025 occurrences\n", "merge 131/256: (110, 285) -> 386 (b'now') had 1022 occurrences\n", "merge 132/256: (105, 275) -> 387 (b'ill') had 1016 occurrences\n", "merge 133/256: (98, 256) -> 388 (b'be ') had 982 occurrences\n", "merge 134/256: (101, 275) -> 389 (b'ell') had 982 occurrences\n", "merge 135/256: (114, 286) -> 390 (b'rea') had 978 occurrences\n", "merge 136/256: (32, 116) -> 391 (b' t') had 972 occurrences\n", "merge 137/256: (116, 261) -> 392 (b't, ') had 971 occurrences\n", "merge 138/256: (262, 337) -> 393 (b'ould ') had 970 occurrences\n", "merge 139/256: (101, 10) -> 394 (b'e\\n') had 962 occurrences\n", "merge 140/256: (287, 265) -> 395 (b' my ') had 959 occurrences\n", "merge 141/256: (118, 263) -> 396 (b'ver') had 955 occurrences\n", "merge 142/256: (99, 292) -> 397 (b'com') had 952 occurrences\n", "merge 143/256: (104, 256) -> 398 (b'he ') had 929 occurrences\n", "merge 144/256: (32, 283) -> 399 (b' to ') had 926 occurrences\n", "merge 145/256: (32, 
73) -> 400 (b' I') had 906 occurrences\n", "merge 146/256: (101, 108) -> 401 (b'el') had 902 occurrences\n", "merge 147/256: (85, 358) -> 402 (b'US:\\n') had 879 occurrences\n", "merge 148/256: (111, 108) -> 403 (b'ol') had 871 occurrences\n", "merge 149/256: (100, 105) -> 404 (b'di') had 869 occurrences\n", "merge 150/256: (32, 103) -> 405 (b' g') had 864 occurrences\n", "merge 151/256: (97, 265) -> 406 (b'ay ') had 849 occurrences\n", "merge 152/256: (116, 263) -> 407 (b'ter') had 849 occurrences\n", "merge 153/256: (97, 264) -> 408 (b'ain') had 844 occurrences\n", "merge 154/256: (32, 281) -> 409 (b' you') had 844 occurrences\n", "merge 155/256: (307, 256) -> 410 (b'The ') had 843 occurrences\n", "merge 156/256: (108, 256) -> 411 (b'le ') had 839 occurrences\n", "merge 157/256: (105, 274) -> 412 (b'ion') had 838 occurrences\n", "merge 158/256: (32, 102) -> 413 (b' f') had 826 occurrences\n", "merge 159/256: (114, 117) -> 414 (b'ru') had 819 occurrences\n", "merge 160/256: (105, 102) -> 415 (b'if') had 817 occurrences\n", "merge 161/256: (101, 109) -> 416 (b'em') had 810 occurrences\n", "merge 162/256: (266, 100) -> 417 (b'and') had 801 occurrences\n", "merge 163/256: (84, 269) -> 418 (b'To ') had 800 occurrences\n", "merge 164/256: (105, 318) -> 419 (b'igh') had 798 occurrences\n", "merge 165/256: (272, 256) -> 420 (b'are ') had 796 occurrences\n", "merge 166/256: (117, 112) -> 421 (b'up') had 779 occurrences\n", "merge 167/256: (277, 324) -> 422 (b',\\nAnd ') had 774 occurrences\n", "merge 168/256: (104, 315) -> 423 (b'him') had 761 occurrences\n", "merge 169/256: (101, 100) -> 424 (b'ed') had 751 occurrences\n", "merge 170/256: (105, 308) -> 425 (b'ill ') had 743 occurrences\n", "merge 171/256: (268, 100) -> 426 (b'ord') had 736 occurrences\n", "merge 172/256: (105, 294) -> 427 (b'ich') had 733 occurrences\n", "merge 173/256: (108, 265) -> 428 (b'ly ') had 732 occurrences\n", "merge 174/256: (317, 260) -> 429 (b'ood ') had 726 occurrences\n", "merge 175/256: 
(85, 67) -> 430 (b'UC') had 725 occurrences\n", "merge 176/256: (285, 110) -> 431 (b'own') had 717 occurrences\n", "merge 177/256: (104, 279) -> 432 (b'his ') had 706 occurrences\n", "merge 178/256: (351, 71) -> 433 (b'ING') had 703 occurrences\n", "merge 179/256: (32, 284) -> 434 (b' and ') had 701 occurrences\n", "merge 180/256: (99, 274) -> 435 (b'con') had 700 occurrences\n", "merge 181/256: (110, 101) -> 436 (b'ne') had 699 occurrences\n", "merge 182/256: (97, 121) -> 437 (b'ay') had 697 occurrences\n", "merge 183/256: (101, 278) -> 438 (b'e.\\n\\n') had 693 occurrences\n", "merge 184/256: (114, 292) -> 439 (b'rom') had 690 occurrences\n", "merge 185/256: (105, 100) -> 440 (b'id') had 681 occurrences\n", "merge 186/256: (117, 115) -> 441 (b'us') had 679 occurrences\n", "merge 187/256: (262, 110) -> 442 (b'oun') had 677 occurrences\n", "merge 188/256: (65, 78) -> 443 (b'AN') had 677 occurrences\n", "merge 189/256: (109, 266) -> 444 (b'man') had 675 occurrences\n", "merge 190/256: (97, 103) -> 445 (b'ag') had 669 occurrences\n", "merge 191/256: (69, 82) -> 446 (b'ER') had 665 occurrences\n", "merge 192/256: (79, 82) -> 447 (b'OR') had 663 occurrences\n", "merge 193/256: (101, 258) -> 448 (b'et ') had 657 occurrences\n", "merge 194/256: (114, 280) -> 449 (b'res') had 655 occurrences\n", "merge 195/256: (305, 108) -> 450 (b'sel') had 649 occurrences\n", "merge 196/256: (290, 279) -> 451 (b' his ') had 647 occurrences\n", "merge 197/256: (101, 277) -> 452 (b'e,\\n') had 646 occurrences\n", "merge 198/256: (101, 116) -> 453 (b'et') had 643 occurrences\n", "merge 199/256: (99, 97) -> 454 (b'ca') had 642 occurrences\n", "merge 200/256: (32, 264) -> 455 (b' in') had 641 occurrences\n", "merge 201/256: (115, 276) -> 456 (b'sha') had 636 occurrences\n", "merge 202/256: (33, 10) -> 457 (b'!\\n') had 635 occurrences\n", "merge 203/256: (69, 84) -> 458 (b'ET') had 628 occurrences\n", "merge 204/256: (84, 334) -> 459 (b'That ') had 615 occurrences\n", "merge 205/256: (112, 
111) -> 460 (b'po') had 611 occurrences\n", "merge 206/256: (113, 117) -> 461 (b'qu') had 609 occurrences\n", "merge 207/256: (257, 265) -> 462 (b'thy ') had 596 occurrences\n", "merge 208/256: (33, 271) -> 463 (b'!\\n\\n') had 594 occurrences\n", "merge 209/256: (109, 268) -> 464 (b'mor') had 584 occurrences\n", "merge 210/256: (117, 108) -> 465 (b'ul') had 581 occurrences\n", "merge 211/256: (110, 269) -> 466 (b'no ') had 579 occurrences\n", "merge 212/256: (97, 109) -> 467 (b'am') had 577 occurrences\n", "merge 213/256: (273, 101) -> 468 (b' the') had 572 occurrences\n", "merge 214/256: (65, 267) -> 469 (b'A:\\n') had 570 occurrences\n", "merge 215/256: (118, 270) -> 470 (b'ven') had 569 occurrences\n", "merge 216/256: (98, 265) -> 471 (b'by ') had 566 occurrences\n", "merge 217/256: (115, 10) -> 472 (b's\\n') had 560 occurrences\n", "merge 218/256: (115, 112) -> 473 (b'sp') had 556 occurrences\n", "merge 219/256: (75, 433) -> 474 (b'KING') had 556 occurrences\n", "merge 220/256: (290, 315) -> 475 (b' him') had 553 occurrences\n", "merge 221/256: (257, 279) -> 476 (b'this ') had 552 occurrences\n", "merge 222/256: (273, 279) -> 477 (b' this ') had 552 occurrences\n", "merge 223/256: (104, 263) -> 478 (b'her') had 552 occurrences\n", "merge 224/256: (273, 311) -> 479 (b' that ') had 542 occurrences\n", "merge 225/256: (111, 257) -> 480 (b'oth') had 539 occurrences\n", "merge 226/256: (63, 10) -> 481 (b'?\\n') had 539 occurrences\n", "merge 227/256: (274, 103) -> 482 (b'ong') had 537 occurrences\n", "merge 228/256: (66, 336) -> 483 (b'But ') had 536 occurrences\n", "merge 229/256: (280, 258) -> 484 (b'est ') had 532 occurrences\n", "merge 230/256: (111, 261) -> 485 (b'o, ') had 528 occurrences\n", "merge 231/256: (98, 336) -> 486 (b'but ') had 526 occurrences\n", "merge 232/256: (32, 289) -> 487 (b' of') had 525 occurrences\n", "merge 233/256: (70, 268) -> 488 (b'For') had 522 occurrences\n", "merge 234/256: (115, 117) -> 489 (b'su') had 521 occurrences\n", "merge 
235/256: (288, 303) -> 490 (b' with') had 517 occurrences\n", "merge 236/256: (117, 116) -> 491 (b'ut') had 517 occurrences\n", "merge 237/256: (274, 256) -> 492 (b'one ') had 514 occurrences\n", "merge 238/256: (97, 275) -> 493 (b'all') had 512 occurrences\n", "merge 239/256: (73, 67) -> 494 (b'IC') had 509 occurrences\n", "merge 240/256: (270, 100) -> 495 (b'end') had 500 occurrences\n", "merge 241/256: (79, 76) -> 496 (b'OL') had 497 occurrences\n", "merge 242/256: (100, 269) -> 497 (b'do ') had 495 occurrences\n", "merge 243/256: (73, 288) -> 498 (b'I w') had 495 occurrences\n", "merge 244/256: (292, 256) -> 499 (b'ome ') had 494 occurrences\n", "merge 245/256: (107, 386) -> 500 (b'know') had 493 occurrences\n", "merge 246/256: (115, 277) -> 501 (b's,\\n') had 493 occurrences\n", "merge 247/256: (115, 299) -> 502 (b'sir') had 493 occurrences\n", "merge 248/256: (261, 284) -> 503 (b', and ') had 490 occurrences\n", "merge 249/256: (99, 116) -> 504 (b'ct') had 490 occurrences\n", "merge 250/256: (117, 259) -> 505 (b'us ') had 489 occurrences\n", "merge 251/256: (280, 259) -> 506 (b'ess ') had 486 occurrences\n", "merge 252/256: (450, 102) -> 507 (b'self') had 486 occurrences\n", "merge 253/256: (282, 116) -> 508 (b' st') had 486 occurrences\n", "merge 254/256: (97, 349) -> 509 (b'ake ') had 481 occurrences\n", "merge 255/256: (69, 76) -> 510 (b'EL') had 481 occurrences\n", "merge 256/256: (107, 291) -> 511 (b'king') had 479 occurrences\n" ] } ], "source": [ "# Train the basic tokenizer on the whole corpus. With vocab_size=512 the recorded log shows\n", "# 256 merges, assigned the new ids 256 through 511; verbose=True is what prints that log.\n", "basic_tokenizer = BasicTokenizer()\n", "basic_tokenizer.train(text, vocab_size=512, verbose=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Save under the shakespeare_basic prefix inside model_path. A later cell loads\n", "# shakespeare_basic.model from that directory, so save() presumably appends the\n", "# extension itself -- TODO confirm in tokenizer/basic_bpe.py.\n", "prefix = os.path.join(model_path, \"shakespeare_basic\")\n", "basic_tokenizer.save(prefix)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Regex tokenizer" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "merge 1/256: 
(32, 116) -> 256 (b' t') had 23837 occurrences\n", "merge 2/256: (104, 101) -> 257 (b'he') had 18203 occurrences\n", "merge 3/256: (32, 97) -> 258 (b' a') had 13541 occurrences\n", "merge 4/256: (111, 117) -> 259 (b'ou') had 12730 occurrences\n", "merge 5/256: (32, 115) -> 260 (b' s') had 12287 occurrences\n", "merge 6/256: (32, 109) -> 261 (b' m') had 10786 occurrences\n", "merge 7/256: (105, 110) -> 262 (b'in') had 10606 occurrences\n", "merge 8/256: (32, 119) -> 263 (b' w') had 10546 occurrences\n", "merge 9/256: (114, 101) -> 264 (b're') had 9843 occurrences\n", "merge 10/256: (104, 97) -> 265 (b'ha') had 9673 occurrences\n", "merge 11/256: (58, 10) -> 266 (b':\\n') had 8762 occurrences\n", "merge 12/256: (110, 100) -> 267 (b'nd') had 8730 occurrences\n", "merge 13/256: (256, 257) -> 268 (b' the') had 8684 occurrences\n", "merge 14/256: (32, 98) -> 269 (b' b') had 8463 occurrences\n", "merge 15/256: (105, 115) -> 270 (b'is') had 7526 occurrences\n", "merge 16/256: (111, 114) -> 271 (b'or') had 7297 occurrences\n", "merge 17/256: (10, 10) -> 272 (b'\\n\\n') had 7098 occurrences\n", "merge 18/256: (32, 102) -> 273 (b' f') had 6563 occurrences\n", "merge 19/256: (101, 114) -> 274 (b'er') had 6515 occurrences\n", "merge 20/256: (108, 108) -> 275 (b'll') had 6357 occurrences\n", "merge 21/256: (105, 116) -> 276 (b'it') had 6114 occurrences\n", "merge 22/256: (111, 110) -> 277 (b'on') had 5973 occurrences\n", "merge 23/256: (44, 10) -> 278 (b',\\n') had 5501 occurrences\n", "merge 24/256: (32, 100) -> 279 (b' d') had 5478 occurrences\n", "merge 25/256: (32, 99) -> 280 (b' c') had 5404 occurrences\n", "merge 26/256: (101, 115) -> 281 (b'es') had 5202 occurrences\n", "merge 27/256: (101, 110) -> 282 (b'en') had 5181 occurrences\n", "merge 28/256: (32, 110) -> 283 (b' n') had 5176 occurrences\n", "merge 29/256: (32, 108) -> 284 (b' l') had 5173 occurrences\n", "merge 30/256: (32, 121) -> 285 (b' y') had 5140 occurrences\n", "merge 31/256: (46, 272) -> 286 (b'.\\n\\n') 
had 5018 occurrences\n", "merge 32/256: (256, 104) -> 287 (b' th') had 4940 occurrences\n", "merge 33/256: (97, 114) -> 288 (b'ar') had 4884 occurrences\n", "merge 34/256: (32, 104) -> 289 (b' h') had 4702 occurrences\n", "merge 35/256: (32, 111) -> 290 (b' o') had 4693 occurrences\n", "merge 36/256: (256, 111) -> 291 (b' to') had 4666 occurrences\n", "merge 37/256: (285, 259) -> 292 (b' you') had 4587 occurrences\n", "merge 38/256: (32, 112) -> 293 (b' p') had 4490 occurrences\n", "merge 39/256: (265, 116) -> 294 (b'hat') had 4407 occurrences\n", "merge 40/256: (32, 73) -> 295 (b' I') had 4079 occurrences\n", "merge 41/256: (32, 257) -> 296 (b' he') had 4022 occurrences\n", "merge 42/256: (118, 101) -> 297 (b've') had 3900 occurrences\n", "merge 43/256: (111, 116) -> 298 (b'ot') had 3891 occurrences\n", "merge 44/256: (115, 116) -> 299 (b'st') had 3709 occurrences\n", "merge 45/256: (258, 267) -> 300 (b' and') had 3703 occurrences\n", "merge 46/256: (111, 119) -> 301 (b'ow') had 3686 occurrences\n", "merge 47/256: (262, 103) -> 302 (b'ing') had 3660 occurrences\n", "merge 48/256: (97, 110) -> 303 (b'an') had 3635 occurrences\n", "merge 49/256: (290, 102) -> 304 (b' of') had 3605 occurrences\n", "merge 50/256: (111, 109) -> 305 (b'om') had 3584 occurrences\n", "merge 51/256: (32, 103) -> 306 (b' g') had 3512 occurrences\n", "merge 52/256: (97, 116) -> 307 (b'at') had 3357 occurrences\n", "merge 53/256: (269, 101) -> 308 (b' be') had 3219 occurrences\n", "merge 54/256: (115, 101) -> 309 (b'se') had 3065 occurrences\n", "merge 55/256: (261, 121) -> 310 (b' my') had 2829 occurrences\n", "merge 56/256: (32, 262) -> 311 (b' in') had 2746 occurrences\n", "merge 57/256: (99, 101) -> 312 (b'ce') had 2740 occurrences\n", "merge 58/256: (32, 265) -> 313 (b' ha') had 2713 occurrences\n", "merge 59/256: (108, 101) -> 314 (b'le') had 2636 occurrences\n", "merge 60/256: (97, 121) -> 315 (b'ay') had 2568 occurrences\n", "merge 61/256: (108, 100) -> 316 (b'ld') had 2392 
occurrences\n", "merge 62/256: (105, 114) -> 317 (b'ir') had 2385 occurrences\n", "merge 63/256: (101, 116) -> 318 (b'et') had 2375 occurrences\n", "merge 64/256: (101, 100) -> 319 (b'ed') had 2344 occurrences\n", "merge 65/256: (117, 116) -> 320 (b'ut') had 2304 occurrences\n", "merge 66/256: (261, 101) -> 321 (b' me') had 2132 occurrences\n", "merge 67/256: (105, 109) -> 322 (b'im') had 2125 occurrences\n", "merge 68/256: (276, 104) -> 323 (b'ith') had 2089 occurrences\n", "merge 69/256: (39, 115) -> 324 (b\"'s\") had 2063 occurrences\n", "merge 70/256: (283, 298) -> 325 (b' not') had 2048 occurrences\n", "merge 71/256: (99, 104) -> 326 (b'ch') had 2016 occurrences\n", "merge 72/256: (256, 294) -> 327 (b' that') had 1962 occurrences\n", "merge 73/256: (32, 270) -> 328 (b' is') had 1955 occurrences\n", "merge 74/256: (103, 104) -> 329 (b'gh') had 1947 occurrences\n", "merge 75/256: (65, 267) -> 330 (b'And') had 1927 occurrences\n", "merge 76/256: (273, 271) -> 331 (b' for') had 1890 occurrences\n", "merge 77/256: (107, 101) -> 332 (b'ke') had 1864 occurrences\n", "merge 78/256: (32, 117) -> 333 (b' u') had 1850 occurrences\n", "merge 79/256: (259, 114) -> 334 (b'our') had 1837 occurrences\n", "merge 80/256: (263, 101) -> 335 (b' we') had 1816 occurrences\n", "merge 81/256: (111, 111) -> 336 (b'oo') had 1800 occurrences\n", "merge 82/256: (105, 275) -> 337 (b'ill') had 1763 occurrences\n", "merge 83/256: (32, 101) -> 338 (b' e') had 1733 occurrences\n", "merge 84/256: (257, 114) -> 339 (b'her') had 1710 occurrences\n", "merge 85/256: (59, 10) -> 340 (b';\\n') had 1688 occurrences\n", "merge 86/256: (263, 323) -> 341 (b' with') had 1676 occurrences\n", "merge 87/256: (46, 10) -> 342 (b'.\\n') had 1658 occurrences\n", "merge 88/256: (282, 116) -> 343 (b'ent') had 1638 occurrences\n", "merge 89/256: (32, 276) -> 344 (b' it') had 1627 occurrences\n", "merge 90/256: (292, 114) -> 345 (b' your') had 1610 occurrences\n", "merge 91/256: (97, 100) -> 346 (b'ad') had 1598 
occurrences\n", "merge 92/256: (114, 105) -> 347 (b'ri') had 1545 occurrences\n", "merge 93/256: (287, 259) -> 348 (b' thou') had 1496 occurrences\n", "merge 94/256: (260, 116) -> 349 (b' st') had 1478 occurrences\n", "merge 95/256: (39, 100) -> 350 (b\"'d\") had 1451 occurrences\n", "merge 96/256: (32, 107) -> 351 (b' k') had 1438 occurrences\n", "merge 97/256: (305, 101) -> 352 (b'ome') had 1436 occurrences\n", "merge 98/256: (289, 270) -> 353 (b' his') had 1415 occurrences\n", "merge 99/256: (329, 116) -> 354 (b'ght') had 1379 occurrences\n", "merge 100/256: (69, 78) -> 355 (b'EN') had 1373 occurrences\n", "merge 101/256: (271, 100) -> 356 (b'ord') had 1353 occurrences\n", "merge 102/256: (105, 100) -> 357 (b'id') had 1350 occurrences\n", "merge 103/256: (97, 115) -> 358 (b'as') had 1347 occurrences\n", "merge 104/256: (84, 257) -> 359 (b'The') had 1345 occurrences\n", "merge 105/256: (32, 264) -> 360 (b' re') had 1330 occurrences\n", "merge 106/256: (313, 297) -> 361 (b' have') had 1325 occurrences\n", "merge 107/256: (73, 78) -> 362 (b'IN') had 1313 occurrences\n", "merge 108/256: (108, 121) -> 363 (b'ly') had 1312 occurrences\n", "merge 109/256: (114, 97) -> 364 (b'ra') had 1303 occurrences\n", "merge 110/256: (284, 105) -> 365 (b' li') had 1299 occurrences\n", "merge 111/256: (63, 272) -> 366 (b'?\\n\\n') had 1294 occurrences\n", "merge 112/256: (289, 322) -> 367 (b' him') had 1293 occurrences\n", "merge 113/256: (117, 114) -> 368 (b'ur') had 1263 occurrences\n", "merge 114/256: (287, 270) -> 369 (b' this') had 1261 occurrences\n", "merge 115/256: (97, 108) -> 370 (b'al') had 1256 occurrences\n", "merge 116/256: (73, 79) -> 371 (b'IO') had 1254 occurrences\n", "merge 117/256: (260, 111) -> 372 (b' so') had 1238 occurrences\n", "merge 118/256: (258, 115) -> 373 (b' as') had 1209 occurrences\n", "merge 119/256: (279, 101) -> 374 (b' de') had 1202 occurrences\n", "merge 120/256: (32, 277) -> 375 (b' on') had 1178 occurrences\n", "merge 121/256: (111, 264) -> 
376 (b'ore') had 1147 occurrences\n", "merge 122/256: (114, 111) -> 377 (b'ro') had 1127 occurrences\n", "merge 123/256: (65, 82) -> 378 (b'AR') had 1124 occurrences\n", "merge 124/256: (104, 105) -> 379 (b'hi') had 1120 occurrences\n", "merge 125/256: (259, 316) -> 380 (b'ould') had 1098 occurrences\n", "merge 126/256: (336, 100) -> 381 (b'ood') had 1092 occurrences\n", "merge 127/256: (99, 107) -> 382 (b'ck') had 1056 occurrences\n", "merge 128/256: (97, 262) -> 383 (b'ain') had 1051 occurrences\n", "merge 129/256: (118, 274) -> 384 (b'ver') had 1042 occurrences\n", "merge 130/256: (281, 116) -> 385 (b'est') had 1008 occurrences\n", "merge 131/256: (287, 121) -> 386 (b' thy') had 994 occurrences\n", "merge 132/256: (260, 265) -> 387 (b' sha') had 993 occurrences\n", "merge 133/256: (281, 115) -> 388 (b'ess') had 990 occurrences\n", "merge 134/256: (101, 97) -> 389 (b'ea') had 972 occurrences\n", "merge 135/256: (279, 111) -> 390 (b' do') had 968 occurrences\n", "merge 136/256: (263, 337) -> 391 (b' will') had 966 occurrences\n", "merge 137/256: (97, 109) -> 392 (b'am') had 954 occurrences\n", "merge 138/256: (283, 111) -> 393 (b' no') had 943 occurrences\n", "merge 139/256: (269, 320) -> 394 (b' but') had 912 occurrences\n", "merge 140/256: (117, 115) -> 395 (b'us') had 907 occurrences\n", "merge 141/256: (97, 267) -> 396 (b'and') had 897 occurrences\n", "merge 142/256: (85, 83) -> 397 (b'US') had 895 occurrences\n", "merge 143/256: (105, 102) -> 398 (b'if') had 894 occurrences\n", "merge 144/256: (260, 101) -> 399 (b' se') had 881 occurrences\n", "merge 145/256: (103, 101) -> 400 (b'ge') had 877 occurrences\n", "merge 146/256: (258, 275) -> 401 (b' all') had 849 occurrences\n", "merge 147/256: (84, 104) -> 402 (b'Th') had 845 occurrences\n", "merge 148/256: (260, 117) -> 403 (b' su') had 830 occurrences\n", "merge 149/256: (97, 332) -> 404 (b'ake') had 830 occurrences\n", "merge 150/256: (84, 111) -> 405 (b'To') had 828 occurrences\n", "merge 151/256: (296, 114) 
-> 406 (b' her') had 811 occurrences\n", "merge 152/256: (114, 117) -> 407 (b'ru') had 810 occurrences\n", "merge 153/256: (105, 277) -> 408 (b'ion') had 808 occurrences\n", "merge 154/256: (116, 104) -> 409 (b'th') had 800 occurrences\n", "merge 155/256: (258, 110) -> 410 (b' an') had 789 occurrences\n", "merge 156/256: (116, 274) -> 411 (b'ter') had 786 occurrences\n", "merge 157/256: (288, 100) -> 412 (b'ard') had 786 occurrences\n", "merge 158/256: (284, 111) -> 413 (b' lo') had 782 occurrences\n", "merge 159/256: (265, 110) -> 414 (b'han') had 779 occurrences\n", "merge 160/256: (101, 275) -> 415 (b'ell') had 771 occurrences\n", "merge 161/256: (101, 288) -> 416 (b'ear') had 769 occurrences\n", "merge 162/256: (260, 112) -> 417 (b' sp') had 763 occurrences\n", "merge 163/256: (268, 101) -> 418 (b' thee') had 751 occurrences\n", "merge 164/256: (32, 334) -> 419 (b' our') had 742 occurrences\n", "merge 165/256: (273, 97) -> 420 (b' fa') had 740 occurrences\n", "merge 166/256: (387, 275) -> 421 (b' shall') had 740 occurrences\n", "merge 167/256: (269, 121) -> 422 (b' by') had 739 occurrences\n", "merge 168/256: (85, 67) -> 423 (b'UC') had 725 occurrences\n", "merge 169/256: (105, 108) -> 424 (b'il') had 709 occurrences\n", "merge 170/256: (258, 264) -> 425 (b' are') had 703 occurrences\n", "merge 171/256: (362, 71) -> 426 (b'ING') had 703 occurrences\n", "merge 172/256: (32, 67) -> 427 (b' C') had 693 occurrences\n", "merge 173/256: (283, 101) -> 428 (b' ne') had 690 occurrences\n", "merge 174/256: (114, 305) -> 429 (b'rom') had 690 occurrences\n", "merge 175/256: (104, 111) -> 430 (b'ho') had 686 occurrences\n", "merge 176/256: (351, 110) -> 431 (b' kn') had 683 occurrences\n", "merge 177/256: (65, 78) -> 432 (b'AN') had 677 occurrences\n", "merge 178/256: (32, 82) -> 433 (b' R') had 675 occurrences\n", "merge 179/256: (84, 294) -> 434 (b'That') had 671 occurrences\n", "merge 180/256: (32, 118) -> 435 (b' v') had 670 occurrences\n", "merge 181/256: (69, 82) -> 
436 (b'ER') had 665 occurrences\n", "merge 182/256: (97, 299) -> 437 (b'ast') had 663 occurrences\n", "merge 183/256: (79, 82) -> 438 (b'OR') had 663 occurrences\n", "merge 184/256: (99, 116) -> 439 (b'ct') had 661 occurrences\n", "merge 185/256: (259, 115) -> 440 (b'ous') had 661 occurrences\n", "merge 186/256: (263, 294) -> 441 (b' what') had 655 occurrences\n", "merge 187/256: (105, 354) -> 442 (b'ight') had 642 occurrences\n", "merge 188/256: (260, 104) -> 443 (b' sh') had 639 occurrences\n", "merge 189/256: (33, 10) -> 444 (b'!\\n') had 635 occurrences\n", "merge 190/256: (117, 108) -> 445 (b'ul') had 633 occurrences\n", "merge 191/256: (32, 39) -> 446 (b\" '\") had 628 occurrences\n", "merge 192/256: (69, 84) -> 447 (b'ET') had 628 occurrences\n", "merge 193/256: (303, 116) -> 448 (b'ant') had 627 occurrences\n", "merge 194/256: (69, 83) -> 449 (b'ES') had 624 occurrences\n", "merge 195/256: (333, 112) -> 450 (b' up') had 616 occurrences\n", "merge 196/256: (309, 108) -> 451 (b'sel') had 614 occurrences\n", "merge 197/256: (113, 117) -> 452 (b'qu') had 609 occurrences\n", "merge 198/256: (66, 320) -> 453 (b'But') had 608 occurrences\n", "merge 199/256: (288, 116) -> 454 (b'art') had 607 occurrences\n", "merge 200/256: (306, 381) -> 455 (b' good') had 599 occurrences\n", "merge 201/256: (33, 272) -> 456 (b'!\\n\\n') had 594 occurrences\n", "merge 202/256: (114, 301) -> 457 (b'row') had 592 occurrences\n", "merge 203/256: (307, 104) -> 458 (b'ath') had 591 occurrences\n", "merge 204/256: (262, 101) -> 459 (b'ine') had 591 occurrences\n", "merge 205/256: (284, 356) -> 460 (b' lord') had 589 occurrences\n", "merge 206/256: (379, 326) -> 461 (b'hich') had 587 occurrences\n", "merge 207/256: (110, 116) -> 462 (b'nt') had 586 occurrences\n", "merge 208/256: (117, 299) -> 463 (b'ust') had 585 occurrences\n", "merge 209/256: (39, 275) -> 464 (b\"'ll\") had 580 occurrences\n", "merge 210/256: (277, 101) -> 465 (b'one') had 578 occurrences\n", "merge 211/256: (293, 114) 
-> 466 (b' pr') had 577 occurrences\n", "merge 212/256: (280, 305) -> 467 (b' com') had 575 occurrences\n", "merge 213/256: (258, 116) -> 468 (b' at') had 574 occurrences\n", "merge 214/256: (261, 303) -> 469 (b' man') had 569 occurrences\n", "merge 215/256: (32, 77) -> 470 (b' M') had 567 occurrences\n", "merge 216/256: (87, 294) -> 471 (b'What') had 567 occurrences\n", "merge 217/256: (263, 257) -> 472 (b' whe') had 563 occurrences\n", "merge 218/256: (32, 69) -> 473 (b' E') had 559 occurrences\n", "merge 219/256: (75, 426) -> 474 (b'KING') had 556 occurrences\n", "merge 220/256: (258, 109) -> 475 (b' am') had 554 occurrences\n", "merge 221/256: (101, 267) -> 476 (b'end') had 553 occurrences\n", "merge 222/256: (105, 99) -> 477 (b'ic') had 548 occurrences\n", "merge 223/256: (280, 277) -> 478 (b' con') had 548 occurrences\n", "merge 224/256: (98, 314) -> 479 (b'ble') had 545 occurrences\n", "merge 225/256: (114, 121) -> 480 (b'ry') had 542 occurrences\n", "merge 226/256: (63, 10) -> 481 (b'?\\n') had 539 occurrences\n", "merge 227/256: (277, 103) -> 482 (b'ong') had 534 occurrences\n", "merge 228/256: (105, 101) -> 483 (b'ie') had 533 occurrences\n", "merge 229/256: (105, 297) -> 484 (b'ive') had 527 occurrences\n", "merge 230/256: (273, 429) -> 485 (b' from') had 527 occurrences\n", "merge 231/256: (269, 108) -> 486 (b' bl') had 527 occurrences\n", "merge 232/256: (118, 282) -> 487 (b'ven') had 521 occurrences\n", "merge 233/256: (32, 71) -> 488 (b' G') had 518 occurrences\n", "merge 234/256: (70, 271) -> 489 (b'For') had 517 occurrences\n", "merge 235/256: (260, 257) -> 490 (b' she') had 517 occurrences\n", "merge 236/256: (101, 109) -> 491 (b'em') had 515 occurrences\n", "merge 237/256: (306, 111) -> 492 (b' go') had 515 occurrences\n", "merge 238/256: (97, 264) -> 493 (b'are') had 514 occurrences\n", "merge 239/256: (261, 376) -> 494 (b' more') had 510 occurrences\n", "merge 240/256: (268, 109) -> 495 (b' them') had 509 occurrences\n", "merge 241/256: (259, 
116) -> 496 (b'out') had 509 occurrences\n", "merge 242/256: (73, 67) -> 497 (b'IC') had 509 occurrences\n", "merge 243/256: (263, 358) -> 498 (b' was') had 504 occurrences\n", "merge 244/256: (97, 117) -> 499 (b'au') had 504 occurrences\n", "merge 245/256: (298, 339) -> 500 (b'other') had 503 occurrences\n", "merge 246/256: (298, 104) -> 501 (b'oth') had 503 occurrences\n", "merge 247/256: (72, 101) -> 502 (b'He') had 500 occurrences\n", "merge 248/256: (260, 317) -> 503 (b' sir') had 500 occurrences\n", "merge 249/256: (111, 108) -> 504 (b'ol') had 498 occurrences\n", "merge 250/256: (283, 301) -> 505 (b' now') had 490 occurrences\n", "merge 251/256: (32, 76) -> 506 (b' L') had 489 occurrences\n", "merge 252/256: (32, 294) -> 507 (b' hat') had 488 occurrences\n", "merge 253/256: (32, 398) -> 508 (b' if') had 484 occurrences\n", "merge 254/256: (111, 299) -> 509 (b'ost') had 484 occurrences\n", "merge 255/256: (76, 79) -> 510 (b'LO') had 481 occurrences\n", "merge 256/256: (262, 100) -> 511 (b'ind') had 480 occurrences\n" ] } ], "source": [ "regex_tokenizer = RegexTokenizer()\n", "regex_tokenizer.train(text, vocab_size=512, verbose=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "prefix = os.path.join(model_path, \"shakespeare_regex\")\n", "regex_tokenizer.save(prefix)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "loading the tokenizer files and using it" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "model_path = os.path.join(os.getcwd(), \"tokenizer_model\")\n", "model_path = os.path.join(model_path, \"shakespeare_basic.model\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "basic_tokenizer = BasicTokenizer()\n", "basic_tokenizer.load(model_path)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "ids = basic_tokenizer.encode(\"Hello World\")" ] }, { 
"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "reverse = basic_tokenizer.decode(ids)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "mapping = [(str(i),basic_tokenizer.decode([i])) for i in ids]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def test_tokenizer(text):\n", " ids = basic_tokenizer.encode(text)\n", " decoded = basic_tokenizer.decode(ids)\n", " mapping = [(str(i), basic_tokenizer.decode([i])) for i in ids]\n", "\n", " return ids, decoded, mapping" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "([72, 389, 269, 87, 268, 108, 100],\n", " 'Hello World',\n", " [('72', 'H'),\n", " ('389', 'ell'),\n", " ('269', 'o '),\n", " ('87', 'W'),\n", " ('268', 'or'),\n", " ('108', 'l'),\n", " ('100', 'd')])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_tokenizer(\"Hello World\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import gradio as gr" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "with gr.Blocks() as demo:\n", " gr.HTML(\"

Token Generation with the Tiny Shakespeare BPE Tokenizer

\")\n", "\n", " with gr.Row():\n", " with gr.Column():\n", " inputs = [gr.TextArea(label = \"Enter initial text to generate tokens in Hindi\", lines = 10)]\n", " generate_btn = gr.Button(value = 'Generate Text')\n", " with gr.Column():\n", " enc = gr.Textbox(label = \"Encoded Tokens\")\n", " txt = gr.Textbox(label = \"Decoded Text from tokens\")\n", " map = gr.Textbox(label = \"Mapping of the tokens and respective texts\")\n", " outputs = [\n", " enc,\n", " txt,\n", " map\n", " ]\n", " \n", " generate_btn.click(fn = test_tokenizer, inputs= inputs, outputs = outputs)\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rerunning server... use `close()` to stop if you need to change `launch()` parameters.\n", "----\n", "Running on public URL: https://9ce8a9238acdc2d242.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "demo.launch(share=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "demo.close()" ] } ], "metadata": { "kernelspec": { "display_name": "torch_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }