{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyPXgKZqJoVuio+h58qyoujZ",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "g6isgzoaxWTr",
        "outputId": "8ea96348-ea45-4d2e-e0a5-b76f3cfbb255"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Pinned to the version this notebook was last run with; %pip targets the kernel's environment.\n",
        "%pip install sentencepiece==0.1.99"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2ZYK4GgzzSG4",
        "outputId": "be748680-ccff-45e0-d0b5-91f9a9887608"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting sentencepiece\n",
            " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: sentencepiece\n",
            "Successfully installed sentencepiece-0.1.99\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import sentencepiece as spm"
      ],
      "metadata": {
        "id": "N3j11OrLzxFC"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def train_sentencepiece_model(input_file, model_prefix, model_type='bpe', vocab_size=16000,\n",
        "                              max_sentence_length=10000, input_sentence_size=1000000):\n",
        "    \"\"\"Train a SentencePiece model on a plain-text corpus.\n",
        "\n",
        "    Args:\n",
        "        input_file: Path to the training corpus.\n",
        "        model_prefix: Prefix of the files the trainer writes; the trained model is\n",
        "            saved as '<model_prefix>.model'.\n",
        "        model_type: SentencePiece algorithm name (default 'bpe').\n",
        "        vocab_size: Target vocabulary size.\n",
        "        max_sentence_length: Forwarded to the trainer's max_sentence_length option.\n",
        "        input_sentence_size: Forwarded to the trainer's input_sentence_size option.\n",
        "    \"\"\"\n",
        "    spm.SentencePieceTrainer.train(\n",
        "        input=input_file,\n",
        "        model_prefix=model_prefix,\n",
        "        vocab_size=vocab_size,\n",
        "        model_type=model_type,\n",
        "        max_sentence_length=max_sentence_length,\n",
        "        input_sentence_size=input_sentence_size,\n",
        "        # Let pieces span spaces so frequent multi-syllable Vietnamese words\n",
        "        # (e.g. '▁Việt▁Nam') can become single tokens.\n",
        "        split_by_whitespace=False,\n",
        "    )"
      ],
      "metadata": {
        "id": "LITW3pSpz2Vp"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Absolute path so the cell does not depend on the current working directory;\n",
        "# '/content/drive' is the mount point used in the first cell.\n",
        "train_sentencepiece_model('/content/drive/MyDrive/vi.txt', 'tokenizer.vi_bpe')"
      ],
      "metadata": {
        "id": "DuMgFiV60C_K"
      },
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def tokenize_text(text, model_path):\n",
        "    \"\"\"Split text into subword pieces with a trained SentencePiece model.\n",
        "\n",
        "    Args:\n",
        "        text: The sentence to tokenize.\n",
        "        model_path: Path to a trained '.model' file.\n",
        "\n",
        "    Returns:\n",
        "        The list of string pieces produced by the model for `text`.\n",
        "    \"\"\"\n",
        "    # Load the trained model from disk\n",
        "    sp = spm.SentencePieceProcessor()\n",
        "    sp.load(model_path)\n",
        "\n",
        "    # out_type=str yields the piece strings instead of integer ids\n",
        "    return sp.encode(text, out_type=str)"
      ],
      "metadata": {
        "id": "n3f4z8Ky6PcQ"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "sentence = \"Cuộc thi sáng tác truyện tranh đến từ Nhật Bản, dành cho các họa sĩ Việt Nam!\"\n",
        "model_path = 'tokenizer.vi_bpe.model'\n",
        "tokens = tokenize_text(sentence, model_path)\n",
        "print(\"Tokens:\", tokens)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Zyjy-JJgHxUm",
        "outputId": "e797ad00-0192-456a-963f-eed44274eae9"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tokens: ['▁Cuộc▁thi', '▁sáng▁tác', '▁truyện', '▁tranh', '▁đến▁từ', '▁Nhật▁Bản', ',', '▁dành▁cho▁các', '▁họa▁sĩ', '▁Việt▁Nam', '!']\n"
          ]
        }
      ]
    }
  ]
}