File size: 5,139 Bytes

cd81fc7

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyPXgKZqJoVuio+h58qyoujZ",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/Erioldeth/Viet-Laos-Translator/blob/main/tokenizer_training.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "g6isgzoaxWTr",
        "outputId": "8ea96348-ea45-4d2e-e0a5-b76f3cfbb255"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ],
      "source": [
        "# Mount Google Drive at /content/drive; the training corpus is read from it in a later cell.\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# %pip targets the running kernel's environment; the version is pinned to the one this notebook was run with.\n",
        "%pip install sentencepiece==0.1.99"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2ZYK4GgzzSG4",
        "outputId": "be748680-ccff-45e0-d0b5-91f9a9887608"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting sentencepiece\n",
            "  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: sentencepiece\n",
            "Successfully installed sentencepiece-0.1.99\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import sentencepiece as spm"
      ],
      "metadata": {
        "id": "N3j11OrLzxFC"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def train_sentencepiece_model(input_file, model_prefix, model_type='bpe', vocab_size=16000,\n",
        "                              max_sentence_length=10000, input_sentence_size=1000000,\n",
        "                              **trainer_options):\n",
        "    \"\"\"Train a SentencePiece model on a text corpus.\n",
        "\n",
        "    The trained model is written next to the notebook as `<model_prefix>.model`.\n",
        "\n",
        "    Args:\n",
        "        input_file: Path of the training corpus, forwarded as the trainer's `input`.\n",
        "        model_prefix: Prefix of the output files, forwarded as `model_prefix`.\n",
        "        model_type: Forwarded as the trainer's `model_type` (default 'bpe').\n",
        "        vocab_size: Forwarded as the trainer's `vocab_size`.\n",
        "        max_sentence_length: Forwarded as the trainer option of the same name.\n",
        "        input_sentence_size: Forwarded as the trainer option of the same name.\n",
        "        **trainer_options: Any further `SentencePieceTrainer.train` keyword options;\n",
        "            these take precedence over the defaults set below.\n",
        "    \"\"\"\n",
        "    options = dict(\n",
        "        input=input_file,\n",
        "        model_prefix=model_prefix,\n",
        "        vocab_size=vocab_size,\n",
        "        model_type=model_type,\n",
        "        max_sentence_length=max_sentence_length,\n",
        "        input_sentence_size=input_sentence_size,\n",
        "        # Whitespace splitting is disabled so that a learned piece may span several\n",
        "        # space-separated syllables (e.g. a multi-syllable Vietnamese word).\n",
        "        split_by_whitespace=\"false\",\n",
        "    )\n",
        "    options.update(trainer_options)\n",
        "    spm.SentencePieceTrainer.train(**options)"
      ],
      "metadata": {
        "id": "LITW3pSpz2Vp"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Absolute path under the Drive mount point, so the call does not depend on the current working directory.\n",
        "train_sentencepiece_model('/content/drive/MyDrive/vi.txt', 'tokenizer.vi_bpe')"
      ],
      "metadata": {
        "id": "DuMgFiV60C_K"
      },
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def tokenize_text(text, model_path, out_type=str):\n",
        "    \"\"\"Tokenize `text` with the SentencePiece model stored at `model_path`.\n",
        "\n",
        "    The function is language-agnostic: the language is determined solely by the\n",
        "    model file that is loaded.\n",
        "\n",
        "    Args:\n",
        "        text: The text to encode.\n",
        "        model_path: Path of a trained SentencePiece `.model` file.\n",
        "        out_type: Forwarded to `encode`; the default `str` yields piece strings.\n",
        "\n",
        "    Returns:\n",
        "        The result of `SentencePieceProcessor.encode`; with the default\n",
        "        `out_type`, a list of piece strings.\n",
        "    \"\"\"\n",
        "    # The model is loaded on every call; load once and reuse the processor if\n",
        "    # this is ever called in a loop.\n",
        "    sp = spm.SentencePieceProcessor()\n",
        "    sp.load(model_path)\n",
        "\n",
        "    return sp.encode(text, out_type=out_type)"
      ],
      "metadata": {
        "id": "n3f4z8Ky6PcQ"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Smoke-test the trained Vietnamese tokenizer on a single sample sentence.\n",
        "model_path = 'tokenizer.vi_bpe.model'\n",
        "sentence = \"Cuộc thi sáng tác truyện tranh đến từ Nhật Bản, dành cho các họa sĩ Việt Nam!\"\n",
        "\n",
        "tokens = tokenize_text(sentence, model_path)\n",
        "print(\"Tokens:\", tokens)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Zyjy-JJgHxUm",
        "outputId": "e797ad00-0192-456a-963f-eed44274eae9"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Tokens: ['▁Cuộc▁thi', '▁sáng▁tác', '▁truyện', '▁tranh', '▁đến▁từ', '▁Nhật▁Bản', ',', '▁dành▁cho▁các', '▁họa▁sĩ', '▁Việt▁Nam', '!']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "IN85a_AcHz2G"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}