{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# Environment setup" ], "metadata": { "id": "5CJSWZ0seqdi" } }, { "cell_type": "code", "source": [ "!npm install vietnamese-stopwords" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "chVA0nDcanqO", "outputId": "603f1824-4dd1-4eb9-c21c-1ad81a495300" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[K\u001b[?25h\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35msaveError\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m \u001b[0m\u001b[35menoent\u001b[0m ENOENT: no such file or directory, open '/content/package.json'\n", "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No description\n", "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No repository field.\n", "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No README data\n", "\u001b[0m\u001b[37;40mnpm\u001b[0m \u001b[0m\u001b[30;43mWARN\u001b[0m\u001b[35m\u001b[0m content No license field.\n", "\u001b[0m\n", "\u001b[K\u001b[?25h+ vietnamese-stopwords@0.0.2\n", "updated 1 package and audited 1 package in 0.851s\n", "found \u001b[92m0\u001b[0m vulnerabilities\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XGEfwBRWYkXq", "outputId": "72dba600-dc2e-4899-d541-2d8a7df97a68" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ] }, { "cell_type": "code", "source": [ "cd '/content/drive/MyDrive/CustomerReviewSentiment'" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_sH-b_JWYwVe", "outputId": "8f14dd56-8326-4321-91c6-978cc59b102b" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "/content/drive/MyDrive/CustomerReviewSentiment\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install sentence_transformers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yVsu_gxoqhQX", "outputId": "4362a21f-a065-4761-d79f-826e82fda4d8" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: sentence_transformers in /usr/local/lib/python3.10/dist-packages (2.2.2)\n", "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (4.35.2)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (4.66.1)\n", "Requirement already satisfied: torch>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (2.1.0+cu118)\n", "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (0.16.0+cu118)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (1.23.5)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (1.2.2)\n", "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (1.11.4)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (3.8.1)\n", "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (0.1.99)\n", "Requirement already satisfied: huggingface-hub>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from sentence_transformers) (0.19.4)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.13.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.31.0)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n", "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.4.0->sentence_transformers) (23.2)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence_transformers) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence_transformers) (3.2.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence_transformers) (3.1.2)\n", "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.6.0->sentence_transformers) (2.1.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n", "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.15.0)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.4.1)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->sentence_transformers) (8.1.7)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->sentence_transformers) (1.3.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence_transformers) (3.2.0)\n", "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /usr/local/lib/python3.10/dist-packages (from torchvision->sentence_transformers) (9.4.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.6.0->sentence_transformers) (2.1.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (3.6)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.4.0->sentence_transformers) (2023.11.17)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.6.0->sentence_transformers) (1.3.0)\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Data preparation" ], "metadata": { "id": "fmmgYwpCfP6L" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "\n", "data = pd.read_csv('clean_data.csv')" ], "metadata": { "id": "w9XJerFjfWAd" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "WlzOKiMBY1sI", "outputId": "957bc541-2ff4-4463-f1d3-857d393cc848" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Unnamed: 0 content score thumbsUpCount \\\n", "0 0 rất thuận tiện , có lợi ích . 5 2 \n", "1 1 tốt 4 213 \n", "2 2 ok 5 1 \n", "3 3 xài rất tốt , nên tải nha 5 8 \n", "4 4 gútttt 5 1 \n", "... ... ... ... ... \n", "310741 99995 ok 5 0 \n", "310742 99996 g o o d 5 0 \n", "310743 99997 có 5 0 \n", "310744 99998 nhiều lúc tôi k lấy đc hiệu ứng 1 0 \n", "310745 99999 tui chưa chơi ko biết có hay ko . 3 0 \n", "\n", " Application \n", "0 Messenger \n", "1 Messenger \n", "2 Messenger \n", "3 Messenger \n", "4 Messenger \n", "... ... \n", "310741 Instagram \n", "310742 Instagram \n", "310743 Instagram \n", "310744 Instagram \n", "310745 Instagram \n", "\n", "[310746 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0contentscorethumbsUpCountApplication
00rất thuận tiện , có lợi ích .52Messenger
11tốt4213Messenger
22ok51Messenger
33xài rất tốt , nên tải nha58Messenger
44gútttt51Messenger
..................
31074199995ok50Instagram
31074299996g o o d50Instagram
3107439999750Instagram
31074499998nhiều lúc tôi k lấy đc hiệu ứng10Instagram
31074599999tui chưa chơi ko biết có hay ko .30Instagram
\n", "

310746 rows × 5 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "print(data.columns[0])\n", "data = data.drop(data.columns[0], axis=1)\n", "data\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 441 }, "id": "OuEfYt0lY6dX", "outputId": "0c2b08dc-94b1-4dce-e3f4-fe30e6d85e93" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Unnamed: 0\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " content score thumbsUpCount Application\n", "0 rất thuận tiện , có lợi ích . 5 2 Messenger\n", "1 tốt 4 213 Messenger\n", "2 ok 5 1 Messenger\n", "3 xài rất tốt , nên tải nha 5 8 Messenger\n", "4 gútttt 5 1 Messenger\n", "... ... ... ... ...\n", "310741 ok 5 0 Instagram\n", "310742 g o o d 5 0 Instagram\n", "310743 có 5 0 Instagram\n", "310744 nhiều lúc tôi k lấy đc hiệu ứng 1 0 Instagram\n", "310745 tui chưa chơi ko biết có hay ko . 3 0 Instagram\n", "\n", "[310746 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentscorethumbsUpCountApplication
0rất thuận tiện , có lợi ích .52Messenger
1tốt4213Messenger
2ok51Messenger
3xài rất tốt , nên tải nha58Messenger
4gútttt51Messenger
...............
310741ok50Instagram
310742g o o d50Instagram
31074350Instagram
310744nhiều lúc tôi k lấy đc hiệu ứng10Instagram
310745tui chưa chơi ko biết có hay ko .30Instagram
\n", "

310746 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "markdown", "source": [ "#Text Preprocessing with NLTK" ], "metadata": { "id": "18LqqCwofbHv" } }, { "cell_type": "code", "source": [ "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "import re\n", "\n", "nltk.download('wordnet')\n", "\n", "def read_stop_words(file_path):\n", " with open(file_path, 'r', encoding='utf-8') as file:\n", " stop_words = [line.strip() for line in file]\n", " return stop_words\n", "\n", "stop_words_path = 'vietnamese-stopwords.txt'\n", "stop_words = set(read_stop_words(stop_words_path))\n", "\n", "lemmatizer = WordNetLemmatizer()\n", "\n", "def clean_text(text):\n", " if not isinstance(text, str):\n", " return ''\n", "\n", " text = re.sub(r'[^\\w\\s]', '', text, re.UNICODE)\n", " text = text.lower()\n", " text = [lemmatizer.lemmatize(token) for token in text.split(\" \")]\n", " text = [word for word in text if not word in stop_words]\n", " text = \" \".join(text)\n", " return text\n", "\n", "data['content'] = data.content.apply(lambda x: clean_text(x))\n" ], "metadata": { "id": "Ei4nRSH6ffk1", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "509b7890-deb4-42fa-f7d6-7ced353c1475" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ] }, { "cell_type": "code", "source": [ "def count_sentences(text):\n", " sentences = text.split(\".\")\n", " return len(sentences)\n", "\n", "data['sentence_count'] = data['content'].apply(count_sentences)\n", "data = data[data['sentence_count'] <= 1].drop(columns=['sentence_count'])\n", "data = data[0:1000]" ], "metadata": { "id": "kZeUdxbLM5xk" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 424 }, "id": "U2e1cuWwod7C", "outputId": "7941fc93-b7b9-406f-d648-72eef32936ca" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " content score thumbsUpCount Application\n", "0 thuận tiện lợi ích 5 2 Messenger\n", "1 4 213 Messenger\n", "2 ok 5 1 Messenger\n", "3 xài tải nha 5 8 Messenger\n", "4 gútttt 5 1 Messenger\n", ".. ... ... ... ...\n", "995 cập nhật clllll xóa mẹ mess đi 1 1 Messenger\n", "996 ứng dụng kém 1 0 Messenger\n", "997 tiếng vọng 1 0 Messenger\n", "998 ok 5 0 Messenger\n", "999 ghi 1 0 Messenger\n", "\n", "[1000 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentscorethumbsUpCountApplication
0thuận tiện lợi ích52Messenger
14213Messenger
2ok51Messenger
3xài tải nha58Messenger
4gútttt51Messenger
...............
995cập nhật clllll xóa mẹ mess đi11Messenger
996ứng dụng kém10Messenger
997tiếng vọng10Messenger
998ok50Messenger
999ghi10Messenger
\n", "

1000 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "markdown", "source": [ "#Feature Extraction" ], "metadata": { "id": "ckq3FCj-fkyo" } }, { "cell_type": "code", "source": [ "from sentence_transformers import SentenceTransformer\n", "import pandas as pd\n", "encode_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')\n", "encode_model.eval()\n", "encoded_values = encode_model.encode(list(data['content']), batch_size = 1000)\n", "encoded_df = pd.DataFrame(encoded_values, columns=[f'feature_{i}' for i in range(encoded_values.shape[1])])\n", "data = pd.concat([data, encoded_df], axis=1)\n", "data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "y3Wj22KVqv6q", "outputId": "92d46382-853d-48d3-bb99-f493c89b13ca" }, "execution_count": 31, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " content score thumbsUpCount Application \\\n", "0 thuận tiện lợi ích 5 2 Messenger \n", "1 4 213 Messenger \n", "2 ok 5 1 Messenger \n", "3 xài tải nha 5 8 Messenger \n", "4 gútttt 5 1 Messenger \n", ".. ... ... ... ... \n", "995 cập nhật clllll xóa mẹ mess đi 1 1 Messenger \n", "996 ứng dụng kém 1 0 Messenger \n", "997 tiếng vọng 1 0 Messenger \n", "998 ok 5 0 Messenger \n", "999 ghi 1 0 Messenger \n", "\n", " feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 ... \\\n", "0 0.007724 -0.029080 0.013690 -0.009045 0.023238 -0.000431 ... \n", "1 0.017581 -0.002738 -0.038254 -0.041900 -0.053605 0.027044 ... \n", "2 0.028191 -0.010388 -0.052607 -0.094258 0.026439 0.025329 ... \n", "3 -0.012584 -0.066111 -0.015364 -0.006006 -0.049788 0.031777 ... \n", "4 0.011783 -0.014584 -0.033242 -0.039972 -0.041038 0.000820 ... \n", ".. ... ... ... ... ... ... ... \n", "995 -0.003216 -0.013464 0.004712 0.002270 -0.053654 -0.008947 ... \n", "996 0.003408 0.011238 -0.003270 0.009822 0.010839 0.013731 ... \n", "997 0.009905 0.016094 -0.045669 -0.000068 -0.040511 0.034474 ... \n", "998 0.028191 -0.010388 -0.052607 -0.094258 0.026439 0.025329 ... \n", "999 0.028111 0.017755 -0.039428 -0.035738 -0.039431 0.019438 ... \n", "\n", " feature_502 feature_503 feature_504 feature_505 feature_506 \\\n", "0 -0.016954 -0.036230 -0.035321 0.018905 -0.004153 \n", "1 0.016105 -0.003971 0.019439 0.017352 -0.038702 \n", "2 0.044415 0.029121 0.023854 0.024050 0.003137 \n", "3 0.029615 -0.012054 0.038802 0.006297 0.011572 \n", "4 0.018747 -0.020979 0.011736 0.006028 -0.020612 \n", ".. ... ... ... ... ... \n", "995 0.005085 -0.007261 0.017898 0.018807 -0.013562 \n", "996 0.005113 -0.001555 0.023239 0.026508 -0.008224 \n", "997 0.018995 -0.012872 0.006261 -0.016766 -0.041759 \n", "998 0.044415 0.029121 0.023854 0.024050 0.003137 \n", "999 0.013316 -0.008019 0.014297 0.013871 -0.029063 \n", "\n", " feature_507 feature_508 feature_509 feature_510 feature_511 \n", "0 -0.050888 0.026344 -0.036549 0.005494 -0.022665 \n", "1 -0.057055 -0.019953 -0.015060 0.021243 -0.026957 \n", "2 -0.067067 -0.010747 0.015472 -0.007338 -0.064763 \n", "3 -0.025683 0.032493 -0.004576 0.034961 -0.028953 \n", "4 -0.021013 0.004732 -0.011790 -0.021903 0.000539 \n", ".. ... ... ... ... ... \n", "995 -0.015285 -0.032608 0.012056 0.017730 -0.023654 \n", "996 -0.027951 -0.023829 -0.005397 -0.006893 0.011582 \n", "997 -0.059727 -0.005747 -0.008027 0.004528 -0.001835 \n", "998 -0.067067 -0.010747 0.015472 -0.007338 -0.064763 \n", "999 -0.038971 -0.016512 -0.027867 0.003646 -0.013748 \n", "\n", "[1000 rows x 1028 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentscorethumbsUpCountApplicationfeature_0feature_1feature_2feature_3feature_4feature_5...feature_502feature_503feature_504feature_505feature_506feature_507feature_508feature_509feature_510feature_511
0thuận tiện lợi ích52Messenger0.007724-0.0290800.013690-0.0090450.023238-0.000431...-0.016954-0.036230-0.0353210.018905-0.004153-0.0508880.026344-0.0365490.005494-0.022665
14213Messenger0.017581-0.002738-0.038254-0.041900-0.0536050.027044...0.016105-0.0039710.0194390.017352-0.038702-0.057055-0.019953-0.0150600.021243-0.026957
2ok51Messenger0.028191-0.010388-0.052607-0.0942580.0264390.025329...0.0444150.0291210.0238540.0240500.003137-0.067067-0.0107470.015472-0.007338-0.064763
3xài tải nha58Messenger-0.012584-0.066111-0.015364-0.006006-0.0497880.031777...0.029615-0.0120540.0388020.0062970.011572-0.0256830.032493-0.0045760.034961-0.028953
4gútttt51Messenger0.011783-0.014584-0.033242-0.039972-0.0410380.000820...0.018747-0.0209790.0117360.006028-0.020612-0.0210130.004732-0.011790-0.0219030.000539
..................................................................
995cập nhật clllll xóa mẹ mess đi11Messenger-0.003216-0.0134640.0047120.002270-0.053654-0.008947...0.005085-0.0072610.0178980.018807-0.013562-0.015285-0.0326080.0120560.017730-0.023654
996ứng dụng kém10Messenger0.0034080.011238-0.0032700.0098220.0108390.013731...0.005113-0.0015550.0232390.026508-0.008224-0.027951-0.023829-0.005397-0.0068930.011582
997tiếng vọng10Messenger0.0099050.016094-0.045669-0.000068-0.0405110.034474...0.018995-0.0128720.006261-0.016766-0.041759-0.059727-0.005747-0.0080270.004528-0.001835
998ok50Messenger0.028191-0.010388-0.052607-0.0942580.0264390.025329...0.0444150.0291210.0238540.0240500.003137-0.067067-0.0107470.015472-0.007338-0.064763
999ghi10Messenger0.0281110.017755-0.039428-0.035738-0.0394310.019438...0.013316-0.0080190.0142970.013871-0.029063-0.038971-0.016512-0.0278670.003646-0.013748
\n", "

1000 rows × 1028 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 31 } ] }, { "cell_type": "code", "source": [ "data" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "q_R3rVBmL1ZW", "outputId": "7ce0b744-967b-4848-d80f-e72ad0c98bf5" }, "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " content score thumbsUpCount Application \\\n", "0 thuận tiện lợi ích 5 2 Messenger \n", "1 4 213 Messenger \n", "2 ok 5 1 Messenger \n", "3 xài tải nha 5 8 Messenger \n", "4 gútttt 5 1 Messenger \n", ".. ... ... ... ... \n", "995 cập nhật clllll xóa mẹ mess đi 1 1 Messenger \n", "996 ứng dụng kém 1 0 Messenger \n", "997 tiếng vọng 1 0 Messenger \n", "998 ok 5 0 Messenger \n", "999 ghi 1 0 Messenger \n", "\n", " feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 ... \\\n", "0 0.007724 -0.029080 0.013690 -0.009045 0.023238 -0.000431 ... \n", "1 0.017581 -0.002738 -0.038254 -0.041900 -0.053605 0.027044 ... \n", "2 0.028191 -0.010388 -0.052607 -0.094258 0.026439 0.025329 ... \n", "3 -0.012584 -0.066111 -0.015364 -0.006006 -0.049788 0.031777 ... \n", "4 0.011783 -0.014584 -0.033242 -0.039972 -0.041038 0.000820 ... \n", ".. ... ... ... ... ... ... ... \n", "995 -0.003216 -0.013464 0.004712 0.002270 -0.053654 -0.008947 ... \n", "996 0.003408 0.011238 -0.003270 0.009822 0.010839 0.013731 ... \n", "997 0.009905 0.016094 -0.045669 -0.000068 -0.040511 0.034474 ... \n", "998 0.028191 -0.010388 -0.052607 -0.094258 0.026439 0.025329 ... \n", "999 0.028111 0.017755 -0.039428 -0.035738 -0.039431 0.019438 ... \n", "\n", " feature_502 feature_503 feature_504 feature_505 feature_506 \\\n", "0 -0.016954 -0.036230 -0.035321 0.018905 -0.004153 \n", "1 0.016105 -0.003971 0.019439 0.017352 -0.038702 \n", "2 0.044415 0.029121 0.023854 0.024050 0.003137 \n", "3 0.029615 -0.012054 0.038802 0.006297 0.011572 \n", "4 0.018747 -0.020979 0.011736 0.006028 -0.020612 \n", ".. ... ... ... ... ... \n", "995 0.005085 -0.007261 0.017898 0.018807 -0.013562 \n", "996 0.005113 -0.001555 0.023239 0.026508 -0.008224 \n", "997 0.018995 -0.012872 0.006261 -0.016766 -0.041759 \n", "998 0.044415 0.029121 0.023854 0.024050 0.003137 \n", "999 0.013316 -0.008019 0.014297 0.013871 -0.029063 \n", "\n", " feature_507 feature_508 feature_509 feature_510 feature_511 \n", "0 -0.050888 0.026344 -0.036549 0.005494 -0.022665 \n", "1 -0.057055 -0.019953 -0.015060 0.021243 -0.026957 \n", "2 -0.067067 -0.010747 0.015472 -0.007338 -0.064763 \n", "3 -0.025683 0.032493 -0.004576 0.034961 -0.028953 \n", "4 -0.021013 0.004732 -0.011790 -0.021903 0.000539 \n", ".. ... ... ... ... ... \n", "995 -0.015285 -0.032608 0.012056 0.017730 -0.023654 \n", "996 -0.027951 -0.023829 -0.005397 -0.006893 0.011582 \n", "997 -0.059727 -0.005747 -0.008027 0.004528 -0.001835 \n", "998 -0.067067 -0.010747 0.015472 -0.007338 -0.064763 \n", "999 -0.038971 -0.016512 -0.027867 0.003646 -0.013748 \n", "\n", "[1000 rows x 516 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contentscorethumbsUpCountApplicationfeature_0feature_1feature_2feature_3feature_4feature_5...feature_502feature_503feature_504feature_505feature_506feature_507feature_508feature_509feature_510feature_511
0thuận tiện lợi ích52Messenger0.007724-0.0290800.013690-0.0090450.023238-0.000431...-0.016954-0.036230-0.0353210.018905-0.004153-0.0508880.026344-0.0365490.005494-0.022665
14213Messenger0.017581-0.002738-0.038254-0.041900-0.0536050.027044...0.016105-0.0039710.0194390.017352-0.038702-0.057055-0.019953-0.0150600.021243-0.026957
2ok51Messenger0.028191-0.010388-0.052607-0.0942580.0264390.025329...0.0444150.0291210.0238540.0240500.003137-0.067067-0.0107470.015472-0.007338-0.064763
3xài tải nha58Messenger-0.012584-0.066111-0.015364-0.006006-0.0497880.031777...0.029615-0.0120540.0388020.0062970.011572-0.0256830.032493-0.0045760.034961-0.028953
4gútttt51Messenger0.011783-0.014584-0.033242-0.039972-0.0410380.000820...0.018747-0.0209790.0117360.006028-0.020612-0.0210130.004732-0.011790-0.0219030.000539
..................................................................
995cập nhật clllll xóa mẹ mess đi11Messenger-0.003216-0.0134640.0047120.002270-0.053654-0.008947...0.005085-0.0072610.0178980.018807-0.013562-0.015285-0.0326080.0120560.017730-0.023654
996ứng dụng kém10Messenger0.0034080.011238-0.0032700.0098220.0108390.013731...0.005113-0.0015550.0232390.026508-0.008224-0.027951-0.023829-0.005397-0.0068930.011582
997tiếng vọng10Messenger0.0099050.016094-0.045669-0.000068-0.0405110.034474...0.018995-0.0128720.006261-0.016766-0.041759-0.059727-0.005747-0.0080270.004528-0.001835
998ok50Messenger0.028191-0.010388-0.052607-0.0942580.0264390.025329...0.0444150.0291210.0238540.0240500.003137-0.067067-0.0107470.015472-0.007338-0.064763
999ghi10Messenger0.0281110.017755-0.039428-0.035738-0.0394310.019438...0.013316-0.0080190.0142970.013871-0.029063-0.038971-0.016512-0.0278670.003646-0.013748
\n", "

1000 rows × 516 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "data_features = data.drop(columns=['Application', 'score', 'content']).values\n", "data_features.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wVbeG3K9u4Lv", "outputId": "a885bba0-97e8-4cbe-d830-9dfae35825b5" }, "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(1000, 513)" ] }, "metadata": {}, "execution_count": 25 } ] }, { "cell_type": "markdown", "source": [ "#Splitting the Dataset" ], "metadata": { "id": "2KxVFsypfruX" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(data_features, data['score'], test_size=0.2, random_state=42)" ], "metadata": { "id": "2vn3y0qwfwKP" }, "execution_count": 26, "outputs": [] }, { "cell_type": "markdown", "source": [ "#Training the XGBoost Classifier" ], "metadata": { "id": "cpcjVty9f4rd" } }, { "cell_type": "code", "source": [ "import xgboost as xgb\n", "\n", "from sklearn.preprocessing import LabelEncoder\n", "le = LabelEncoder()\n", "y_train = le.fit_transform(y_train)\n", "\n", "model = xgb.XGBClassifier(max_depth=10, n_estimators=1000, learning_rate=0.01)\n", "model.fit(X_train, y_train)" ], "metadata": { "id": "hPq1NZpnf6Iz", "colab": { "base_uri": "https://localhost:8080/", "height": 248 }, "outputId": "7c98091f-72b4-4dcc-a763-f9221c3c5517" }, "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.01, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=10, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=1000, n_jobs=None,\n", " num_parallel_tree=None, objective='multi:softprob', ...)" ], "text/html": [ "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
              "              colsample_bylevel=None, colsample_bynode=None,\n",
              "              colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
              "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
              "              gamma=None, grow_policy=None, importance_type=None,\n",
              "              interaction_constraints=None, learning_rate=0.01, max_bin=None,\n",
              "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
              "              max_delta_step=None, max_depth=10, max_leaves=None,\n",
              "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
              "              multi_strategy=None, n_estimators=1000, n_jobs=None,\n",
              "              num_parallel_tree=None, objective='multi:softprob', ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "markdown", "source": [ "#Making Predictions and Evaluating the Model" ], "metadata": { "id": "JxOsuc18f9rY" } }, { "cell_type": "code", "source": [ "predictions = model.predict(X_test)" ], "metadata": { "id": "z69ZrEEhgCZm" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.metrics import classification_report\n", "\n", "print(classification_report(y_test, predictions))" ], "metadata": { "id": "vIrg6tzPgF43", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "7df30b19-c0a0-4641-f482-d7a20943d7a3" }, "execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " precision recall f1-score support\n", "\n", " 0 0.00 0.00 0.00 0\n", " 1 0.33 0.01 0.03 77\n", " 2 0.40 0.12 0.19 16\n", " 3 0.15 0.10 0.12 21\n", " 4 0.06 0.24 0.10 17\n", " 5 0.00 0.00 0.00 69\n", "\n", " accuracy 0.04 200\n", " macro avg 0.16 0.08 0.07 200\n", "weighted avg 0.18 0.04 0.05 200\n", "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ] }, { "cell_type": "code", "source": [ "from sklearn.metrics import roc_auc_score\n", "y_test_encoded = le.fit_transform(y_test)\n", "pred_prob = model.predict_proba(X_test)\n", "print(y_test_encoded.shape)\n", "print(predictions.shape)\n", "auc = roc_auc_score(y_test_encoded, pred_prob, multi_class='ovr')\n", "print('AUC: %.2f' % auc)\n" ], "metadata": { "id": "iwAy6scFgK6i", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "15fea9d3-c33d-4763-d7bc-1d601aa79fa7" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(200,)\n", "(200,)\n", "AUC: 0.70\n" ] } ] } ] }