{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XeyJCRFOLOvg"
      },
      "source": [
        "# **Evaluating the Recommendation Model**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 305,
      "metadata": {
        "id": "EWiqFUizLOvh"
      },
      "outputs": [],
      "source": [
        "import gradio as gr\n",
        "import torch\n",
        "import torch.nn as nn\n",
        "from joblib import load\n",
        "import sklearn"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 306,
      "metadata": {
        "id": "egV9aaWzLOvk"
      },
      "outputs": [],
      "source": [
        "user_preferences = pd.read_csv('user_preferences.zip')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 307,
      "metadata": {
        "id": "-7EqGsy7LOvj"
      },
      "outputs": [],
      "source": [
        "# Define the same neural network model\n",
        "class ImprovedSongRecommender(nn.Module):\n",
        "    def __init__(self, input_size, num_titles):\n",
        "        super(ImprovedSongRecommender, self).__init__()\n",
        "        self.fc1 = nn.Linear(input_size, 128)\n",
        "        self.bn1 = nn.BatchNorm1d(128)\n",
        "        self.fc2 = nn.Linear(128, 256)\n",
        "        self.bn2 = nn.BatchNorm1d(256)\n",
        "        self.fc3 = nn.Linear(256, 128)\n",
        "        self.bn3 = nn.BatchNorm1d(128)\n",
        "        self.output = nn.Linear(128, num_titles)\n",
        "        self.dropout = nn.Dropout(0.5)\n",
        "\n",
        "    def forward(self, x):\n",
        "        x = torch.relu(self.bn1(self.fc1(x)))\n",
        "        x = self.dropout(x)\n",
        "        x = torch.relu(self.bn2(self.fc2(x)))\n",
        "        x = self.dropout(x)\n",
        "        x = torch.relu(self.bn3(self.fc3(x)))\n",
        "        x = self.dropout(x)\n",
        "        x = self.output(x)\n",
        "        return x\n",
        "\n",
        "# Load the trained model\n",
        "model_path = \"improved_model.pth\"\n",
        "num_unique_titles = 4855"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 308,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WnWXqoEeLOvk",
        "outputId": "bc9d2c9a-6e8c-40b8-8cff-303d23b38cbd"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "ImprovedSongRecommender(\n",
              "  (fc1): Linear(in_features=2, out_features=128, bias=True)\n",
              "  (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
              "  (fc2): Linear(in_features=128, out_features=256, bias=True)\n",
              "  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
              "  (fc3): Linear(in_features=256, out_features=128, bias=True)\n",
              "  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
              "  (output): Linear(in_features=128, out_features=4855, bias=True)\n",
              "  (dropout): Dropout(p=0.5, inplace=False)\n",
              ")"
            ]
          },
          "metadata": {},
          "execution_count": 308
        }
      ],
      "source": [
        "model = ImprovedSongRecommender(input_size=2, num_titles=num_unique_titles)\n",
        "model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))\n",
        "model.eval()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 309,
      "metadata": {
        "id": "s5acd8QeLOvk"
      },
      "outputs": [],
      "source": [
        "# Load the label encoders and scaler\n",
        "label_encoders_path = \"new_label_encoders.joblib\"\n",
        "scaler_path = \"new_scaler.joblib\"\n",
        "\n",
        "label_encoders = load(label_encoders_path)\n",
        "scaler = load(scaler_path)\n",
        "\n",
        "# Create a mapping from encoded indices to actual song titles\n",
        "index_to_song_title = {index: title for index, title in enumerate(label_encoders['title'].classes_)}\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
        "import joblib\n",
        "import re\n",
        "\n",
        "# Function to clean tags and artist names\n",
        "def clean_text(text):\n",
        "    # Convert to lowercase\n",
        "    text = text.lower()\n",
        "    # Remove special characters and digits\n",
        "    text = re.sub(r'[^a-zA-Z\\s]', '', text)\n",
        "    # Remove extra white spaces\n",
        "    text = re.sub(r'\\s+', ' ', text).strip()\n",
        "    return text\n",
        "\n",
        "columns_to_check = ['tags', 'artist', 'tags', 'song', 'listeners', 'playcount']  # Specify the columns you want to check for NaN values\n",
        "user_preferences = user_preferences.dropna(subset=columns_to_check)\n",
        "\n",
        "\n",
        "# Clean 'tags' and 'artist_name' columns\n",
        "user_preferences['tags'] = user_preferences['tags'].apply(clean_text)\n",
        "user_preferences['artist'] = user_preferences['artist'].apply(clean_text)\n",
        "\n",
        "def label_encode_data(df):\n",
        "    df = df.copy(deep=True)\n",
        "    label_encoders = {}\n",
        "    unknown_label = 'unknown'  # Define an unknown label\n",
        "\n",
        "    for column in ['tags', 'song', 'artist']:\n",
        "        le = LabelEncoder()\n",
        "        unique_categories = df[column].unique().tolist()\n",
        "        unique_categories.append(unknown_label)\n",
        "        le.fit(unique_categories)\n",
        "        df[column] = le.transform(df[column].astype(str))\n",
        "        label_encoders[column] = le\n",
        "\n",
        "    return df, label_encoders\n",
        "\n",
        "# Normalize numerical features\n",
        "scaler = MinMaxScaler()\n",
        "user_preferences[['listeners', 'playcount']] = scaler.fit_transform(user_preferences[['listeners', 'playcount']])\n",
        "\n",
        "# Label encode categorical features\n",
        "df_scaled, label_encoders = label_encode_data(user_preferences.loc[:, ['tags', 'artist', 'listeners', 'playcount', 'song']])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qeuVdOrZMX2H",
        "outputId": "3e38f50d-a6fe-4ec4-eafe-c119ef4228fe"
      },
      "execution_count": 310,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "<ipython-input-310-b2dbd9207146>:20: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  user_preferences['tags'] = user_preferences['tags'].apply(clean_text)\n",
            "<ipython-input-310-b2dbd9207146>:21: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  user_preferences['artist'] = user_preferences['artist'].apply(clean_text)\n",
            "<ipython-input-310-b2dbd9207146>:40: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  user_preferences[['listeners', 'playcount']] = scaler.fit_transform(user_preferences[['listeners', 'playcount']])\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split"
      ],
      "metadata": {
        "id": "f8Z0xtfCOWkC"
      },
      "execution_count": 311,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Split data into features and target\n",
        "X = df_scaled[['tags', 'artist']]\n",
        "y = df_scaled['song']\n",
        "\n",
        "# Split the dataset into training and testing sets\n",
        "X_valid, X_test, y_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
        "print(\"Data split into validation and testing sets.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tuyHessoL9AS",
        "outputId": "9af89ed4-5ce3-423a-a60e-e6c012b35421"
      },
      "execution_count": 312,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Data split into validation and testing sets.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "import torch.nn as nn\n",
        "import torch.optim as optim\n",
        "from torch.utils.data import DataLoader\n",
        "import numpy as np\n",
        "from sklearn.metrics import accuracy_score"
      ],
      "metadata": {
        "id": "YO3SpUROPRIL"
      },
      "execution_count": 313,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "valid_loader = DataLoader(list(zip(X_valid.values.astype(float), y_valid)), batch_size=1, shuffle=True)\n",
        "test_loader = DataLoader(list(zip(X_test.values.astype(float), y_test)), batch_size=1, shuffle=False)\n"
      ],
      "metadata": {
        "id": "ddLMncl-Paj5"
      },
      "execution_count": 314,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "valid_accuracy = 0\n",
        "test_accuracy = 0\n",
        "for features, labels in valid_loader:\n",
        "    preds = model(features.float().detach())\n",
        "\n",
        "    # Get the predicted class (the one with the highest score)\n",
        "    _, predicted_class = torch.max(preds, 1)\n",
        "\n",
        "    # Convert to numpy arrays\n",
        "    predicted_class_np = predicted_class.numpy()\n",
        "    labels_np = labels.numpy()\n",
        "\n",
        "    # Calculate accuracy\n",
        "    accuracy = accuracy_score(labels_np, predicted_class_np)\n",
        "    valid_accuracy += accuracy\n",
        "\n",
        "for features, labels in test_loader:\n",
        "    preds = model(features.float())\n",
        "    # Get the predicted class (the one with the highest score)\n",
        "    _, predicted_class = torch.max(preds, 1)\n",
        "\n",
        "    # Convert to numpy arrays\n",
        "    predicted_class_np = predicted_class.numpy()\n",
        "    labels_np = labels.numpy()\n",
        "\n",
        "    # Calculate accuracy\n",
        "    accuracy = accuracy_score(labels_np, predicted_class_np)\n",
        "    test_accuracy += accuracy"
      ],
      "metadata": {
        "id": "CIH4yNETOR6r"
      },
      "execution_count": 315,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print('The loss of the model on the unseen validation dataset is: ', valid_accuracy)\n",
        "print('The loss of the model on the unseen test dataset is: ', test_accuracy)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Tf5kf1dMOpdw",
        "outputId": "5377af95-5412-4593-e4b7-c74ec03425a0"
      },
      "execution_count": 316,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The loss of the model on the unseen validation dataset is:  2.0\n",
            "The loss of the model on the unseen test dataset is:  0.0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "TYbj1lHYQZtg"
      },
      "execution_count": 316,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "base",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.1"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}