{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "XeyJCRFOLOvg" }, "source": [ "# **Evaluating the Recommendation Model**" ] }, { "cell_type": "code", "execution_count": 305, "metadata": { "id": "EWiqFUizLOvh" }, "outputs": [], "source": [ "import gradio as gr\n", "import torch\n", "import torch.nn as nn\n", "from joblib import load\n", "import sklearn" ] }, { "cell_type": "code", "execution_count": 306, "metadata": { "id": "egV9aaWzLOvk" }, "outputs": [], "source": [ "user_preferences = pd.read_csv('user_preferences.zip')" ] }, { "cell_type": "code", "execution_count": 307, "metadata": { "id": "-7EqGsy7LOvj" }, "outputs": [], "source": [ "# Define the same neural network model\n", "class ImprovedSongRecommender(nn.Module):\n", " def __init__(self, input_size, num_titles):\n", " super(ImprovedSongRecommender, self).__init__()\n", " self.fc1 = nn.Linear(input_size, 128)\n", " self.bn1 = nn.BatchNorm1d(128)\n", " self.fc2 = nn.Linear(128, 256)\n", " self.bn2 = nn.BatchNorm1d(256)\n", " self.fc3 = nn.Linear(256, 128)\n", " self.bn3 = nn.BatchNorm1d(128)\n", " self.output = nn.Linear(128, num_titles)\n", " self.dropout = nn.Dropout(0.5)\n", "\n", " def forward(self, x):\n", " x = torch.relu(self.bn1(self.fc1(x)))\n", " x = self.dropout(x)\n", " x = torch.relu(self.bn2(self.fc2(x)))\n", " x = self.dropout(x)\n", " x = torch.relu(self.bn3(self.fc3(x)))\n", " x = self.dropout(x)\n", " x = self.output(x)\n", " return x\n", "\n", "# Load the trained model\n", "model_path = \"improved_model.pth\"\n", "num_unique_titles = 4855" ] }, { "cell_type": "code", "execution_count": 308, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WnWXqoEeLOvk", "outputId": "bc9d2c9a-6e8c-40b8-8cff-303d23b38cbd" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ImprovedSongRecommender(\n", " (fc1): Linear(in_features=2, out_features=128, bias=True)\n", " (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", " (fc2): Linear(in_features=128, out_features=256, bias=True)\n", " (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", " (fc3): Linear(in_features=256, out_features=128, bias=True)\n", " (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", " (output): Linear(in_features=128, out_features=4855, bias=True)\n", " (dropout): Dropout(p=0.5, inplace=False)\n", ")" ] }, "metadata": {}, "execution_count": 308 } ], "source": [ "model = ImprovedSongRecommender(input_size=2, num_titles=num_unique_titles)\n", "model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))\n", "model.eval()" ] }, { "cell_type": "code", "execution_count": 309, "metadata": { "id": "s5acd8QeLOvk" }, "outputs": [], "source": [ "# Load the label encoders and scaler\n", "label_encoders_path = \"new_label_encoders.joblib\"\n", "scaler_path = \"new_scaler.joblib\"\n", "\n", "label_encoders = load(label_encoders_path)\n", "scaler = load(scaler_path)\n", "\n", "# Create a mapping from encoded indices to actual song titles\n", "index_to_song_title = {index: title for index, title in enumerate(label_encoders['title'].classes_)}\n" ] }, { "cell_type": "code", "source": [ "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n", "import joblib\n", "import re\n", "\n", "# Function to clean tags and artist names\n", "def clean_text(text):\n", " # Convert to lowercase\n", " text = text.lower()\n", " # Remove special characters and digits\n", " text = re.sub(r'[^a-zA-Z\\s]', '', text)\n", " # Remove extra white spaces\n", " text = re.sub(r'\\s+', ' ', text).strip()\n", " return text\n", "\n", "columns_to_check = ['tags', 'artist', 'tags', 'song', 'listeners', 'playcount'] # Specify the columns you want to check for NaN values\n", "user_preferences = user_preferences.dropna(subset=columns_to_check)\n", "\n", "\n", "# Clean 'tags' and 'artist_name' columns\n", "user_preferences['tags'] = user_preferences['tags'].apply(clean_text)\n", "user_preferences['artist'] = user_preferences['artist'].apply(clean_text)\n", "\n", "def label_encode_data(df):\n", " df = df.copy(deep=True)\n", " label_encoders = {}\n", " unknown_label = 'unknown' # Define an unknown label\n", "\n", " for column in ['tags', 'song', 'artist']:\n", " le = LabelEncoder()\n", " unique_categories = df[column].unique().tolist()\n", " unique_categories.append(unknown_label)\n", " le.fit(unique_categories)\n", " df[column] = le.transform(df[column].astype(str))\n", " label_encoders[column] = le\n", "\n", " return df, label_encoders\n", "\n", "# Normalize numerical features\n", "scaler = MinMaxScaler()\n", "user_preferences[['listeners', 'playcount']] = scaler.fit_transform(user_preferences[['listeners', 'playcount']])\n", "\n", "# Label encode categorical features\n", "df_scaled, label_encoders = label_encode_data(user_preferences.loc[:, ['tags', 'artist', 'listeners', 'playcount', 'song']])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qeuVdOrZMX2H", "outputId": "3e38f50d-a6fe-4ec4-eafe-c119ef4228fe" }, "execution_count": 310, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":20: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " user_preferences['tags'] = user_preferences['tags'].apply(clean_text)\n", ":21: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " user_preferences['artist'] = user_preferences['artist'].apply(clean_text)\n", ":40: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " user_preferences[['listeners', 'playcount']] = scaler.fit_transform(user_preferences[['listeners', 'playcount']])\n" ] } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split" ], "metadata": { "id": "f8Z0xtfCOWkC" }, "execution_count": 311, "outputs": [] }, { "cell_type": "code", "source": [ "# Split data into features and target\n", "X = df_scaled[['tags', 'artist']]\n", "y = df_scaled['song']\n", "\n", "# Split the dataset into training and testing sets\n", "X_valid, X_test, y_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "print(\"Data split into validation and testing sets.\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tuyHessoL9AS", "outputId": "9af89ed4-5ce3-423a-a60e-e6c012b35421" }, "execution_count": 312, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Data split into validation and testing sets.\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "from torch.utils.data import DataLoader\n", "import numpy as np\n", "from sklearn.metrics import accuracy_score" ], "metadata": { "id": "YO3SpUROPRIL" }, "execution_count": 313, "outputs": [] }, { "cell_type": "code", "source": [ "valid_loader = DataLoader(list(zip(X_valid.values.astype(float), y_valid)), batch_size=1, shuffle=True)\n", "test_loader = DataLoader(list(zip(X_test.values.astype(float), y_test)), batch_size=1, shuffle=False)\n" ], "metadata": { "id": "ddLMncl-Paj5" }, "execution_count": 314, "outputs": [] }, { "cell_type": "code", "source": [ "valid_accuracy = 0\n", "test_accuracy = 0\n", "for features, labels in valid_loader:\n", " preds = model(features.float().detach())\n", "\n", " # Get the predicted class (the one with the highest score)\n", " _, predicted_class = torch.max(preds, 1)\n", "\n", " # Convert to numpy arrays\n", " predicted_class_np = predicted_class.numpy()\n", " labels_np = labels.numpy()\n", "\n", " # Calculate accuracy\n", " accuracy = accuracy_score(labels_np, predicted_class_np)\n", " valid_accuracy += accuracy\n", "\n", "for features, labels in test_loader:\n", " preds = model(features.float())\n", " # Get the predicted class (the one with the highest score)\n", " _, predicted_class = torch.max(preds, 1)\n", "\n", " # Convert to numpy arrays\n", " predicted_class_np = predicted_class.numpy()\n", " labels_np = labels.numpy()\n", "\n", " # Calculate accuracy\n", " accuracy = accuracy_score(labels_np, predicted_class_np)\n", " test_accuracy += accuracy" ], "metadata": { "id": "CIH4yNETOR6r" }, "execution_count": 315, "outputs": [] }, { "cell_type": "code", "source": [ "print('The loss of the model on the unseen validation dataset is: ', valid_accuracy)\n", "print('The loss of the model on the unseen test dataset is: ', test_accuracy)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Tf5kf1dMOpdw", "outputId": "5377af95-5412-4593-e4b7-c74ec03425a0" }, "execution_count": 316, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The loss of the model on the unseen validation dataset is: 2.0\n", "The loss of the model on the unseen test dataset is: 0.0\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "TYbj1lHYQZtg" }, "execution_count": 316, "outputs": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.1" }, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 0 }