{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d8c593f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-26T07:31:34.213141Z",
     "start_time": "2023-03-26T07:31:14.082603Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Install the third-party packages this notebook needs.\n",
    "# The %pip magic (rather than !pip) installs into the environment of the\n",
    "# running kernel instead of whichever `pip` executable is first on PATH.\n",
    "%pip install huggingface_hub\n",
    "%pip install datasets\n",
    "%pip install keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bca92d1d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-26T14:03:42.287776Z",
     "start_time": "2023-03-26T14:03:39.989670Z"
    }
   },
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "from datasets import load_dataset\n",
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras.applications.vgg16 import VGG16\n",
    "from tensorflow.keras.models import Model\n",
    "from tensorflow.keras.layers import Dense, GlobalAveragePooling2D\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from tensorflow.keras.utils import to_categorical\n",
    "from PIL import Image\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62254f94",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-26T14:03:42.317000Z",
     "start_time": "2023-03-26T14:03:42.289947Z"
    }
   },
   "outputs": [],
   "source": [
    "# Log in to the Hugging Face Hub from inside the notebook\n",
    "# (notebook_login is imported from huggingface_hub).\n",
    "# NOTE(review): presumably required to access the 'competitions/aiornot'\n",
    "# dataset loaded in the next cell - confirm.\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57308b59",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-26T14:03:52.591875Z",
     "start_time": "2023-03-26T14:03:48.476822Z"
    }
   },
   "outputs": [],
   "source": [
    "# Load the 'train' split of the dataset from the Hugging Face Hub and hold\n",
    "# out 15% of it for validation (no separate test split is created here).\n",
    "# A fixed seed keeps the shuffle/split identical across kernel restarts.\n",
    "SPLIT_SEED = 42\n",
    "splits = load_dataset('competitions/aiornot', split=\"train\").train_test_split(\n",
    "    test_size=0.15, seed=SPLIT_SEED\n",
    ")\n",
    "# Pick the splits by key rather than relying on the ordering of .values().\n",
    "train_ds, val_ds = splits['train'], splits['test']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b83b1536",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-03-26T14:04:10.210069Z",
     "start_time": "2023-03-26T14:03:53.833533Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of leading rows taken from each split for this experiment.\n",
    "data_sz = 1000\n",
    "\n",
    "# Slice each split once and reuse the result for both columns, instead of\n",
    "# slicing the dataset separately for 'image' and for 'label'.\n",
    "train_subset = train_ds[:data_sz]\n",
    "val_subset = val_ds[:data_sz]\n",
    "\n",
    "# Convert the input data to a NumPy array\n",
    "X_train = np.stack([np.array(image) for image in train_subset['image']])\n",
    "X_val = np.stack([np.array(image) for image in val_subset['image']])\n",
    "\n",
    "# One-hot encode the labels. num_classes is pinned to 2 so the target width\n",
    "# always matches the 2-unit softmax of the model, even if a subset happens\n",
    "# to contain only label 0 (to_categorical would otherwise infer 1 column).\n",
    "Y_train = to_categorical(train_subset['label'], num_classes=2)\n",
    "Y_val = to_categorical(val_subset['label'], num_classes=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72df9419",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-03-26T14:04:33.658Z"
    }
   },
   "outputs": [],
   "source": [
    "# Build a transfer-learning classifier: frozen VGG16 convolutional base plus\n",
    "# a small trainable head, then train it on the prepared arrays.\n",
    "# NOTE(review): the device index is hard-coded to the fourth GPU; adjust it\n",
    "# for machines with a different GPU layout.\n",
    "# NOTE(review): the images are fed to VGG16 without\n",
    "# tensorflow.keras.applications.vgg16.preprocess_input - confirm whether\n",
    "# that is intentional.\n",
    "with tf.device('/device:GPU:3'):\n",
    "    # Load the VGG16 model pre-trained on ImageNet, without its classifier head\n",
    "    base_model = VGG16(weights='imagenet', include_top=False)\n",
    "\n",
    "    # Add a global spatial average pooling layer\n",
    "    x = base_model.output\n",
    "    x = GlobalAveragePooling2D()(x)\n",
    "\n",
    "    # Add a fully-connected layer\n",
    "    x = Dense(1024, activation='relu')(x)\n",
    "\n",
    "    # Add a softmax output layer with one unit per class of the target variable\n",
    "    num_classes = 2\n",
    "    predictions = Dense(num_classes, activation='softmax')(x)\n",
    "\n",
    "    # Create the final model\n",
    "    model = Model(inputs=base_model.input, outputs=predictions)\n",
    "\n",
    "    # Freeze all layers in the base VGG16 model so only the new head is trained\n",
    "    for layer in base_model.layers:\n",
    "        layer.trainable = False\n",
    "\n",
    "    # Compile the model. `learning_rate` is the supported argument name; the\n",
    "    # old `lr` alias is deprecated and rejected by newer Keras releases.\n",
    "    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "# Train the model on your new dataset\n",
    "model.fit(X_train, Y_train, epochs=10, validation_data=(X_val, Y_val))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbf079b7",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-03-26T14:05:03.786Z"
    }
   },
   "outputs": [],
   "source": [
    "# Run the trained model over the validation images\n",
    "y_pred = model.predict(X_val)\n",
    "\n",
    "# Reduce the per-class scores and the one-hot labels to class indices\n",
    "y_pred_classes = np.argmax(y_pred, axis=1)\n",
    "y_true_classes = np.argmax(Y_val, axis=1)\n",
    "\n",
    "# Positions in the validation subset where prediction and label disagree\n",
    "is_wrong = y_pred_classes != y_true_classes\n",
    "misclassified_indices = np.where(is_wrong)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e639f6b",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-03-26T14:05:06.090Z"
    }
   },
   "outputs": [],
   "source": [
    "# A helper function to view misclassified data with the image and prediction\n",
    "def checkMiss(idx):\n",
    "    \"\"\"Print the label and prediction for one validation sample and show its image.\n",
    "\n",
    "    Reads the notebook-level arrays X_val, Y_val and y_pred.\n",
    "\n",
    "    Args:\n",
    "        idx: Index into X_val / Y_val / y_pred of the sample to inspect.\n",
    "    \"\"\"\n",
    "    print(\"\\ncorrect:\", Y_val[idx])\n",
    "    print(\"miss:\", y_pred[idx])\n",
    "    img = Image.fromarray(X_val[idx])\n",
    "    # Image.show() hands the picture to an external viewer program, which does\n",
    "    # not render in the notebook output (and shows nothing on a remote or\n",
    "    # headless kernel). display() is provided by the IPython kernel and\n",
    "    # renders the image inline under the printed values.\n",
    "    display(img)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "951ff24e",
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-03-26T14:05:07.650Z"
    }
   },
   "outputs": [],
   "source": [
    "# View up to 10 misclassified samples to see what could be improved.\n",
    "# Iterating over a slice (instead of indexing with range(10)) avoids an\n",
    "# IndexError when fewer than 10 samples were misclassified.\n",
    "for idx in misclassified_indices[:10]:\n",
    "    checkMiss(idx)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}