diff --git "a/notebooks/weather_classification.ipynb" "b/notebooks/weather_classification.ipynb"
new file mode 100755--- /dev/null
+++ "b/notebooks/weather_classification.ipynb"
@@ -0,0 +1,10562 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import plotly.express as px\n",
+ "import plotly.graph_objects as go\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder\n",
+ "from sklearn.metrics import mean_squared_error\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "import math, time, pickle\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " date | \n",
+ " precipitation | \n",
+ " temp_max | \n",
+ " temp_min | \n",
+ " wind | \n",
+ " weather | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2012-01-01 | \n",
+ " 0.0 | \n",
+ " 12.8 | \n",
+ " 5.0 | \n",
+ " 4.7 | \n",
+ " drizzle | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2012-01-02 | \n",
+ " 10.9 | \n",
+ " 10.6 | \n",
+ " 2.8 | \n",
+ " 4.5 | \n",
+ " rain | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2012-01-03 | \n",
+ " 0.8 | \n",
+ " 11.7 | \n",
+ " 7.2 | \n",
+ " 2.3 | \n",
+ " rain | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2012-01-04 | \n",
+ " 20.3 | \n",
+ " 12.2 | \n",
+ " 5.6 | \n",
+ " 4.7 | \n",
+ " rain | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2012-01-05 | \n",
+ " 1.3 | \n",
+ " 8.9 | \n",
+ " 2.8 | \n",
+ " 6.1 | \n",
+ " rain | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " date precipitation temp_max temp_min wind weather\n",
+ "0 2012-01-01 0.0 12.8 5.0 4.7 drizzle\n",
+ "1 2012-01-02 10.9 10.6 2.8 4.5 rain\n",
+ "2 2012-01-03 0.8 11.7 7.2 2.3 rain\n",
+ "3 2012-01-04 20.3 12.2 5.6 4.7 rain\n",
+ "4 2012-01-05 1.3 8.9 2.8 6.1 rain"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_csv(\"./data/seattle-weather.csv\")\n",
+ "df = df.sort_values('date')\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns with NaNs: 0\n",
+ "Weather Outcomes: ['drizzle' 'rain' 'sun' 'snow' 'fog']\n",
+ "Outcome Distribution: \n",
+ "weather\n",
+ "rain 641\n",
+ "sun 640\n",
+ "fog 101\n",
+ "drizzle 53\n",
+ "snow 26\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "nans = df.isnull().sum()\n",
+ "print(f\"Number of columns with NaNs: {len(nans[nans > 0])}\")\n",
+ "print(f\"Weather Outcomes: {df['weather'].unique()}\")\n",
+ "print(f\"Outcome Distribution: \\n{df['weather'].value_counts()}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precipitation temp_max temp_min wind\n",
+ "precipitation 1.000000 -0.228555 -0.072684 0.328045\n",
+ "temp_max -0.228555 1.000000 0.875687 -0.164857\n",
+ "temp_min -0.072684 0.875687 1.000000 -0.074185\n",
+ "wind 0.328045 -0.164857 -0.074185 1.000000\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Now calculate the correlation matrix\n",
+ "corr = df.corr(numeric_only = True)\n",
+ "print(corr)\n",
+ "\n",
+ "# plt.figure(figsize =(15, 12))\n",
+ "# sns.heatmap(corr)\n",
+ "# plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 1461.000000\n",
+ "mean 3.029432\n",
+ "std 6.680194\n",
+ "min 0.000000\n",
+ "25% 0.000000\n",
+ "50% 0.000000\n",
+ "75% 2.800000\n",
+ "max 55.900000\n",
+ "Name: precipitation, dtype: float64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['precipitation'].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 1461.000000\n",
+ "mean 8.234771\n",
+ "std 5.023004\n",
+ "min -7.100000\n",
+ "25% 4.400000\n",
+ "50% 8.300000\n",
+ "75% 12.200000\n",
+ "max 18.300000\n",
+ "Name: temp_min, dtype: float64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['temp_min'].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 1461.000000\n",
+ "mean 16.439083\n",
+ "std 7.349758\n",
+ "min -1.600000\n",
+ "25% 10.600000\n",
+ "50% 15.600000\n",
+ "75% 22.200000\n",
+ "max 35.600000\n",
+ "Name: temp_max, dtype: float64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['temp_max'].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "count 1461.000000\n",
+ "mean 3.241136\n",
+ "std 1.437825\n",
+ "min 0.400000\n",
+ "25% 2.200000\n",
+ "50% 3.000000\n",
+ "75% 4.000000\n",
+ "max 9.500000\n",
+ "Name: wind, dtype: float64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['wind'].describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.set_style(\"darkgrid\")\n",
+ "plt.figure(figsize = (15,9))\n",
+ "plt.plot(df['temp_max'], label = 'Temp Max')\n",
+ "plt.plot(df['temp_min'], label = 'Temp Min')\n",
+ "plt.xticks(range(0,df.shape[0],500),df['date'].loc[::500],rotation=45)\n",
+ "plt.title(\"Historical Temperature\",fontsize=18, fontweight='bold')\n",
+ "plt.xlabel('Date',fontsize=18)\n",
+ "plt.ylabel('Temperature (C)',fontsize=18)\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.set_style(\"darkgrid\")\n",
+ "plt.figure(figsize = (15,9))\n",
+ "plt.plot(df['precipitation'])\n",
+ "plt.xticks(range(0,df.shape[0],500),df['date'].loc[::500],rotation=45)\n",
+ "plt.title(\"Historical Precipitation\",fontsize=18, fontweight='bold')\n",
+ "plt.xlabel('Date',fontsize=18)\n",
+ "plt.ylabel('Precipitation (mm)',fontsize=18)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.set_style(\"darkgrid\")\n",
+ "plt.figure(figsize = (15,9))\n",
+ "plt.plot(df['wind'])\n",
+ "plt.xticks(range(0,df.shape[0],500),df['date'].loc[::500],rotation=45)\n",
+ "plt.title(\"Historical Wind\",fontsize=18, fontweight='bold')\n",
+ "plt.xlabel('Date',fontsize=18)\n",
+ "plt.ylabel('Wind (m/s)',fontsize=18)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1168, 4) (293, 4)\n",
+ "(1168,) (293,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Target variable: encode the weather labels as ordinal class ids\n",
+ "oe = OrdinalEncoder()\n",
+ "df['weather_ordinal'] = oe.fit_transform(df[['weather']])\n",
+ "\n",
+ "# Features: every column except the target (raw and encoded) and the date\n",
+ "y = df['weather_ordinal'].copy()\n",
+ "X = df.drop(columns = ['weather_ordinal', 'weather', 'date'])\n",
+ "\n",
+ "# Split test and train data BEFORE scaling so the scaler never sees held-out rows\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)\n",
+ "\n",
+ "# Create MinMaxScaler and fit it on the training features only (avoids test-set leakage)\n",
+ "mmscaler = MinMaxScaler()\n",
+ "sc = mmscaler.fit(X_train)\n",
+ "\n",
+ "# Scale input features with the train-fitted scaler; X is kept as the fully scaled matrix\n",
+ "X_train = sc.transform(X_train)\n",
+ "X_test = sc.transform(X_test)\n",
+ "X = sc.transform(X)\n",
+ "\n",
+ "print(X_train.shape, X_test.shape)\n",
+ "print(y_train.shape, y_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n_estimators = 10, Train accuracy = 0.9914383561643836, Test accuracy = 0.78839590443686\n",
+ "n_estimators = 15, Train accuracy = 0.9914383561643836, Test accuracy = 0.8156996587030717\n",
+ "n_estimators = 20, Train accuracy = 0.9931506849315068, Test accuracy = 0.8156996587030717\n",
+ "n_estimators = 25, Train accuracy = 0.9931506849315068, Test accuracy = 0.825938566552901\n",
+ "n_estimators = 30, Train accuracy = 0.9948630136986302, Test accuracy = 0.8191126279863481\n",
+ "n_estimators = 35, Train accuracy = 0.9965753424657534, Test accuracy = 0.825938566552901\n",
+ "n_estimators = 40, Train accuracy = 0.9957191780821918, Test accuracy = 0.8191126279863481\n",
+ "n_estimators = 45, Train accuracy = 0.9965753424657534, Test accuracy = 0.8293515358361775\n",
+ "n_estimators = 50, Train accuracy = 0.9965753424657534, Test accuracy = 0.8225255972696246\n",
+ "n_estimators = 55, Train accuracy = 0.9965753424657534, Test accuracy = 0.8225255972696246\n",
+ "n_estimators = 60, Train accuracy = 0.9965753424657534, Test accuracy = 0.825938566552901\n",
+ "n_estimators = 65, Train accuracy = 0.997431506849315, Test accuracy = 0.8225255972696246\n",
+ "n_estimators = 70, Train accuracy = 0.997431506849315, Test accuracy = 0.8191126279863481\n",
+ "n_estimators = 75, Train accuracy = 0.997431506849315, Test accuracy = 0.825938566552901\n",
+ "n_estimators = 80, Train accuracy = 0.997431506849315, Test accuracy = 0.8191126279863481\n",
+ "n_estimators = 85, Train accuracy = 0.997431506849315, Test accuracy = 0.8191126279863481\n",
+ "n_estimators = 90, Train accuracy = 0.997431506849315, Test accuracy = 0.8191126279863481\n",
+ "n_estimators = 95, Train accuracy = 0.997431506849315, Test accuracy = 0.8122866894197952\n",
+ "n_estimators = 100, Train accuracy = 0.997431506849315, Test accuracy = 0.8156996587030717\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Sweep the forest size and report accuracy on both splits for each setting\n",
+ "for n in range(10, 105, 5):\n",
+ "    r = RandomForestClassifier(n_estimators = n, random_state = 42).fit(X_train, y_train)\n",
+ "    train_score, test_score = r.score(X_train, y_train), r.score(X_test, y_test)\n",
+ "    print(f\"n_estimators = {n}, Train accuracy = {train_score}, Test accuracy = {test_score}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train score is: 0.9965753424657534\n",
+ "Test score is: 0.8293515358361775\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Random Forest\n",
+ "rfc = RandomForestClassifier(n_estimators = 45, random_state = 42)\n",
+ "rfc.fit(X_train, y_train)\n",
+ "print(f\"Train score is: {rfc.score(X_train, y_train)}\")\n",
+ "print(f\"Test score is: {rfc.score(X_test, y_test)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0.06, 0.6, 0.94, 0.1]\n",
+ "[2.]\n",
+ "['rain']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# These values are close to the prediction for rain\n",
+ "X_new = [[0.06, 0.6, 0.94, 0.1]]\n",
+ "y_new = [rfc.predict(X_new)]\n",
+ "print(X_new[0])\n",
+ "print(y_new[0])\n",
+ "print(oe.inverse_transform(y_new)[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create lookup for weather\n",
+ "images = {'drizzle': 'assets/drizzle.png', 'rain': 'assets/rain.png', 'sun': 'assets/sun.png', 'snow': 'assets/snow.png', 'fog': 'assets/fog.png'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save Model\n",
+ "# The objects are written back-to-back in a fixed order (model, encoder, scaler,\n",
+ "# image lookup); a reader must call pickle.load once per object in the same order.\n",
+ "# The context manager closes the file even if one of the dumps raises.\n",
+ "with open('weather_prediction_model.pkl', 'wb') as model_file:\n",
+ "    for artifact in (rfc, oe, sc, images):\n",
+ "        pickle.dump(artifact, model_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# GRU Model Class\n",
+ "# GRU Model Class\n",
+ "class GRU(nn.Module):\n",
+ "    \"\"\"Stacked GRU followed by a linear layer applied to the last time step.\"\"\"\n",
+ "\n",
+ "    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):\n",
+ "        super().__init__()\n",
+ "        self.hidden_dim = hidden_dim\n",
+ "        self.num_layers = num_layers\n",
+ "\n",
+ "        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature)\n",
+ "        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)\n",
+ "        self.fc = nn.Linear(hidden_dim, output_dim)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        # Zero initial hidden state, created on the input's device/dtype so the\n",
+ "        # model also works when x is not a default CPU float tensor. It is a\n",
+ "        # constant, so no gradient tracking (and no detach) is needed.\n",
+ "        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)\n",
+ "        out, _ = self.gru(x, h0)\n",
+ "        # Only the output of the last time step feeds the linear head\n",
+ "        return self.fc(out[:, -1, :])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_data(stock, lookback):\n",
+ "    \"\"\"Cut a series into sliding windows and split them 80/20 without shuffling.\n",
+ "\n",
+ "    Each window holds `lookback` consecutive rows: the first `lookback - 1` rows\n",
+ "    are the model input and the last row is the target.\n",
+ "\n",
+ "    Args:\n",
+ "        stock: 2-D array-like of shape (n_rows, n_features).\n",
+ "        lookback: number of rows per window (input steps plus the target row).\n",
+ "\n",
+ "    Returns:\n",
+ "        [X_train, y_train, X_test, y_test]; the test split is the trailing\n",
+ "        ~20% of the windows.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if `lookback` is not positive or the series is too short\n",
+ "            to produce a single window.\n",
+ "    \"\"\"\n",
+ "    data_raw = np.asarray(stock)\n",
+ "    n_windows = len(data_raw) - lookback\n",
+ "    if lookback < 1 or n_windows < 1:\n",
+ "        raise ValueError(\n",
+ "            f\"cannot build windows of length {lookback} from {len(data_raw)} rows\"\n",
+ "        )\n",
+ "\n",
+ "    # One window per start index; NOTE: the window ending on the final row is\n",
+ "    # not generated (kept as-is so existing shapes/results do not change).\n",
+ "    data = np.array([data_raw[start: start + lookback] for start in range(n_windows)])\n",
+ "\n",
+ "    test_set_size = int(np.round(0.2 * data.shape[0]))\n",
+ "    train_set_size = data.shape[0] - test_set_size\n",
+ "\n",
+ "    # Inputs are all but the last row of each window; targets are the last row\n",
+ "    X_train = data[:train_set_size, :-1, :]\n",
+ "    y_train = data[:train_set_size, -1, :]\n",
+ "\n",
+ "    X_test = data[train_set_size:, :-1, :]\n",
+ "    y_test = data[train_set_size:, -1, :]\n",
+ "\n",
+ "    return [X_train, y_train, X_test, y_test]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "precipitation = df['precipitation']\n",
+ "precip_scaler = MinMaxScaler(feature_range=(0, 1))\n",
+ "precipitation = precip_scaler.fit_transform(precipitation.values.reshape(-1,1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "temp_max = df['temp_max']\n",
+ "temp_max_scaler = MinMaxScaler(feature_range=(0, 1))\n",
+ "temp_max = temp_max_scaler.fit_transform(temp_max.values.reshape(-1,1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "temp_min = df['temp_min']\n",
+ "temp_min_scaler = MinMaxScaler(feature_range=(0, 1))\n",
+ "temp_min = temp_min_scaler.fit_transform(temp_min.values.reshape(-1,1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wind = df['wind']\n",
+ "wind_scaler = MinMaxScaler(feature_range=(0, 1))\n",
+ "wind = wind_scaler.fit_transform(wind.values.reshape(-1,1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "X_train.shape = (1163, 6, 1)\n",
+ "y_train.shape = (1163, 1)\n",
+ "X_test.shape = (291, 6, 1)\n",
+ "y_test.shape = (291, 1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "lookback = 7 # choose sequence length\n",
+ "X_train, y_train, X_test, y_test = split_data(precipitation, lookback)\n",
+ "print('X_train.shape = ',X_train.shape)\n",
+ "print('y_train.shape = ',y_train.shape)\n",
+ "print('X_test.shape = ',X_test.shape)\n",
+ "print('y_test.shape = ',y_test.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 159,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train = torch.from_numpy(X_train).type(torch.Tensor)\n",
+ "X_test = torch.from_numpy(X_test).type(torch.Tensor)\n",
+ "y_train_gru = torch.from_numpy(y_train).type(torch.Tensor)\n",
+ "y_test_gru = torch.from_numpy(y_test).type(torch.Tensor)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 160,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_dim = 1\n",
+ "hidden_dim = 32\n",
+ "num_layers = 4\n",
+ "output_dim = 1\n",
+ "num_epochs = 1000"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The GRU does not appear to model precipitation properly; a different approach may be needed.\n",
+ "- Verify whether it works for the other properties (i.e., temperature and wind)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = GRU(input_dim = input_dim, hidden_dim = hidden_dim, output_dim = output_dim, num_layers = num_layers)\n",
+ "criterion = torch.nn.MSELoss(reduction = 'mean')\n",
+ "optimiser = torch.optim.Adam(model.parameters(), lr = 0.001)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MSE: 0.043952811509370804\n",
+ "Training time: 29.618263006210327\n"
+ ]
+ }
+ ],
+ "source": [
+ "hist = np.zeros(num_epochs)\n",
+ "start_time = time.time()\n",
+ "gru = []\n",
+ "\n",
+ "for t in range(num_epochs):\n",
+ " y_train_pred = model(X_train)\n",
+ "\n",
+ " loss = criterion(y_train_pred, y_train_gru)\n",
+ " # print(f\"Epoch {t}, MSE: {loss.item()}\")\n",
+ " hist[t] = loss.item()\n",
+ "\n",
+ " optimiser.zero_grad()\n",
+ " loss.backward()\n",
+ " optimiser.step()\n",
+ "\n",
+ "training_time = time.time() - start_time \n",
+ "print(f\"MSE: {hist[-1]}\")\n",
+ "print(f\"Training time: {training_time}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predict = pd.DataFrame(precip_scaler.inverse_transform(y_train_pred.detach().numpy()))\n",
+ "original = pd.DataFrame(precip_scaler.inverse_transform(y_train_gru.detach().numpy()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "