# %% Imports (stdlib -> third-party -> local), all in one place
import datetime
import warnings
from datetime import time, timedelta

# The original wrapped these filters in `with warnings.catch_warnings():`, which
# restores the previous filters on exit and therefore suppressed nothing.
# Only FutureWarning is silenced here; other warnings stay visible.
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import yfinance as yf
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression  # Example model
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from tqdm import tqdm

from getDailyData import get_daily


# %% Helpers
def _expanding_exceedance_share(series):
    """Share of *earlier* observations that are strictly greater than each value.

    For position ``i`` the result is ``count(series[:i] > series[i]) / i``.
    Position 0 has no history and yields NaN. Comparisons involving NaN count
    as "not greater", but NaN rows still count in the denominator.

    Parameters
    ----------
    series : pd.Series
        Numeric series in chronological order.

    Returns
    -------
    pd.Series
        Float series aligned to ``series.index``.
    """
    values = series.to_numpy(dtype=float)
    shares = np.full(len(values), np.nan)
    for i in range(1, len(values)):
        shares[i] = np.count_nonzero(values[:i] > values[i]) / i
    return pd.Series(shares, index=series.index)


def _trailing_error_bounds(df_results, alpha=0.05):
    """Per-row prediction bands built from the errors of all *previous* rows.

    For row ``i`` the errors ``IsTrue - Predicted`` of rows ``[:i]`` are split
    into non-negative and negative sets; the upper bound is row i's prediction
    plus the ``1 - alpha`` quantile of the non-negative errors, and the lower
    bound is row i's prediction plus the ``alpha`` quantile of the negative
    errors. Rows for which either set is still empty get NaN for both bounds.

    Parameters
    ----------
    df_results : pd.DataFrame
        Must contain ``'IsTrue'`` and ``'Predicted'`` columns.
    alpha : float
        Tail probability used for both quantiles.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        ``(uppers, lowers)``, each of length ``len(df_results)``.
    """
    preds = df_results['Predicted'].to_numpy(dtype=float)
    errors = (df_results['IsTrue'] - df_results['Predicted']).to_numpy(dtype=float)
    uppers = np.full(len(preds), np.nan)
    lowers = np.full(len(preds), np.nan)

    for i in tqdm(range(len(preds)), desc='Calibrating Probas', total=len(preds)):
        past = errors[:i]
        over = past[past >= 0]
        under = past[past < 0]
        if over.size == 0 or under.size == 0:
            # Not enough history on one side yet; leave both bounds as NaN.
            continue
        # Centre the band on THIS row's prediction (the original used the last
        # row's prediction for every row).
        uppers[i] = preds[i] + np.quantile(over, 1 - alpha)
        lowers[i] = preds[i] + np.quantile(under, alpha)

    return uppers, lowers


# %% Quick look at recent SPX daily bars
spx = yf.Ticker('^GSPC')
spx.history(start='2023-11-20', interval='1d')

# %% Walk-forward regression of the daily close on the intraday move
now = datetime.datetime.now()
df_consolidated = pd.DataFrame()
results = {}
coefs = {}

morning_start = datetime.datetime.combine(now.date(), time(6, 30))
delta = now - morning_start
print(delta)

# Only the first 30-minute candle is evaluated; use np.arange(1, 13) to sweep more.
candles = np.arange(1, 2)
for candle in tqdm(candles):
    print(f'running for {str(candle)}')
    # Only `data` is used below; `df_final` is rebuilt from scratch further down.
    data, df_final, final_row = get_daily(mode='intra', periods_30m=candle)

    df_new = data[['Open','High','Low','Close','Close30','Close_VIX30','Close_VIX','Close_VVIX30','Close_VVIX']].copy()
    df_new['PrevClose'] = df_new['Close'].shift(1)
    df_new['CurrentGap'] = (df_new['Open'] / df_new['PrevClose']) - 1
    df_new['ClosePctIntra'] = (df_new['Close30'] / df_new['Close'].shift(1)) - 1
    df_new['ClosePctOpenIntra'] = (df_new['Close30'] / df_new['Open']) - 1
    df_new['ClosePctVIXIntra'] = (df_new['Close_VIX30'] / df_new['Close_VIX'].shift(1)) - 1
    df_new['ClosePctVVIXIntra'] = (df_new['Close_VVIX30'] / df_new['Close_VVIX'].shift(1)) - 1
    # EMA is shifted by one row so each row only sees prior closes.
    df_new['EMA8'] = df_new['Close'].ewm(8).mean().shift(1)
    df_new['EMA8Intra'] = df_new['Close30'] > df_new['EMA8']

    # Target will be the day's close
    df_new['ClosePct'] = (df_new['Close'] / df_new['Close'].shift(1)) - 1

    # Share of earlier rows whose intraday / daily move exceeded the current row's
    df_new['IntraPercentile'] = _expanding_exceedance_share(df_new['ClosePctIntra'])
    df_new['ClosePctPercentile'] = _expanding_exceedance_share(df_new['ClosePct'])

    # Let's do n-5 to start just for closes
    lags = np.arange(1, 6)
    for lag in lags:
        df_new[f'ClosePct_n{str(lag)}'] = df_new['ClosePct'].shift(lag)

    df_feats = df_new[[c for c in df_new.columns if 'ClosePct' in c or 'Intra' in c or 'Gap' in c]]
    df_final = df_feats.dropna()

    X = df_final[['ClosePctIntra']]  # Feature dataset
    y = df_final['ClosePct']  # Target dataset

    # One-step-ahead walk-forward: every fold tests exactly one row and trains
    # on all rows before it.
    tscv = TimeSeriesSplit(n_splits=len(df_final)-1, max_train_size=None, test_size=1)

    # Column selection depends only on X's columns/dtypes, so it is hoisted out
    # of the fold loop.
    categorical_features = X.select_dtypes(include='object').columns
    numeric_features = X.drop(columns=[c for c in X.columns if 'Percentile' in c]).select_dtypes(include='number').columns

    mae_scores = []
    overall_results = []

    for train_index, test_index in tscv.split(X):
        X_train = X.iloc[train_index]
        X_test = X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        # A fresh pipeline per fold; the last one stays bound to `pipeline`
        # and is inspected by later cells.
        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', RobustScaler(), numeric_features),
                ('categorical', OneHotEncoder(), categorical_features)
            ])
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', LinearRegression())
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        result_df = pd.DataFrame({'IsTrue': y_test, 'Predicted': y_pred}, index=y_test.index)
        overall_results.append(result_df)

    df_results = pd.concat(overall_results)

    alpha = 0.05
    uppers, lowers = _trailing_error_bounds(df_results, alpha=alpha)
    df_results['Upper'] = uppers
    df_results['Lower'] = lowers

    # NOTE(review): this relies on `data` (from get_daily) already carrying a
    # 'PrevClose' column; df_new['PrevClose'] is computed above but not used here.
    df_results = df_results.merge(data[['PrevClose']],left_index=True, right_index=True)
    df_results['Pred'] = df_results['PrevClose'] * (1 + df_results['Predicted'])
    df_results['Actual'] = df_results['PrevClose'] * (1 + df_results['IsTrue'])
    df_results['Up'] = df_results['PrevClose'] * (1 + df_results['Upper'])
    df_results['Down'] = df_results['PrevClose'] * (1 + df_results['Lower'])

    results[f'{str(int(candle))}'] = df_results

    # MAE over all one-step-ahead predictions
    average_mae = mean_absolute_error(df_results['IsTrue'], df_results['Predicted'])
    # Coefficients come from the LAST fold's fitted model, sorted by magnitude.
    sorted_features = sorted([(feat, coef) for feat, coef in zip(pipeline.feature_names_in_, pipeline.named_steps.model.coef_)], key=lambda x: abs(x[1]), reverse=True)

    coefs[f'{str(int(candle))}'] = pd.DataFrame(sorted_features, columns=['Feature','Coefficient'])

    df_consolidated.loc[int(candle), 'MAE'] = average_mae

# %% Coefficients of the last fitted fold
pipeline.named_steps['model'].coef_

# %% Coefficient tables stacked across candles
df_f = pd.concat(coefs)

# %% MAE per candle
df_consolidated

# %% Predicted vs actual close with calibrated bands
results[f'{str(candle)}'].loc['2023-10-01':, ['Pred','Actual','Up','Down']].plot();

# %% Coefficient table for the last candle run
coefs[f'{str(candle)}']
# %% Run the classifier for one model number and calibrate its latest prediction
from getDailyData import get_daily
from model_intra_v2 import walk_forward_validation
from model_day_v2 import walk_forward_validation_seq as walk_forward_validation_daily
from model_regr_v2 import walk_forward_validation as walk_forward_validation_regr
from model_regr_v2 import calc_upper_lower
import pandas as pd
import json
from dbConn import connection, engine, insert_dataframe_to_sql
import numpy as np
from datetime import time, timedelta
import datetime
from pandas.tseries.offsets import BDay
import holidays
from dotenv import load_dotenv
load_dotenv()

periods_30m = 1


def get_quantiles(df, col_name, q):
    """Mean of ``'IsTrue'`` within equal-width bins of ``df[col_name]``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``col_name`` and an ``'IsTrue'`` column.
    col_name : str
        Column that is binned with ``pd.cut``.
    q : int or sequence
        Passed straight to ``pd.cut`` as its ``bins`` argument.

    Returns
    -------
    pd.Series
        Indexed by the ``pd.Interval`` bins. ``observed=False`` is passed
        explicitly so empty bins are kept (as NaN), which is the behaviour the
        original relied on via the deprecated default.
    """
    return df.groupby(pd.cut(df[col_name], q), observed=False)['IsTrue'].mean()


def _calibrated_probability(bin_means, pct):
    """Look up the bin mean for ``pct``; NaN when ``pct`` lies outside every bin.

    Uses ``Interval.__contains__`` so edge values follow the same closed-side
    rule as ``pd.cut`` and exactly one bin can match.
    """
    for interval, mean_outcome in bin_means.items():
        if pct in interval:
            return mean_outcome
    return np.nan


if periods_30m > 0:
    data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
    # Classification model
    res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
    # Regression model
    regr_res, _ = walk_forward_validation_regr(df_final[['CurrentClose30toClose','ClosePct']].dropna(), 'ClosePct', 1, mode='single')
    df_regr_results = pd.read_sql_query(f'select * from reg_results where ModelNum = {str(periods_30m)}', con = engine)
    regr_pct = regr_res['Predicted'].iloc[-1]
    # `upper` / `lower` are not consumed in the visible part of this cell.
    upper, lower = calc_upper_lower(regr_pct, df_regr_results, alpha=0.05)

elif periods_30m == 0:
    data, df_final, final_row = get_daily()
    res, _, _ = walk_forward_validation_daily(df_final.dropna(), 'Target_clf', 'Target', 200, 1)

else:
    # Previously a negative value skipped both branches and surfaced later as a
    # NameError on `res` / `data`.
    raise ValueError(f'periods_30m must be >= 0, got {periods_30m!r}')

# Get results, run calibration and pvalue

df_results = pd.read_sql_query(f'select * from results where ModelNum = {str(periods_30m)}', con = engine)

# Calibrate Probabilities
pct = res['Predicted'].iloc[-1]

df_q = get_quantiles(df_results, 'Predicted', 10)
# `p` is always bound now; the original left it undefined when `pct` fell
# outside the range of stored predictions.
p = _calibrated_probability(df_q, pct)

# Fraction of stored predictions (all but the last row) at least as far from
# 0.5 as the current one.
calib_scores = np.abs(df_results['Predicted'].iloc[:-1] - 0.5)
score = abs(pct - 0.5)
pv = np.mean(calib_scores >= score)
asof = datetime.datetime.combine(data.index[-1], time(9,30)) + (periods_30m * timedelta(minutes=30))

# NOTE(review): 'Datetime' comes from `res` (built after dropna) while 'IsTrue'
# comes from the last row of `df_final`; confirm both refer to the same row.
blob = {
    'Datetime': str(res.index[-1]),
    'IsTrue':df_final['Target_clf'].iloc[-1],
    'Predicted': pct,
    'CalibPredicted': p,
    'Pvalue':pv,
    'ModelNum':periods_30m,
    'AsOf':str(asof)
}

# Write to DB
df_write = pd.DataFrame.from_dict({k:[v] for k, v in blob.items()})
\n", " | Datetime | \n", "IsTrue | \n", "Predicted | \n", "CalibPredicted | \n", "Pvalue | \n", "ModelNum | \n", "AsOf | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "2023-11-22 00:00:00 | \n", "True | \n", "0.712132 | \n", "0.832636 | \n", "0.404288 | \n", "1 | \n", "2023-11-24 10:00:00 | \n", "