gamedayspx-monitor / uni_model.py
wnstnb's picture
updating charts
4b357c0
import numpy as np
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression # Example model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
import datetime
from datetime import time, timedelta
from tqdm import tqdm
def prep_data(df):
    """Walk-forward fit of daily close return on intraday return, per candle.

    For every candle count in ``candles`` the function:

    1. pulls data through ``get_daily(mode='intra', periods_30m=candle)``,
    2. derives return / gap / percentile / lag features from it,
    3. fits a ``LinearRegression`` on ``ClosePctIntra`` -> ``ClosePct`` with
       an expanding one-step-ahead ``TimeSeriesSplit``,
    4. derives upper/lower bounds for each prediction from the quantiles of
       the errors made on earlier rows,
    5. writes the outcome into ``results``, ``coefs`` and ``df_consolidated``.

    Args:
        df: Not read anywhere in the body; kept so existing callers still work.

    Returns:
        None. All output is written to the external ``results``, ``coefs``
        and ``df_consolidated`` objects.

    NOTE(review): ``now``, ``get_daily``, ``results``, ``coefs`` and
    ``df_consolidated`` are not defined in this block - presumably they are
    module-level names defined elsewhere; confirm before relying on this.
    """
    # Elapsed time since 06:30 on the date of `now`; only printed, never used.
    morning_start = datetime.datetime.combine(now.date(), time(6, 30))
    delta = now - morning_start
    print(delta)

    # Only candle 1 is evaluated; the wider np.arange(1, 13) range that the
    # original kept in a comment is left disabled.
    candles = np.arange(1, 2)
    alpha = 0.05

    for candle in tqdm(candles):
        print(f'running for {str(candle)}')
        # get_daily returns three values; only the first is used here.
        data, _, _ = get_daily(mode='intra', periods_30m=candle)

        df_final = _build_features(data)
        X = df_final[['ClosePctIntra']]  # Feature dataset
        y = df_final['ClosePct']  # Target dataset

        df_results, pipeline = _walk_forward_predict(X, y)

        uppers, lowers = _calibrate_bounds(df_results, alpha)
        df_results['Upper'] = uppers
        df_results['Lower'] = lowers

        # Convert the percentage columns back to price levels.
        # NOTE(review): this requires `data` to already carry a 'PrevClose'
        # column (the one computed in _build_features lives on a copy).
        df_results = df_results.merge(
            data[['PrevClose']], left_index=True, right_index=True
        )
        df_results['Pred'] = df_results['PrevClose'] * (1 + df_results['Predicted'])
        df_results['Actual'] = df_results['PrevClose'] * (1 + df_results['IsTrue'])
        df_results['Up'] = df_results['PrevClose'] * (1 + df_results['Upper'])
        df_results['Down'] = df_results['PrevClose'] * (1 + df_results['Lower'])

        key = str(int(candle))
        results[key] = df_results

        # MAE over all out-of-sample predictions that survived the merge.
        average_mae = mean_absolute_error(
            df_results['IsTrue'], df_results['Predicted']
        )

        # Coefficients of the pipeline fitted on the last (largest) fold,
        # ordered by absolute magnitude.
        sorted_features = sorted(
            zip(pipeline.feature_names_in_, pipeline.named_steps.model.coef_),
            key=lambda x: abs(x[1]),
            reverse=True,
        )
        coefs[key] = pd.DataFrame(sorted_features, columns=['Feature', 'Coefficient'])
        df_consolidated.loc[int(candle), 'MAE'] = average_mae


def _share_of_history_above(series):
    """For each position, return the share of earlier values greater than it.

    The first position has no history and yields ``None``.
    """
    shares = []
    for i, current in tqdm(enumerate(series), total=len(series)):
        historical = series.iloc[:i]
        if len(historical) == 0:
            # No earlier observations to rank against.
            shares.append(None)
            continue
        shares.append(len(historical[historical > current]) / len(historical))
    return shares


def _build_features(data):
    """Derive the model feature frame from ``data`` and drop incomplete rows."""
    df_new = data[['Open', 'High', 'Low', 'Close', 'Close30', 'Close_VIX30',
                   'Close_VIX', 'Close_VVIX30', 'Close_VVIX']].copy()
    df_new['PrevClose'] = df_new['Close'].shift(1)
    df_new['CurrentGap'] = (df_new['Open'] / df_new['PrevClose']) - 1
    df_new['ClosePctIntra'] = (df_new['Close30'] / df_new['Close'].shift(1)) - 1
    df_new['ClosePctOpenIntra'] = (df_new['Close30'] / df_new['Open']) - 1
    df_new['ClosePctVIXIntra'] = (df_new['Close_VIX30'] / df_new['Close_VIX'].shift(1)) - 1
    df_new['ClosePctVVIXIntra'] = (df_new['Close_VVIX30'] / df_new['Close_VVIX'].shift(1)) - 1

    # ewm(8) passes 8 as `com` (centre of mass), not as a span. The result
    # is shifted by one row so each row only sees the previous rows' closes.
    df_new['EMA8'] = df_new['Close'].ewm(8).mean().shift(1)
    df_new['EMA8Intra'] = df_new['Close30'] > df_new['EMA8']

    # Target will be the day's close
    df_new['ClosePct'] = (df_new['Close'] / df_new['Close'].shift(1)) - 1

    # Share of earlier rows that beat the current intraday / daily return.
    df_new['IntraPercentile'] = _share_of_history_above(df_new['ClosePctIntra'])
    df_new['ClosePctPercentile'] = _share_of_history_above(df_new['ClosePct'])

    # Lagged daily returns, n-1 through n-5.
    for lag in np.arange(1, 6):
        df_new[f'ClosePct_n{str(lag)}'] = df_new['ClosePct'].shift(lag)

    # Although only 'ClosePctIntra' is used as a model input, every selected
    # column takes part in dropna() and therefore decides which rows remain.
    df_feats = df_new[[c for c in df_new.columns
                       if 'ClosePct' in c or 'Intra' in c or 'Gap' in c]]
    return df_feats.dropna()


def _build_pipeline(X_train):
    """Create an unfitted scaler/encoder + LinearRegression pipeline."""
    categorical_features = X_train.select_dtypes(include='object').columns
    # Percentile columns are excluded from the scaled numeric features.
    numeric_features = (
        X_train.drop(columns=[c for c in X_train.columns if 'Percentile' in c])
        .select_dtypes(include='number')
        .columns
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric', RobustScaler(), numeric_features),
            ('categorical', OneHotEncoder(), categorical_features),
        ])
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])


def _walk_forward_predict(X, y):
    """Fit on an expanding window and predict one row ahead for each split.

    Returns the concatenated out-of-sample frame (columns ``IsTrue`` and
    ``Predicted``) and the pipeline fitted on the final split.
    """
    tscv = TimeSeriesSplit(n_splits=len(X) - 1, max_train_size=None, test_size=1)
    overall_results = []
    pipeline = None
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh pipeline per fold so no fitted state is carried over.
        pipeline = _build_pipeline(X_train)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        overall_results.append(
            pd.DataFrame({'IsTrue': y_test, 'Predicted': y_pred}, index=y_test.index)
        )
    return pd.concat(overall_results), pipeline


def _calibrate_bounds(df_results, alpha):
    """Return per-row upper/lower bounds built from earlier prediction errors.

    For row ``i`` the bounds are that row's own prediction plus the
    ``1 - alpha`` quantile of earlier non-negative errors (upper) and the
    ``alpha`` quantile of earlier negative errors (lower). Rows without usable
    error history on either side get ``None`` for both bounds.
    """
    uppers = []
    lowers = []
    for i, pred in tqdm(enumerate(df_results['Predicted']),
                        desc='Calibrating Probas', total=len(df_results)):
        # Only rows strictly before i contribute, so there is no look-ahead.
        df_q = df_results.iloc[:i]
        errors = df_q['IsTrue'] - df_q['Predicted']
        positive_errors = errors[errors >= 0]
        negative_errors = errors[errors < 0]
        try:
            # Bounds are centred on this row's prediction; the original read
            # the last row's prediction on every iteration.
            upper_bound = pred + np.quantile(positive_errors, 1 - alpha)
            lower_bound = pred + np.quantile(negative_errors, alpha)
        except (IndexError, ValueError):
            # np.quantile cannot handle an empty error sample.
            upper_bound = None
            lower_bound = None
        uppers.append(upper_bound)
        lowers.append(lower_bound)
    return uppers, lowers