univariate reg model
- getDailyData.py +4 -1
- lambda_function.py +26 -4
- model_intra_v2.py +0 -1
- model_regr_v2.py +91 -0
- regrCols.py +3 -0

getDailyData.py
CHANGED

@@ -135,6 +135,8 @@ def get_daily(mode='daily', periods_30m=None):
 
     # Target for clf -- whether tomorrow will close above or below today's close
     data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['ClosePct'] = (data['Close'] / data['PrevClose']) - 1
+    data['ClosePct'] = data['ClosePct'].shift(-1)
     data['Target_clf'] = data['Target_clf'].shift(-1)
     data['DayOfWeek'] = pd.to_datetime(data.index)
     data['Quarter'] = data['DayOfWeek'].dt.quarter

@@ -297,8 +299,9 @@ def get_daily(mode='daily', periods_30m=None):
 
     elif mode=='intra':
         from intraCols import model_cols
+        from regrCols import model_cols as regr_cols
 
-        df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+        df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf', 'ClosePct']]
         df_final = df_final.dropna(subset=['Target','Target_clf'])
         # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
         return data, df_final, final_row
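
The two added lines define the regression target: ClosePct is the close-over-close percent change, and the shift(-1) moves tomorrow's value onto today's row so it lines up with today's features. A minimal sketch of the same transform on made-up data (PrevClose is derived here for the sketch; in get_daily it already exists):

import pandas as pd

# Hypothetical closes for three sessions
data = pd.DataFrame({'Close': [100.0, 102.0, 101.0]},
                    index=pd.date_range('2023-11-01', periods=3))
data['PrevClose'] = data['Close'].shift(1)

# Same construction as in get_daily: next-day return aligned to today's row
data['ClosePct'] = (data['Close'] / data['PrevClose']) - 1
data['ClosePct'] = data['ClosePct'].shift(-1)

print(data)
# The 2023-11-01 row now carries 0.02 (the 11-02 move); the final row's target
# is NaN, which is the row the 'single' walk-forward mode predicts.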

lambda_function.py
CHANGED

@@ -4,6 +4,8 @@
 from getDailyData import get_daily
 from model_intra_v3 import walk_forward_validation
 from model_day_v2 import walk_forward_validation_seq as walk_forward_validation_daily
+from model_regr_v2 import walk_forward_validation as walk_forward_validation_regr
+from model_regr_v2 import calc_upper_lower
 import pandas as pd
 import json
 from dbConn import connection, engine, insert_dataframe_to_sql

@@ -64,13 +66,19 @@ def is_refresh_time():
 def lambda_handler(periods_30m):
     if periods_30m > 0:
         data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
+        # Regression model
         res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
+        regr_res, _ = walk_forward_validation_regr(df_final[['CurrentClose30toClose','ClosePct']].dropna(), 'ClosePct', 1, mode='single')
+        df_regr_results = pd.read_sql_query(f'select * from reg_results where ModelNum = {str(periods_30m)}', con = engine)
+        regr_pct = regr_res['Predicted'].iloc[-1]
+        upper, lower = calc_upper_lower(regr_pct, df_regr_results, alpha=0.05)
 
     elif periods_30m == 0:
         data, df_final, final_row = get_daily()
         res, _, _ = walk_forward_validation_daily(df_final.dropna(), 'Target_clf', 'Target', 200, 1)
-
+
     # Get results, run calibration and pvalue
+
     df_results = pd.read_sql_query(f'select * from results where ModelNum = {str(periods_30m)}', con = engine)
 
     # Calibrate Probabilities

@@ -103,6 +111,20 @@ def lambda_handler(periods_30m):
     df_write = pd.DataFrame.from_dict({k:[v] for k, v in blob.items()})
     cursor = connection.cursor()
     insert_dataframe_to_sql('results', df_write, cursor)
+
+    if periods_30m > 0:
+        regr_blob = {
+            'Datetime': str(res.index[-1]),
+            'IsTrue':df_final['ClosePct'].iloc[-1],
+            'Predicted': regr_pct,
+            'Upper': upper,
+            'Lower':lower,
+            'ModelNum':periods_30m,
+            'AsOf':str(asof)
+        }
+        df_write_reg = pd.DataFrame.from_dict({k:[v] for k, v in regr_blob.items()})
+        insert_dataframe_to_sql('reg_results', df_write_reg, cursor)
+
     # cursor.close()
     # connection.close()
 

@@ -110,15 +132,15 @@ def lambda_handler(periods_30m):
 
 if __name__ == '__main__':
     # Code that, based on the time of the day, return which data/model to run
-    game_time =
-    refresh_time =
+    game_time = is_trading_day_and_time()
+    refresh_time = is_refresh_time()
     if game_time:
         now = datetime.datetime.now()
         # Change this for debugging -- should be EST
         morning_start = datetime.datetime.combine(now.date(), time(9, 30))
         delta = now - morning_start
         print(delta)
-        intervals =
+        intervals = max(0,min((delta.total_seconds() / 60 / 30) // 1, 12))
         print(f'running for {str(intervals)}')
         j = lambda_handler(intervals)
     elif refresh_time:
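
The new intervals expression converts the time elapsed since the 9:30 open into a count of completed 30-minute bars, floored and clamped to the 0-12 range the intraday models cover. A quick check of the arithmetic with an illustrative timestamp (the date and time here are made up):

import datetime
from datetime import time

now = datetime.datetime.combine(datetime.date(2023, 11, 1), time(11, 10))
morning_start = datetime.datetime.combine(now.date(), time(9, 30))
delta = now - morning_start

# 100 minutes elapsed -> 3 completed 30-minute bars; flooring and clamping keep
# the value between 0 and 12 before it is passed to lambda_handler.
intervals = max(0, min((delta.total_seconds() / 60 / 30) // 1, 12))
print(intervals)  # 3.0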

model_intra_v2.py
CHANGED

@@ -9,7 +9,6 @@ def walk_forward_validation(df, target_column, num_periods, mode='full'):
 
     df = df[model_cols + [target_column]]
     df[target_column] = df[target_column].astype(bool)
-
     tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
 
     if mode == 'full':

model_regr_v2.py
ADDED

@@ -0,0 +1,91 @@
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import TimeSeriesSplit
+from regrCols import model_cols
+
+def walk_forward_validation(df, target_column, num_periods, mode='full'):
+
+    df = df[model_cols + [target_column]]
+    df[target_column] = df[target_column].astype(float)
+
+    tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
+
+    if mode == 'full':
+        overall_results = []
+        # Iterate over the rows in the DataFrame, one step at a time
+        # Split the time series data using TimeSeriesSplit
+        for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
+            # Extract the training and testing data for the current split
+            X_train = df.drop(target_column, axis=1).iloc[train_index]
+            y_train = df[target_column].iloc[train_index]
+            X_test = df.drop(target_column, axis=1).iloc[test_index]
+            y_test = df[target_column].iloc[test_index]
+
+            y_train = y_train.astype(float)
+            model = LinearRegression()
+            model.fit(X_train, y_train)
+            # Make a prediction on the test data
+            predictions = model.predict(X_test)
+
+            # Create a DataFrame to store the true and predicted values
+            result_df = pd.DataFrame({'IsTrue': y_test, 'Predicted': predictions}, index=y_test.index)
+            overall_results.append(result_df)
+
+        df_results = pd.concat(overall_results)
+
+        uppers = []
+        lowers = []
+        alpha = 0.05
+        for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
+            try:
+
+                df_q = df_results.iloc[:i]
+                pred = df_results['Predicted'].iloc[-1]
+                errors = df_q['IsTrue'] - df_q['Predicted']
+                positive_errors = errors[errors >= 0]
+                negative_errors = errors[errors < 0]
+
+                # Calculate bounds
+                upper_bound = pred + np.quantile(positive_errors, 1 - alpha)
+                lower_bound = pred + np.quantile(negative_errors, alpha)
+
+            except:
+                upper_bound = None
+                lower_bound = None
+
+            uppers.append(upper_bound)
+            lowers.append(lower_bound)
+
+        df_results['Upper'] = uppers
+        df_results['Lower'] = lowers
+
+    elif mode == 'single':
+        X_train = df.drop(target_column, axis=1).iloc[:-1]
+        y_train = df[target_column].iloc[:-1]
+        X_test = df.drop(target_column, axis=1).iloc[-1]
+        y_test = df[target_column].iloc[-1]
+        y_train = y_train.astype(float)
+        model = LinearRegression()
+        model.fit(X_train, y_train)
+        predictions = model.predict(X_test.values.reshape(1, -1))
+        df_results = pd.DataFrame({'IsTrue': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+
+    return df_results, model
+
+def calc_upper_lower(pred, df_hist, alpha=0.05):
+    errors = df_hist['IsTrue'] - df_hist['Predicted']
+    positive_errors = errors[errors >= 0]
+    negative_errors = errors[errors < 0]
+
+    # Calculate bounds
+    upper_bound = pred + np.quantile(positive_errors, 1 - alpha)
+    lower_bound = pred + np.quantile(negative_errors, alpha)
+
+    return upper_bound, lower_bound
+
+
+def seq_predict_proba(df, trained_clf_model):
+    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
+    return clf_pred_proba
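
calc_upper_lower builds an empirical prediction interval: it splits the historical residuals (IsTrue - Predicted) into positive and negative sides and offsets the new point prediction by the 1 - alpha quantile of each side. A self-contained sketch with made-up history (the df_hist columns match what the walk-forward loop writes to reg_results):

import numpy as np
import pandas as pd

# Hypothetical history of realized vs. predicted close-to-close returns
df_hist = pd.DataFrame({
    'IsTrue':    [0.004, -0.010, 0.012, -0.003, 0.007],
    'Predicted': [0.001, -0.004, 0.009,  0.002, 0.001],
})

pred = 0.0025   # today's point prediction from the linear model
alpha = 0.05    # 95% of each error tail sits inside the band

# Same steps as calc_upper_lower
errors = df_hist['IsTrue'] - df_hist['Predicted']
upper = pred + np.quantile(errors[errors >= 0], 1 - alpha)
lower = pred + np.quantile(errors[errors < 0], alpha)
print(lower, pred, upper)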

regrCols.py
ADDED

@@ -0,0 +1,3 @@
+model_cols = [
+    'CurrentClose30toClose'
+]
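
regrCols.py keeps the regression's feature list to a single column, so the intraday call in lambda_handler amounts to a univariate regression of next-day ClosePct on CurrentClose30toClose. A rough sketch of how the pieces fit together outside the Lambda (the random DataFrame stands in for the df_final returned by get_daily, and the history frame is faked instead of read from the reg_results table):

import numpy as np
import pandas as pd
from model_regr_v2 import walk_forward_validation, calc_upper_lower

# Stand-in for get_daily(mode='intra', ...): the single feature plus the ClosePct target
df_final = pd.DataFrame({
    'CurrentClose30toClose': np.random.normal(0, 0.005, 50),
    'ClosePct':              np.random.normal(0, 0.01, 50),
})

# 'single' mode fits on all but the last row and predicts the last row
regr_res, model = walk_forward_validation(df_final, 'ClosePct', 1, mode='single')
regr_pct = regr_res['Predicted'].iloc[-1]

# Fake history standing in for the reg_results table
df_regr_results = pd.DataFrame({
    'IsTrue':    np.random.normal(0, 0.01, 200),
    'Predicted': np.random.normal(0, 0.01, 200),
})
upper, lower = calc_upper_lower(regr_pct, df_regr_results, alpha=0.05)
print(f'{lower:.4f} <= {regr_pct:.4f} <= {upper:.4f}')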