wnstnb committed on
Commit
2310a6b
·
1 Parent(s): a94f457

univariate reg model

Browse files
Files changed (5) hide show
  1. getDailyData.py +4 -1
  2. lambda_function.py +26 -4
  3. model_intra_v2.py +0 -1
  4. model_regr_v2.py +91 -0
  5. regrCols.py +3 -0
getDailyData.py CHANGED
@@ -135,6 +135,8 @@ def get_daily(mode='daily', periods_30m=None):
135
 
136
  # Target for clf -- whether tomorrow will close above or below today's close
137
  data['Target_clf'] = data['Close'] > data['PrevClose']
 
 
138
  data['Target_clf'] = data['Target_clf'].shift(-1)
139
  data['DayOfWeek'] = pd.to_datetime(data.index)
140
  data['Quarter'] = data['DayOfWeek'].dt.quarter
@@ -297,8 +299,9 @@ def get_daily(mode='daily', periods_30m=None):
297
 
298
  elif mode=='intra':
299
  from intraCols import model_cols
 
300
 
301
- df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
302
  df_final = df_final.dropna(subset=['Target','Target_clf'])
303
  # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
304
  return data, df_final, final_row
 
135
 
136
  # Target for clf -- whether tomorrow will close above or below today's close
137
  data['Target_clf'] = data['Close'] > data['PrevClose']
138
+ data['ClosePct'] = (data['Close'] / data['PrevClose']) - 1
139
+ data['ClosePct'] = data['ClosePct'].shift(-1)
140
  data['Target_clf'] = data['Target_clf'].shift(-1)
141
  data['DayOfWeek'] = pd.to_datetime(data.index)
142
  data['Quarter'] = data['DayOfWeek'].dt.quarter
 
299
 
300
  elif mode=='intra':
301
  from intraCols import model_cols
302
+ from regrCols import model_cols as regr_cols
303
 
304
+ df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf', 'ClosePct']]
305
  df_final = df_final.dropna(subset=['Target','Target_clf'])
306
  # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
307
  return data, df_final, final_row
lambda_function.py CHANGED
@@ -4,6 +4,8 @@
4
  from getDailyData import get_daily
5
  from model_intra_v3 import walk_forward_validation
6
  from model_day_v2 import walk_forward_validation_seq as walk_forward_validation_daily
 
 
7
  import pandas as pd
8
  import json
9
  from dbConn import connection, engine, insert_dataframe_to_sql
@@ -64,13 +66,19 @@ def is_refresh_time():
64
  def lambda_handler(periods_30m):
65
  if periods_30m > 0:
66
  data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
 
67
  res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
 
 
 
 
68
 
69
  elif periods_30m == 0:
70
  data, df_final, final_row = get_daily()
71
  res, _, _ = walk_forward_validation_daily(df_final.dropna(), 'Target_clf', 'Target', 200, 1)
72
-
73
  # Get results, run calibration and pvalue
 
74
  df_results = pd.read_sql_query(f'select * from results where ModelNum = {str(periods_30m)}', con = engine)
75
 
76
  # Calibrate Probabilities
@@ -103,6 +111,20 @@ def lambda_handler(periods_30m):
103
  df_write = pd.DataFrame.from_dict({k:[v] for k, v in blob.items()})
104
  cursor = connection.cursor()
105
  insert_dataframe_to_sql('results', df_write, cursor)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # cursor.close()
107
  # connection.close()
108
 
@@ -110,15 +132,15 @@ def lambda_handler(periods_30m):
110
 
111
  if __name__ == '__main__':
112
  # Code that, based on the time of the day, return which data/model to run
113
- game_time = False # is_trading_day_and_time()
114
- refresh_time = True # is_refresh_time()
115
  if game_time:
116
  now = datetime.datetime.now()
117
  # Change this for debugging -- should be EST
118
  morning_start = datetime.datetime.combine(now.date(), time(9, 30))
119
  delta = now - morning_start
120
  print(delta)
121
- intervals = 7 # max(0,min((delta.total_seconds() / 60 / 30) // 1, 12))
122
  print(f'running for {str(intervals)}')
123
  j = lambda_handler(intervals)
124
  elif refresh_time:
 
4
  from getDailyData import get_daily
5
  from model_intra_v3 import walk_forward_validation
6
  from model_day_v2 import walk_forward_validation_seq as walk_forward_validation_daily
7
+ from model_regr_v2 import walk_forward_validation as walk_forward_validation_regr
8
+ from model_regr_v2 import calc_upper_lower
9
  import pandas as pd
10
  import json
11
  from dbConn import connection, engine, insert_dataframe_to_sql
 
66
  def lambda_handler(periods_30m):
67
  if periods_30m > 0:
68
  data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
69
+ # Regression model
70
  res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
71
+ regr_res, _ = walk_forward_validation_regr(df_final[['CurrentClose30toClose','ClosePct']].dropna(), 'ClosePct', 1, mode='single')
72
+ df_regr_results = pd.read_sql_query(f'select * from reg_results where ModelNum = {str(periods_30m)}', con = engine)
73
+ regr_pct = regr_res['Predicted'].iloc[-1]
74
+ upper, lower = calc_upper_lower(regr_pct, df_regr_results, alpha=0.05)
75
 
76
  elif periods_30m == 0:
77
  data, df_final, final_row = get_daily()
78
  res, _, _ = walk_forward_validation_daily(df_final.dropna(), 'Target_clf', 'Target', 200, 1)
79
+
80
  # Get results, run calibration and pvalue
81
+
82
  df_results = pd.read_sql_query(f'select * from results where ModelNum = {str(periods_30m)}', con = engine)
83
 
84
  # Calibrate Probabilities
 
111
  df_write = pd.DataFrame.from_dict({k:[v] for k, v in blob.items()})
112
  cursor = connection.cursor()
113
  insert_dataframe_to_sql('results', df_write, cursor)
114
+
115
+ if periods_30m > 0:
116
+ regr_blob = {
117
+ 'Datetime': str(res.index[-1]),
118
+ 'IsTrue':df_final['ClosePct'].iloc[-1],
119
+ 'Predicted': regr_pct,
120
+ 'Upper': upper,
121
+ 'Lower':lower,
122
+ 'ModelNum':periods_30m,
123
+ 'AsOf':str(asof)
124
+ }
125
+ df_write_reg = pd.DataFrame.from_dict({k:[v] for k, v in regr_blob.items()})
126
+ insert_dataframe_to_sql('reg_results', df_write_reg, cursor)
127
+
128
  # cursor.close()
129
  # connection.close()
130
 
 
132
 
133
  if __name__ == '__main__':
134
  # Code that, based on the time of the day, return which data/model to run
135
+ game_time = is_trading_day_and_time()
136
+ refresh_time = is_refresh_time()
137
  if game_time:
138
  now = datetime.datetime.now()
139
  # Change this for debugging -- should be EST
140
  morning_start = datetime.datetime.combine(now.date(), time(9, 30))
141
  delta = now - morning_start
142
  print(delta)
143
+ intervals = max(0,min((delta.total_seconds() / 60 / 30) // 1, 12))
144
  print(f'running for {str(intervals)}')
145
  j = lambda_handler(intervals)
146
  elif refresh_time:
model_intra_v2.py CHANGED
@@ -9,7 +9,6 @@ def walk_forward_validation(df, target_column, num_periods, mode='full'):
9
 
10
  df = df[model_cols + [target_column]]
11
  df[target_column] = df[target_column].astype(bool)
12
-
13
  tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
14
 
15
  if mode == 'full':
 
9
 
10
  df = df[model_cols + [target_column]]
11
  df[target_column] = df[target_column].astype(bool)
 
12
  tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods) # num_splits is the number of splits you want
13
 
14
  if mode == 'full':
model_regr_v2.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ from sklearn.linear_model import LinearRegression
5
+ from sklearn.model_selection import TimeSeriesSplit
6
+ from regrCols import model_cols
7
+
8
def walk_forward_validation(df, target_column, num_periods, mode='full'):
    """Fit a linear regression in walk-forward fashion and return its predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``model_cols`` plus ``target_column``.
    target_column : str
        Name of the column to predict; it is cast to float before fitting.
    num_periods : int
        Size of the test window of each split (only used when ``mode='full'``).
    mode : {'full', 'single'}
        ``'full'``  -- iterate over ``TimeSeriesSplit`` folds, refitting the
        model on each training window, then attach ``Upper``/``Lower`` columns
        built from the errors of the rows that precede each prediction.
        ``'single'`` -- fit on every row except the last and predict the last.

    Returns
    -------
    tuple
        ``(df_results, model)`` where ``df_results`` has the columns ``IsTrue``
        and ``Predicted`` (plus ``Upper`` and ``Lower`` in ``'full'`` mode) and
        ``model`` is the most recently fitted ``LinearRegression``.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'full'`` nor ``'single'``.
    """
    if mode not in ('full', 'single'):
        # Previously an unknown mode fell through to an UnboundLocalError.
        raise ValueError(f"mode must be 'full' or 'single', got {mode!r}")

    # Work on a copy so the float cast is not written into a slice of the
    # caller's frame (avoids pandas' chained-assignment warning).
    df = df[model_cols + [target_column]].copy()
    df[target_column] = df[target_column].astype(float)

    features = df.drop(target_column, axis=1)
    target = df[target_column]

    if mode == 'full':
        # The splitter is only needed here; building it unconditionally made
        # 'single' mode fail on very short frames.
        # n_splits is the number of train/test splits to generate.
        tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods)

        overall_results = []
        for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
            # Training and testing data for the current split
            X_train = features.iloc[train_index]
            y_train = target.iloc[train_index]
            X_test = features.iloc[test_index]
            y_test = target.iloc[test_index]

            model = LinearRegression()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)

            # True and predicted values for this split
            result_df = pd.DataFrame({'IsTrue': y_test, 'Predicted': predictions}, index=y_test.index)
            overall_results.append(result_df)

        df_results = pd.concat(overall_results)

        alpha = 0.05
        errors = df_results['IsTrue'] - df_results['Predicted']
        uppers = []
        lowers = []
        for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
            # Only errors observed strictly before row i may inform its bounds.
            past_errors = errors.iloc[:i]
            positive_errors = past_errors[past_errors >= 0]
            negative_errors = past_errors[past_errors < 0]

            if positive_errors.empty or negative_errors.empty:
                # No quantile can be taken from an empty sample; leave the
                # bounds unset, as the previous catch-all handler did.
                uppers.append(None)
                lowers.append(None)
                continue

            # Centre the interval on THIS row's prediction (the earlier code
            # used the final prediction for every row).
            uppers.append(pct + np.quantile(positive_errors, 1 - alpha))
            lowers.append(pct + np.quantile(negative_errors, alpha))

        df_results['Upper'] = uppers
        df_results['Lower'] = lowers

    else:  # mode == 'single'
        X_train = features.iloc[:-1]
        y_train = target.iloc[:-1]
        # Double brackets keep a one-row DataFrame, so the model sees the same
        # feature names at predict time as it did at fit time.
        X_test = features.iloc[[-1]]
        y_test = target.iloc[-1]

        model = LinearRegression()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        df_results = pd.DataFrame({'IsTrue': y_test, 'Predicted': predictions}, index=[df.index[-1]])

    return df_results, model
76
+
77
def calc_upper_lower(pred, df_hist, alpha=0.05):
    """Derive an empirical interval around ``pred`` from historical errors.

    Parameters
    ----------
    pred : float
        The point prediction the interval is centred on.
    df_hist : pandas.DataFrame
        Past results with the columns ``IsTrue`` and ``Predicted``.
    alpha : float, default 0.05
        Tail level: the upper bound uses the ``1 - alpha`` quantile of the
        non-negative errors, the lower bound the ``alpha`` quantile of the
        negative errors.

    Returns
    -------
    tuple
        ``(upper_bound, lower_bound)``. Both are ``None`` when the history
        lacks either non-negative or negative errors (for instance when it is
        empty), because no quantile can be taken from an empty sample.
    """
    errors = df_hist['IsTrue'] - df_hist['Predicted']
    # NaN errors fail both comparisons and therefore drop out of both groups.
    positive_errors = errors[errors >= 0]
    negative_errors = errors[errors < 0]

    if positive_errors.empty or negative_errors.empty:
        # Depending on the NumPy version, np.quantile on an empty sample
        # either raises or yields NaN; report "no bounds" explicitly instead.
        return None, None

    # Calculate bounds
    upper_bound = pred + np.quantile(positive_errors, 1 - alpha)
    lower_bound = pred + np.quantile(negative_errors, alpha)

    return upper_bound, lower_bound
87
+
88
+
89
def seq_predict_proba(df, trained_clf_model):
    """Return the last-column probability produced by a fitted classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``model_cols``.
    trained_clf_model : object
        Fitted estimator exposing ``predict_proba``.
        NOTE(review): the ``LinearRegression`` fitted in this module has no
        ``predict_proba``; this helper presumably targets a classifier --
        confirm against callers.

    Returns
    -------
    The final column of the ``predict_proba`` output, one value per row.
    """
    feature_frame = df[model_cols]
    class_probabilities = trained_clf_model.predict_proba(feature_frame)
    return class_probabilities[:, -1]
regrCols.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Feature columns selected as regression inputs (a single predictor).
model_cols = ['CurrentClose30toClose']