wnstnb committed
Commit 2dff47b
0 Parent(s):

first commit

.gitignore ADDED
@@ -0,0 +1,4 @@
+ /.env
+ /.venv
+ /hss.pem
+ /__pycache__
ca-certificates.crt ADDED
The diff for this file is too large to render. See raw diff
 
dailyCols.py ADDED
@@ -0,0 +1,33 @@
+ model_cols = [
+     'BigNewsDay',
+     'Quarter',
+     'Perf5Day',
+     'Perf5Day_n1',
+     'DaysGreen',
+     'DaysRed',
+     'CurrentGap',
+     'RangePct',
+     'RangePct_n1',
+     'RangePct_n2',
+     'OHLC4_VIX',
+     'OHLC4_VIX_n1',
+     'OHLC4_VIX_n2',
+     'VIXOpen',
+     'VVIXOpen',
+     'OpenL1',
+     'OpenL2',
+     'OpenH1',
+     'OpenH2',
+     'L1TouchPct',
+     'L2TouchPct',
+     'H1TouchPct',
+     'H2TouchPct',
+     'L1BreakPct',
+     'L2BreakPct',
+     'H1BreakPct',
+     'H2BreakPct',
+     'H1BreakTouchPct',
+     'H2BreakTouchPct',
+     'L1BreakTouchPct',
+     'L2BreakTouchPct'
+ ]
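
These are the feature columns the daily model trains on; getDailyData.py imports them when mode='daily':

    from dailyCols import model_cols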
getDailyData.py ADDED
@@ -0,0 +1,320 @@
+ import pandas as pd
+ import numpy as np
+ import yfinance as yf
+ from tqdm import tqdm
+ import os
+ from pandas.tseries.offsets import BDay
+ from sqlalchemy import create_engine
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ data_start_date = '2018-07-01'
+
+ def get_daily(mode='daily', periods_30m=None):
+     '''
+     Get daily data and create daily features. Optionally append intraday data if specified.
+     `mode`: 'daily' or 'intra'.
+     `periods_30m`: how many 30m periods to bring in. Only specify if mode == 'intra'.
+     '''
+
+     vix = yf.Ticker('^VIX')
+     vvix = yf.Ticker('^VVIX')
+     spx = yf.Ticker('^GSPC')
+
+     # Grab data from the DB
+     engine = create_engine(
+         f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+         f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+         f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+     )
+
+     query = f'''SELECT
+         spx.Datetime AS Datetime,
+         spx.Open AS Open,
+         spx.High AS High,
+         spx.Low AS Low,
+         spx.Close AS Close,
+         vix.Open AS Open_VIX,
+         vix.High AS High_VIX,
+         vix.Low AS Low_VIX,
+         vix.Close AS Close_VIX,
+         vvix.Open AS Open_VVIX,
+         vvix.High AS High_VVIX,
+         vvix.Low AS Low_VVIX,
+         vvix.Close AS Close_VVIX
+     FROM
+         SPX_full_1day AS spx
+     LEFT JOIN
+         VIX_full_1day AS vix ON spx.Datetime = vix.Datetime AND vix.Datetime > '{data_start_date}'
+     LEFT JOIN
+         VVIX_full_1day AS vvix ON spx.Datetime = vvix.Datetime AND vvix.Datetime > '{data_start_date}'
+     WHERE
+         spx.Datetime > '{data_start_date}'
+     '''
+     data = pd.read_sql_query(sql=query, con=engine.connect())
+     data['Datetime'] = pd.to_datetime(data['Datetime'])
+     data = data.set_index('Datetime', drop=True)
+
+     # Get incremental date: the first business day after the last row in the DB
+     last_date = data.index.date[-1]
+     last_date = last_date + BDay(1)
+
+     prices_vix = vix.history(start=last_date, interval='1d')
+     prices_vvix = vvix.history(start=last_date, interval='1d')
+     prices_spx = spx.history(start=last_date, interval='1d')
+
+     if len(prices_spx) > 0:
+
+         # Normalize each yfinance index to a date-level DatetimeIndex
+         prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+         prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+         prices_spx.index = prices_spx['index']
+         prices_spx = prices_spx.drop(columns='index')
+         prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+         prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+         prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+         prices_vix.index = prices_vix['index']
+         prices_vix = prices_vix.drop(columns='index')
+         prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+         prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
+         prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
+         prices_vvix.index = prices_vvix['index']
+         prices_vvix = prices_vvix.drop(columns='index')
+         prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
+
+         data1 = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+         data1 = data1.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+         data = pd.concat([data, data1])
+
+     else:
+         data = data.copy()
+
+     if mode == 'intra':
+         from getIntraData import get_intra
+         df_intra = get_intra(periods_30m)
+         data = data.merge(df_intra, left_index=True, right_index=True)
+     else:
+         data = data.copy()
+
+     # Features
+     data['PrevClose'] = data['Close'].shift(1)
+     data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+     data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+     data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+     data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+     data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+     data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+     data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)  # shift added: the _n1 suffix denotes the prior day's value
+
+     data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
+     data['VVIX5Day_n1'] = data['VVIX5Day'].shift(1).astype(bool)  # shift added, as above
+
+     data['VIXOpen'] = data['Open_VIX'] > data['Close_VIX'].shift(1)
+     data['VVIXOpen'] = data['Open_VVIX'] > data['Close_VVIX'].shift(1)
+     data['VIXOpen'] = data['VIXOpen'].astype(bool)
+     data['VVIXOpen'] = data['VVIXOpen'].astype(bool)
+
+     data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
+     data['RangePct'] = data['Range'] / data['Close']
+     data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+     data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+     data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+     data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+     data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
+     data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+     data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+     data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)  # was shift(1); _n2 denotes a two-day lag, matching RangePct_n2 below
+     data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+     data['RangePct_n1'] = data['RangePct'].shift(1)
+     data['RangePct_n2'] = data['RangePct'].shift(2)
+     data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+     data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+     data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+     data['CurrentGapHist'] = data['CurrentGap'].copy()
+     data['CurrentGap'] = data['CurrentGap'].shift(-1)
+     data['DayOfWeek'] = pd.to_datetime(data.index)
+     data['DayOfWeek'] = data['DayOfWeek'].dt.day
+
+     # Target (regression) -- next day's OHLC4 return relative to the prior close
+     data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+     data['Target'] = data['Target'].shift(-1)
+     # data['Target'] = data['RangePct'].shift(-1)
+
+     # Target for clf -- whether tomorrow will close above or below today's close
+     data['Target_clf'] = data['Close'] > data['PrevClose']
+     data['Target_clf'] = data['Target_clf'].shift(-1)
+     data['DayOfWeek'] = pd.to_datetime(data.index)
+     data['Quarter'] = data['DayOfWeek'].dt.quarter
+     data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+     # Calculate up: prior day's open-to-high move as a % of the prior close
+     data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
+
+     # Calculate upSD
+     data['upSD'] = data['up'].rolling(30).std(ddof=0)
+
+     # Calculate aveUp
+     data['aveUp'] = data['up'].rolling(30).mean()
+     data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
+     data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
+     data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
+     data['downSD'] = data['down'].rolling(30).std(ddof=0)
+     data['aveDown'] = data['down'].rolling(30).mean()
+     data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
+     data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
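+     # A worked example of the level math above (illustrative numbers, not taken from the data):
+     # with a 30-day average open-to-high move aveUp = 0.50 (%), upSD = 0.30 (%), and Open = 4000,
+     #   H1 = 4000 + (0.50 / 100) * 4000 = 4020          (typical up-move from the open)
+     #   H2 = 4000 + ((0.50 + 0.30) / 100) * 4000 = 4032 (one standard deviation further)
+     # L1 and L2 mirror this on the downside using aveDown and downSD.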
+
+     data = data.assign(
+         L1Touch = lambda x: x['Low'] < x['L1'],
+         L2Touch = lambda x: x['Low'] < x['L2'],
+         H1Touch = lambda x: x['High'] > x['H1'],
+         H2Touch = lambda x: x['High'] > x['H2'],
+         L1Break = lambda x: x['Close'] < x['L1'],
+         L1TouchRed = lambda x: (x['Low'] < x['L1']) & (x['Close'] < x['PrevClose']),  # was x['L2']; L1TouchRed should test L1, mirroring H1TouchGreen
+         L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
+         L2Break = lambda x: x['Close'] < x['L2'],
+         H1Break = lambda x: x['Close'] > x['H1'],
+         H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
+         H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
+         H2Break = lambda x: x['Close'] > x['H2'],
+         OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
+         OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
+         OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
+         OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0)
+     )
+
+     data['OpenL1'] = data['OpenL1'].shift(-1)
+     data['OpenL2'] = data['OpenL2'].shift(-1)
+     data['OpenH1'] = data['OpenH1'].shift(-1)
+     data['OpenH2'] = data['OpenH2'].shift(-1)
+
+
+     level_cols = [
+         'L1Touch',
+         'L2Touch',
+         'H1Touch',
+         'H2Touch',
+         'L1Break',
+         'L2Break',
+         'H1Break',
+         'H2Break'
+     ]
+
+     for col in level_cols:
+         data[col+'Pct'] = data[col].rolling(100).mean()
+         # data[col+'Pct'] = data[col+'Pct'].shift(-1)
+
+     data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+     data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+     data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+     data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+     data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+     data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+
+     data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+     data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+
+     if mode == 'intra':
+         # Intraday features
+         data['CurrentOpen30'] = data['Open30'].shift(-1)
+         data['CurrentHigh30'] = data['High30'].shift(-1)
+         data['CurrentLow30'] = data['Low30'].shift(-1)
+         data['CurrentClose30'] = data['Close30'].shift(-1)
+         data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].max(axis=1)  # note: takes the max of the four 30m fields, despite the OHLC4-style name
+         data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
+         data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
+         data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
+         data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
+         data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
+
+         data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
+
+         # Next day's first-30m extremes and close relative to today's close
+         data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+         data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+         data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+         data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+         # Did the first 30m of the next session trade back to today's close (i.e. fill the gap)?
+         data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+         data['CloseL1'] = np.where(data['Close30'] < data['L1'], 1, 0)
+         data['CloseL2'] = np.where(data['Close30'] < data['L2'], 1, 0)
+         data['CloseH1'] = np.where(data['Close30'] > data['H1'], 1, 0)
+         data['CloseH2'] = np.where(data['Close30'] > data['H2'], 1, 0)
+         data['CloseL1'] = data['CloseL1'].shift(-1)
+         data['CloseL2'] = data['CloseL2'].shift(-1)
+         data['CloseH1'] = data['CloseH1'].shift(-1)
+         data['CloseH2'] = data['CloseH2'].shift(-1)
+
+         def get_quintiles(df, col_name, q):
+             return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
+
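+         # The loop below derives an empirical green-day probability: pd.qcut bins the
+         # historical HistClose30toPrevClose values into 10 equal-count buckets, each bucket
+         # keeps its mean GreenDay rate, and today's CurrentClose30toClose reading is mapped
+         # to the matching bucket's rate. E.g. (illustrative values only) if readings in
+         # (-0.002, 0.001] closed green 58% of the time, a 0.0005 reading gets p = 0.58.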
+         probas = []
+         # Given the current price level, look up the historical green rate for its decile
+         for i, pct in enumerate(data['CurrentClose30toClose']):
+             p = None  # reset each iteration so a failed lookup doesn't reuse a stale value
+             try:
+                 df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
+                 for q in df_q.index:
+                     if q.left <= pct <= q.right:
+                         p = df_q[q]
+             except Exception:
+                 p = None
+
+             probas.append(p)
+
+         data['GreenProbas'] = probas
+
+     engine = create_engine(
+         f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+         f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+         f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+     )
+
+     df_releases = pd.read_sql_query('select * from releases', con=engine)
+     df_releases = df_releases.set_index('Datetime')
+     data = data.merge(df_releases, how='left', left_index=True, right_index=True)
+
+     for n in tqdm(df_releases.columns, desc='Merging econ data'):
+         # Create a column that shifts the release indicator up by 1 (tomorrow's release, known today)
+         data[f'{n}_shift'] = data[n].shift(-1)
+         # Fill the rest with zeroes
+         data[n] = data[n].fillna(0)
+         data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+     data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+     def cumul_sum(col):
+         # Count consecutive 1s, resetting the streak on each 0
+         nums = []
+         s = 0
+         for x in col:
+             if x == 1:
+                 s += 1
+             elif x == 0:
+                 s = 0
+             nums.append(s)
+         return nums
+
+     consec_green = cumul_sum(data['GreenDay'].values)
+     consec_red = cumul_sum(data['RedDay'].values)
+
+     data['DaysGreen'] = consec_green
+     data['DaysRed'] = consec_red
+
+     final_row = data.index[-2]  # second-to-last row: the last row's targets are still unknown
+
+     if mode == 'daily':
+         from dailyCols import model_cols
+
+     elif mode == 'intra':
+         from intraCols import model_cols
+
+     df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+     df_final = df_final.dropna(subset=['Target','Target_clf'])
+     # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+     return data, df_final, final_row
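
A minimal usage sketch for get_daily (assumes the .env database variables referenced above are set and the MySQL tables exist; periods_30m=1 keeps just the first 30-minute bar of each session):

    from getDailyData import get_daily
    data, df_final, final_row = get_daily(mode='intra', periods_30m=1)
    # df_final: model features plus Target / Target_clf, ending at final_row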
getData.py ADDED
@@ -0,0 +1,466 @@
+ import pandas as pd
+ import pandas_datareader as pdr
+ import numpy as np
+ import yfinance as yf
+ import requests
+ from bs4 import BeautifulSoup
+ from typing import List
+ from tqdm import tqdm
+ import os
+ import datetime
+ from datasets import load_dataset  # added: load_dataset is called below but was not imported
+
+ data_start_date = '2018-07-01'  # added: referenced below but never defined in this file; value taken from getDailyData.py
+
+ model_cols = [
+     'BigNewsDay',
+     'Quarter',
+     'Perf5Day',
+     'Perf5Day_n1',
+     'DaysGreen',
+     'DaysRed',
+     'CurrentHigh30toClose',
+     'CurrentLow30toClose',
+     'CurrentClose30toClose',
+     'CurrentRange30',
+     'GapFill30',
+     'CurrentGap',
+     'RangePct',
+     'RangePct_n1',
+     'RangePct_n2',
+     'OHLC4_VIX',
+     'OHLC4_VIX_n1',
+     'OHLC4_VIX_n2',
+     'OHLC4_Current_Trend',
+     'OHLC4_Trend',
+     'CurrentVIXTrend',
+     'SPX30IntraPerf',
+     'VIX30IntraPerf',
+     'VVIX30IntraPerf',
+     # 'OpenL1',
+     # 'OpenL2',
+     # 'OpenH1',
+     # 'OpenH2',
+     'L1TouchPct',
+     'L2TouchPct',
+     'H1TouchPct',
+     'H2TouchPct',
+     'L1BreakPct',
+     'L2BreakPct',
+     'H1BreakPct',
+     'H2BreakPct',
+     'GreenProbas',
+     'H1BreakTouchPct',
+     'H2BreakTouchPct',
+     'L1BreakTouchPct',
+     'L2BreakTouchPct',
+     'H1BreakH2TouchPct',
+     'L1BreakL2TouchPct',
+     'H1TouchGreenPct',
+     'L1TouchRedPct'
+     # 'GapFillGreenProba'
+ ]
+
+ def get_data(periods_30m = 1):
+     # f = open('settings.json')
+     # j = json.load(f)
+     # API_KEY_FRED = j["API_KEY_FRED"]
+
+     API_KEY_FRED = os.getenv('API_KEY_FRED')
+
+     def parse_release_dates(release_id: str) -> List[str]:
+         release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+         r = requests.get(release_dates_url)
+         text = r.text
+         soup = BeautifulSoup(text, 'xml')
+         dates = []
+         for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
+             dates.append(release_date_tag.text)
+         return dates
+
+     econ_dfs = {}
+
+     econ_tickers = [
+         'WALCL',
+         'NFCI',
+         'WRESBAL'
+     ]
+
+     for et in tqdm(econ_tickers, desc='getting econ tickers'):
+         df = pdr.get_data_fred(et)
+         df.index = df.index.rename('ds')
+         econ_dfs[et] = df
+
+     release_ids = [
+         "10",  # "Consumer Price Index"
+         "46",  # "Producer Price Index"
+         "50",  # "Employment Situation"
+         "53",  # "Gross Domestic Product"
+         "103", # "Discount Rate Meeting Minutes"
+         "180", # "Unemployment Insurance Weekly Claims Report"
+         "194", # "ADP National Employment Report"
+         "323"  # "Trimmed Mean PCE Inflation Rate"
+     ]
+
+     release_names = [
+         "CPI",
+         "PPI",
+         "NFP",
+         "GDP",
+         "FOMC",
+         "UNEMP",
+         "ADP",
+         "PCE"
+     ]
+
+     releases = {}
+
+     for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
+         releases[rid] = {}
+         releases[rid]['dates'] = parse_release_dates(rid)
+         releases[rid]['name'] = n
+
+     # Create a DF that has all dates with the name of the col as 1.
+     # Once merged on the main dataframe, days with econ events will be 1 or None; fill NA with 0.
+     # This column serves as the true/false indicator of whether there was economic data released that day.
+     for rid in tqdm(release_ids, desc='Making indicators'):
+         releases[rid]['df'] = pd.DataFrame(
+             index=releases[rid]['dates'],
+             data={
+                 releases[rid]['name']: 1
+             })
+         releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
+
+     vix = yf.Ticker('^VIX')
+     vvix = yf.Ticker('^VVIX')
+     spx = yf.Ticker('^GSPC')
+
+     # Pull in 30m history from the Hugging Face dataset
+     data_files = {"spx": "SPX_full_30min.txt", "vix": "VIX_full_30min.txt", "vvix": 'VVIX_full_30min.txt'}
+     data = load_dataset("boomsss/spx_intra", data_files=data_files)
+     dfs = []
+     for ticker in data.keys():
+         rows = [d['text'] for d in data[ticker]]
+         rows = [x.split(',') for x in rows]
+
+         fr = pd.DataFrame(columns=[
+             'Datetime','Open','High','Low','Close'
+         ], data=rows)
+
+         fr['Datetime'] = pd.to_datetime(fr['Datetime'])
+         fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
+         fr = fr.set_index('Datetime')
+         fr['Open'] = pd.to_numeric(fr['Open'])
+         fr['High'] = pd.to_numeric(fr['High'])
+         fr['Low'] = pd.to_numeric(fr['Low'])
+         fr['Close'] = pd.to_numeric(fr['Close'])
+         dfs.append(fr)
+
+     df_30m = pd.concat(dfs, axis=1)
+
+     df_30m.columns = [
+         'Open30',
+         'High30',
+         'Low30',
+         'Close30',
+         'Open_VIX30',
+         'High_VIX30',
+         'Low_VIX30',
+         'Close_VIX30',
+         'Open_VVIX30',
+         'High_VVIX30',
+         'Low_VVIX30',
+         'Close_VVIX30'
+     ]
+
+     # Get incremental date
+     last_date = df_30m.index.date[-1]
+     last_date = last_date + datetime.timedelta(days=1)
+
+     # Get incremental data for each index
+     spx1 = yf.Ticker('^GSPC')
+     vix1 = yf.Ticker('^VIX')
+     vvix1 = yf.Ticker('^VVIX')
+     yfp = spx1.history(start=last_date, interval='30m')
+     yf_vix = vix1.history(start=last_date, interval='30m')
+     yf_vvix = vvix1.history(start=last_date, interval='30m')
+
+     if len(yfp) > 0:
+         # Convert indexes to New York time if not already
+         for _df in [yfp, yf_vix, yf_vvix]:
+             if _df.index.tz.zone != 'America/New_York':
+                 _df['Datetime'] = pd.to_datetime(_df.index)
+                 _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
+                 _df.set_index('Datetime', inplace=True)
+         # Concat them
+         df_inc = pd.concat([
+             yfp[['Open','High','Low','Close']],
+             yf_vix[['Open','High','Low','Close']],
+             yf_vvix[['Open','High','Low','Close']]
+         ], axis=1)
+         df_inc.columns = df_30m.columns
+         df_inc = df_inc.loc[
+             (df_inc.index.time >= datetime.time(9,30)) & (df_inc.index.time < datetime.time(16,00))
+         ]
+         df_30m = pd.concat([df_30m, df_inc])
+     else:
+         df_30m = df_30m.copy()
+
+     df_30m = df_30m.loc[
+         (df_30m.index.time >= datetime.time(9,30)) & (df_30m.index.time < datetime.time(16,00))
+     ]
+     df_30m['dt'] = df_30m.index.date
+     df_30m = df_30m.groupby('dt').head(periods_30m)
+     df_30m = df_30m.set_index('dt', drop=True)
+     df_30m.index.name = 'Datetime'
+
+     df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
+     df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
+     df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
+
+     opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
+     highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
+     lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
+     closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
+     spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
+     vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
+     vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
+
+     df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
+
+
+     prices_vix = vix.history(start=data_start_date, interval='1d')
+     prices_vvix = vvix.history(start=data_start_date, interval='1d')
+     prices_spx = spx.history(start=data_start_date, interval='1d')
+
+     prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+     prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+     prices_spx.index = prices_spx['index']
+     prices_spx = prices_spx.drop(columns='index')
+     prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+     prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+     prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+     prices_vix.index = prices_vix['index']
+     prices_vix = prices_vix.drop(columns='index')
+     prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+     prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
+     prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
+     prices_vvix.index = prices_vvix['index']
+     prices_vvix = prices_vvix.drop(columns='index')
+     prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
+
+     data = prices_spx.merge(df_intra, left_index=True, right_index=True)
+     data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+     data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+
+     # Features
+     data['PrevClose'] = data['Close'].shift(1)
+     data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+     data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+     data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+     data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+     data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+     data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+     data['VIX5Day_n1'] = data['VIX5Day'].shift(1).astype(bool)  # shift added: the _n1 suffix denotes the prior day's value
+
+     data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
+     data['VVIX5Day_n1'] = data['VVIX5Day'].shift(1).astype(bool)  # shift added, as above
+
+     data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
+     data['RangePct'] = data['Range'] / data['Close']
+     data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+     data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+     data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+     data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+     data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
+     data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+     data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+     data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(2)  # was shift(1); _n2 denotes a two-day lag, matching RangePct_n2 below
+     data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+     data['RangePct_n1'] = data['RangePct'].shift(1)
+     data['RangePct_n2'] = data['RangePct'].shift(2)
+     data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+     data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+     data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+     data['CurrentGapHist'] = data['CurrentGap'].copy()
+     data['CurrentGap'] = data['CurrentGap'].shift(-1)
+     data['DayOfWeek'] = pd.to_datetime(data.index)
+     data['DayOfWeek'] = data['DayOfWeek'].dt.day
+
+     # Intraday features
+     data['CurrentOpen30'] = data['Open30'].shift(-1)
+     data['CurrentHigh30'] = data['High30'].shift(-1)
+     data['CurrentLow30'] = data['Low30'].shift(-1)
+     data['CurrentClose30'] = data['Close30'].shift(-1)
+     data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].max(axis=1)  # note: takes the max of the four 30m fields, despite the OHLC4-style name
+     data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
+     data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
+     data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
+     data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
+     data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
+
+     data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
+
+     # Next day's first-30m extremes and close relative to today's close
+     data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+     data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+     data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+     data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+     # Did the first 30m of the next session trade back to today's close (i.e. fill the gap)?
+     data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+
+     # Target (regression) -- next day's OHLC4 return relative to the prior close
+     data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+     data['Target'] = data['Target'].shift(-1)
+     # data['Target'] = data['RangePct'].shift(-1)
+
+     # Target for clf -- whether tomorrow will close above or below today's close
+     data['Target_clf'] = data['Close'] > data['PrevClose']
+     data['Target_clf'] = data['Target_clf'].shift(-1)
+     data['DayOfWeek'] = pd.to_datetime(data.index)
+     data['Quarter'] = data['DayOfWeek'].dt.quarter
+     data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+     # Calculate up: prior day's open-to-high move as a % of the prior close
+     data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
+
+     # Calculate upSD
+     data['upSD'] = data['up'].rolling(30).std(ddof=0)
+
+     # Calculate aveUp
+     data['aveUp'] = data['up'].rolling(30).mean()
+     data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
+     data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
+     data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
+     data['downSD'] = data['down'].rolling(30).std(ddof=0)
+     data['aveDown'] = data['down'].rolling(30).mean()
+     data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
+     data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
+
+     data = data.assign(
+         L1Touch = lambda x: x['Low'] < x['L1'],
+         L2Touch = lambda x: x['Low'] < x['L2'],
+         H1Touch = lambda x: x['High'] > x['H1'],
+         H2Touch = lambda x: x['High'] > x['H2'],
+         L1Break = lambda x: x['Close'] < x['L1'],
+         L1TouchRed = lambda x: (x['Low'] < x['L1']) & (x['Close'] < x['PrevClose']),  # was x['L2']; L1TouchRed should test L1, mirroring H1TouchGreen
+         L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
+         L2Break = lambda x: x['Close'] < x['L2'],
+         H1Break = lambda x: x['Close'] > x['H1'],
+         H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
+         H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
+         H2Break = lambda x: x['Close'] > x['H2'],
+         OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
+         OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
+         OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
+         OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
+         CloseL1 = lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
+         CloseL2 = lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
+         CloseH1 = lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
+         CloseH2 = lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
+     )
+
+     data['OpenL1'] = data['OpenL1'].shift(-1)
+     data['OpenL2'] = data['OpenL2'].shift(-1)
+     data['OpenH1'] = data['OpenH1'].shift(-1)
+     data['OpenH2'] = data['OpenH2'].shift(-1)
+     data['CloseL1'] = data['CloseL1'].shift(-1)
+     data['CloseL2'] = data['CloseL2'].shift(-1)
+     data['CloseH1'] = data['CloseH1'].shift(-1)
+     data['CloseH2'] = data['CloseH2'].shift(-1)
+
+     level_cols = [
+         'L1Touch',
+         'L2Touch',
+         'H1Touch',
+         'H2Touch',
+         'L1Break',
+         'L2Break',
+         'H1Break',
+         'H2Break'
+     ]
+
+     for col in level_cols:
+         data[col+'Pct'] = data[col].rolling(100).mean()
+         # data[col+'Pct'] = data[col+'Pct'].shift(-1)
+
+     data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+     data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+     data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+     data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+     data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+     data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+
+     data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+     data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+
+     def get_quintiles(df, col_name, q):
+         return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
+
+     probas = []
+     # Given the current price level, look up the historical green rate for its decile
+     for i, pct in enumerate(data['CurrentClose30toClose']):
+         p = None  # reset each iteration so a failed lookup doesn't reuse a stale value
+         try:
+             df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
+             for q in df_q.index:
+                 if q.left <= pct <= q.right:
+                     p = df_q[q]
+         except Exception:
+             p = None
+
+         probas.append(p)
+
+     # gapfills = []
+     # for i, pct in enumerate(data['CurrentGap']):
+     #     try:
+     #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
+     #         for q in df_q.index:
+     #             if q.left <= pct <= q.right:
+     #                 p = df_q[q]
+     #     except:
+     #         p = None
+
+     #     gapfills.append(p)
+
+     data['GreenProbas'] = probas
+     # data['GapFillGreenProba'] = gapfills
+
+     for rid in tqdm(release_ids, desc='Merging econ data'):
+         # Get the name of the release
+         n = releases[rid]['name']
+         # Merge the corresponding DF of the release
+         data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
+         # Create a column that shifts the release indicator up by 1 (tomorrow's release, known today)
+         data[f'{n}_shift'] = data[n].shift(-1)
+         # Fill the rest with zeroes
+         data[n] = data[n].fillna(0)
+         data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+     data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+     def cumul_sum(col):
+         # Count consecutive 1s, resetting the streak on each 0
+         nums = []
+         s = 0
+         for x in col:
+             if x == 1:
+                 s += 1
+             elif x == 0:
+                 s = 0
+             nums.append(s)
+         return nums
+
+     consec_green = cumul_sum(data['GreenDay'].values)
+     consec_red = cumul_sum(data['RedDay'].values)
+
+     data['DaysGreen'] = consec_green
+     data['DaysRed'] = consec_red
+
+     final_row = data.index[-2]  # second-to-last row: the last row's targets are still unknown
+
+     exp_row = data.index[-1]  # the in-progress last row (not used below)
+
+     df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+     df_final = df_final.dropna(subset=['Target','Target_clf'])
+     # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+     return data, df_final, final_row
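
A usage sketch for get_data (assumes API_KEY_FRED is set, the `datasets` package is installed, and the boomsss/spx_intra Hugging Face dataset is reachable):

    from getData import get_data
    data, df_final, final_row = get_data(periods_30m=1)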
getIntraData.py ADDED
@@ -0,0 +1,139 @@
+ import pandas as pd
+ import yfinance as yf
+ import datetime
+ from sqlalchemy import create_engine
+ import os
+ from getDailyData import data_start_date
+ from dotenv import load_dotenv
+
+ # Load environment variables from the .env file
+ load_dotenv()
+
+ def get_intra(periods_30m=1):
+     '''
+     Get historical 30-minute data and append live data to it, if any exists.
+     '''
+     engine = create_engine(
+         f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+         f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+         f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+     )
+
+     query = f'''SELECT
+         spx30.Datetime AS Datetime,
+         spx30.Open AS Open30,
+         spx30.High AS High30,
+         spx30.Low AS Low30,
+         spx30.Close AS Close30,
+         vix30.Open AS Open_VIX30,
+         vix30.High AS High_VIX30,
+         vix30.Low AS Low_VIX30,
+         vix30.Close AS Close_VIX30,
+         vvix30.Open AS Open_VVIX30,
+         vvix30.High AS High_VVIX30,
+         vvix30.Low AS Low_VVIX30,
+         vvix30.Close AS Close_VVIX30
+     FROM
+         SPX_full_30min AS spx30
+     LEFT JOIN
+         VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > '{data_start_date}'
+     LEFT JOIN
+         VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > '{data_start_date}'
+     WHERE
+         spx30.Datetime > '{data_start_date}'
+     '''
+     # (An earlier version loaded these series from a Hugging Face dataset; that path was replaced by the SQL query above.)
+
+     df_30m = pd.read_sql_query(sql=query, con=engine.connect())
+     df_30m['Datetime'] = df_30m['Datetime'].dt.tz_localize('America/New_York')
+     df_30m = df_30m.set_index('Datetime', drop=True)
+
+     # Get incremental date
+     last_date = df_30m.index.date[-1]
+     last_date = last_date + datetime.timedelta(days=1)
+
+     # Get incremental data for each index
+     spx1 = yf.Ticker('^GSPC')
+     vix1 = yf.Ticker('^VIX')
+     vvix1 = yf.Ticker('^VVIX')
+     yfp = spx1.history(start=last_date, interval='30m')
+     yf_vix = vix1.history(start=last_date, interval='30m')
+     yf_vvix = vvix1.history(start=last_date, interval='30m')
+
+     if len(yfp) > 0:
+         # Convert indexes to New York time if not already
+         for _df in [yfp, yf_vix, yf_vvix]:
+             # Check the index type first to avoid touching .tz on a non-datetime index
+             if (type(_df.index) != pd.DatetimeIndex) or (_df.index.tz.zone != 'America/New_York'):
+                 _df['Datetime'] = pd.to_datetime(_df.index)
+                 _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
+                 _df.set_index('Datetime', inplace=True)
+         # Concat them
+         df_inc = pd.concat([
+             yfp[['Open','High','Low','Close']],
+             yf_vix[['Open','High','Low','Close']],
+             yf_vvix[['Open','High','Low','Close']]
+         ], axis=1)
+         df_inc.columns = df_30m.columns
+         df_inc = df_inc.loc[
+             (df_inc.index.time >= datetime.time(9,30)) & (df_inc.index.time < datetime.time(16,00))
+         ]
+         df_30m = pd.concat([df_30m, df_inc])
+     else:
+         df_30m = df_30m.copy()
+
+     # Keep regular-session bars only, then keep the first `periods_30m` bars of each day
+     df_30m = df_30m.loc[
+         (df_30m.index.time >= datetime.time(9,30)) & (df_30m.index.time < datetime.time(16,00))
+     ]
+     df_30m['dt'] = df_30m.index.date
+     df_30m = df_30m.groupby('dt').head(periods_30m)
+     df_30m = df_30m.set_index('dt', drop=True)
+     df_30m.index.name = 'Datetime'
+
+     df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
+     df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
+     df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
+
+     # Collapse the kept bars to one row per day: first open, max high, min low, last close
+     opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
+     highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
+     lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
+     closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
+     spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
+     vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
+     vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
+
+     df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
+     return df_intra
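
A quick usage sketch (assumes the same .env database settings as getDailyData.py):

    from getIntraData import get_intra
    df_intra = get_intra(periods_30m=1)  # one row per day, built from each session's first 30-minute bar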
intraCols.py ADDED
@@ -0,0 +1,48 @@
+ model_cols = [
+     'BigNewsDay',
+     'Quarter',
+     'Perf5Day',
+     'Perf5Day_n1',
+     'DaysGreen',
+     'DaysRed',
+     'CurrentHigh30toClose',
+     'CurrentLow30toClose',
+     'CurrentClose30toClose',
+     'CurrentRange30',
+     'GapFill30',
+     'CurrentGap',
+     'RangePct',
+     'RangePct_n1',
+     'RangePct_n2',
+     'OHLC4_VIX',
+     'OHLC4_VIX_n1',
+     'OHLC4_VIX_n2',
+     'OHLC4_Current_Trend',
+     'OHLC4_Trend',
+     'CurrentVIXTrend',
+     'SPX30IntraPerf',
+     'VIX30IntraPerf',
+     'VVIX30IntraPerf',
+     # 'OpenL1',
+     # 'OpenL2',
+     # 'OpenH1',
+     # 'OpenH2',
+     'L1TouchPct',
+     'L2TouchPct',
+     'H1TouchPct',
+     'H2TouchPct',
+     'L1BreakPct',
+     'L2BreakPct',
+     'H1BreakPct',
+     'H2BreakPct',
+     'GreenProbas',
+     'H1BreakTouchPct',
+     'H2BreakTouchPct',
+     'L1BreakTouchPct',
+     'L2BreakTouchPct',
+     'H1BreakH2TouchPct',
+     'L1BreakL2TouchPct',
+     'H1TouchGreenPct',
+     'L1TouchRedPct'
+     # 'GapFillGreenProba'
+ ]
lambda_function.py ADDED
@@ -0,0 +1,17 @@
+ # Gets the data, runs the whole model, and returns a single prediction for the given time slot
+ from getDailyData import get_daily
+ from model_intra_v3 import walk_forward_validation
+ import json
+
+ def lambda_handler(periods_30m):
+     data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
+     res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
+     # Round-trip through JSON so the payload contains only JSON-serializable native types
+     return json.loads(json.dumps({
+         'date': str(res.index[-1]),
+         'prediction': res['Predicted'].iloc[-1],
+         'time': periods_30m
+     }))
+
+ if __name__ == '__main__':
+     j = lambda_handler(1)
+     print(j)
model_day_v2.py ADDED
@@ -0,0 +1,112 @@
+ import pandas as pd
+ from tqdm import tqdm
+ from sklearn import linear_model
+ import lightgbm as lgb
+ from dailyCols import model_cols
+
+ def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+
+     # Create a linear regression model
+     # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state = 42)
+     model = linear_model.LinearRegression()
+
+     overall_results = []
+     # Iterate over the rows in the DataFrame, one step at a time
+     for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
+         # Split the data into training and test sets
+         X_train = df.drop(target_column, axis=1).iloc[:i]
+         y_train = df[target_column].iloc[:i]
+         X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
+         y_test = df[target_column].iloc[i:i+num_periods]
+
+         # Fit the model to the training data
+         model.fit(X_train, y_train)
+
+         # Make a prediction on the test data
+         predictions = model.predict(X_test)
+
+         # Create a DataFrame to store the true and predicted values
+         result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+         overall_results.append(result_df)
+
+     df_results = pd.concat(overall_results)
+     # model.save_model('model_lr.bin')
+     # Return the true and predicted values, and fitted model
+     return df_results, model
+
+ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+
+     # Run the regression model first so its output can feed the classifier
+     res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+     # joblib.dump(model1, 'model1.bin')
+
+     # Merge the result df back on the df for feeding into the classifier
+     for_merge = res[['Predicted']].copy()  # .copy() added to avoid pandas SettingWithCopy warnings
+     for_merge.columns = ['RegrModelOut']
+     for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+     df = df.merge(for_merge, left_index=True, right_index=True)
+     df = df.drop(columns=[target_column_regr])
+     df = df[model_cols + ['RegrModelOut', target_column_clf]]
+
+     df[target_column_clf] = df[target_column_clf].astype(bool)
+     df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+     # Create a LightGBM classifier
+     # model2 = xgb.XGBClassifier(n_estimators=10, random_state = 42)
+     model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+     # model = linear_model.LogisticRegression(max_iter=1500)
+
+     overall_results = []
+     # Iterate over the rows in the DataFrame, one step at a time
+     for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), 'CLF Model'):
+         # Split the data into training and test sets
+         X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+         y_train = df[target_column_clf].iloc[:i]
+         X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
+         y_test = df[target_column_clf].iloc[i:i+num_periods]
+
+         # Fit the model to the training data
+         model2.fit(X_train, y_train)
+
+         # Make a prediction on the test data
+         predictions = model2.predict_proba(X_test)[:,-1]
+
+         # Create a DataFrame to store the true and predicted values
+         result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+         overall_results.append(result_df)
+
+     df_results = pd.concat(overall_results)
+
+     # Calibrate probabilities: bucket past predictions and use each bucket's realized green rate
+     def get_quantiles(df, col_name, q):
+         return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+     greenprobas = []
+     meanprobas = []
+     for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
+         p = None  # reset each iteration so a failed lookup doesn't reuse a stale value
+         c = None
+         try:
+             df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+             for q in df_q.index:
+                 if q.left <= pct <= q.right:
+                     p = df_q[q]
+                     c = (q.left + q.right) / 2
+         except Exception:
+             p = None
+             c = None
+
+         greenprobas.append(p)
+         meanprobas.append(c)
+
+     df_results['CalibPredicted'] = greenprobas
+     # meanprobas (bucket midpoints) is computed but not attached to df_results
+
+     return df_results, model1, model2
+
+ def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+     regr_pred = trained_reg_model.predict(df)
+     regr_pred = regr_pred > 0
+     new_df = df.copy()
+     new_df['RegrModelOut'] = regr_pred
+     clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
+     return clf_pred_proba
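
A usage sketch of the sequential daily model (the 1000-row training window and 1-step horizon are illustrative choices, not values fixed by this file):

    from getDailyData import get_daily
    from model_day_v2 import walk_forward_validation_seq

    data, df_final, final_row = get_daily(mode='daily')
    df_results, model1, model2 = walk_forward_validation_seq(df_final.dropna(), 'Target_clf', 'Target', 1000, 1)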
model_intra_v2.py ADDED
@@ -0,0 +1,74 @@
+ import pandas as pd
+ from tqdm import tqdm
+ import lightgbm as lgb
+ from sklearn.model_selection import TimeSeriesSplit
+ from intraCols import model_cols
+
+ def walk_forward_validation(df, target_column, num_periods, mode='full'):
+
+     df = df[model_cols + [target_column]].copy()  # .copy() added to avoid mutating the caller's frame
+     df[target_column] = df[target_column].astype(bool)
+
+     # Expanding-window splits; note n_splits=len(df)-1 only leaves room for test_size=1,
+     # so larger num_periods values would need fewer splits
+     tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods)
+
+     if mode == 'full':
+         overall_results = []
+         # Iterate over the splits, one step at a time
+         for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
+             # Extract the training and testing data for the current split
+             X_train = df.drop(target_column, axis=1).iloc[train_index]
+             y_train = df[target_column].iloc[train_index]
+             X_test = df.drop(target_column, axis=1).iloc[test_index]
+             y_test = df[target_column].iloc[test_index]
+
+             y_train = y_train.astype(bool)
+             model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+             model.fit(X_train, y_train)
+             # Make a prediction on the test data
+             predictions = model.predict_proba(X_test)[:,-1]
+
+             # Create a DataFrame to store the true and predicted values
+             result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+             overall_results.append(result_df)
+         df_results = pd.concat(overall_results)
+
+         # Calibrate probabilities: bucket past predictions and use each bucket's realized green rate
+         def get_quantiles(df, col_name, q):
+             return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+         greenprobas = []
+         for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
+             p = None  # reset each iteration so a failed lookup doesn't reuse a stale value
+             try:
+                 df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+                 for q in df_q.index:
+                     if q.left <= pct <= q.right:
+                         p = df_q[q]
+             except Exception:
+                 p = None
+
+             greenprobas.append(p)
+
+         df_results['CalibPredicted'] = greenprobas
+
+         return df_results, model
+
+     elif mode == 'single':
+         # Train on everything but the last row; predict the last row only
+         X_train = df.drop(target_column, axis=1).iloc[:-1]
+         y_train = df[target_column].iloc[:-1]
+         X_test = df.drop(target_column, axis=1).iloc[-1]
+         y_test = df[target_column].iloc[-1]
+         y_train = y_train.astype(bool)
+         model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+         model.fit(X_train, y_train)
+         predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
+         result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+
+         return result_df, model
+
+ def seq_predict_proba(df, trained_clf_model):
+     clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
+     return clf_pred_proba
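
A usage sketch (mode='full' backtests with expanding windows; mode='single' trains on all but the latest row and scores only that row, which is how lambda_function.py calls the v3 variant):

    from getDailyData import get_daily
    from model_intra_v2 import walk_forward_validation

    data, df_final, final_row = get_daily(mode='intra', periods_30m=1)
    res, model = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')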
model_intra_v3.py ADDED
@@ -0,0 +1,24 @@
+ import pandas as pd
+ import lightgbm as lgb
+ from intraCols import model_cols
+
+ # num_periods and mode are unused here; the signature presumably mirrors model_intra_v2 for drop-in compatibility
+ def walk_forward_validation(df, target_column, num_periods, mode='full'):
+
+     df = df[model_cols + [target_column]].copy()  # .copy() added to avoid mutating the caller's frame
+     df[target_column] = df[target_column].astype(bool)
+
+     # Train on everything but the last row; predict the last row only
+     X_train = df.drop(target_column, axis=1).iloc[:-1]
+     y_train = df[target_column].iloc[:-1]
+     X_test = df.drop(target_column, axis=1).iloc[-1]
+     y_test = df[target_column].iloc[-1]
+     y_train = y_train.astype(bool)
+     model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+     model.fit(X_train, y_train)
+     predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
+     result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+
+     return result_df, model
+
+ def seq_predict_proba(df, trained_clf_model):
+     clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
+     return clf_pred_proba
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ pandas
+ numpy
+ yfinance==0.2.28
+ requests
+ typing_extensions
+ lightgbm
+ tqdm
+ fastjsonschema
+ json5
+ jsonschema
+ holidays
+ pytz
+ mysqlclient
+ sqlalchemy<2.0
+ python-dotenv
+ # note: getData.py additionally imports pandas_datareader, beautifulsoup4 (with lxml for the 'xml' parser), datasets, and scikit-learn, which are not listed here