Commit 2dff47b
Parent(s): first commit
Files changed:
- .gitignore +4 -0
- ca-certificates.crt +0 -0
- dailyCols.py +33 -0
- getDailyData.py +320 -0
- getData.py +466 -0
- getIntraData.py +139 -0
- intraCols.py +48 -0
- lambda_function.py +17 -0
- model_day_v2.py +112 -0
- model_intra_v2.py +74 -0
- model_intra_v3.py +24 -0
- requirements.txt +15 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+/.env
+/.venv
+/hss.pem ec2-user@ec2-18-1
+/__pycache__
ca-certificates.crt
ADDED
The diff for this file is too large to render.
dailyCols.py
ADDED
@@ -0,0 +1,33 @@
+model_cols = [
+    'BigNewsDay',
+    'Quarter',
+    'Perf5Day',
+    'Perf5Day_n1',
+    'DaysGreen',
+    'DaysRed',
+    'CurrentGap',
+    'RangePct',
+    'RangePct_n1',
+    'RangePct_n2',
+    'OHLC4_VIX',
+    'OHLC4_VIX_n1',
+    'OHLC4_VIX_n2',
+    'VIXOpen',
+    'VVIXOpen',
+    'OpenL1',
+    'OpenL2',
+    'OpenH1',
+    'OpenH2',
+    'L1TouchPct',
+    'L2TouchPct',
+    'H1TouchPct',
+    'H2TouchPct',
+    'L1BreakPct',
+    'L2BreakPct',
+    'H1BreakPct',
+    'H2BreakPct',
+    'H1BreakTouchPct',
+    'H2BreakTouchPct',
+    'L1BreakTouchPct',
+    'L2BreakTouchPct'
+]
getDailyData.py
ADDED
@@ -0,0 +1,320 @@
+import pandas as pd
+import numpy as np
+import yfinance as yf
+from tqdm import tqdm
+import os
+from pandas.tseries.offsets import BDay
+from sqlalchemy import create_engine
+from dotenv import load_dotenv
+load_dotenv()
+
+data_start_date = '2018-07-01'
+
+def get_daily(mode='daily', periods_30m=None):
+    '''
+    Get daily data and create daily features. Optionally append intra data if specified.
+    `mode`: 'daily' or 'intra'.
+    `periods_30m`: how many 30m periods to bring in. Only specify if mode == 'intra'.
+    '''
+
+    vix = yf.Ticker('^VIX')
+    vvix = yf.Ticker('^VVIX')
+    spx = yf.Ticker('^GSPC')
+
+    # Grab data from db
+    engine = create_engine(
+        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+    )
+
+    query = f'''SELECT
+        spx.Datetime AS Datetime,
+        spx.Open AS Open,
+        spx.High AS High,
+        spx.Low AS Low,
+        spx.Close AS Close,
+        vix.Open AS Open_VIX,
+        vix.High AS High_VIX,
+        vix.Low AS Low_VIX,
+        vix.Close AS Close_VIX,
+        vvix.Open AS Open_VVIX,
+        vvix.High AS High_VVIX,
+        vvix.Low AS Low_VVIX,
+        vvix.Close AS Close_VVIX
+    FROM
+        SPX_full_1day AS spx
+    LEFT JOIN
+        VIX_full_1day AS vix ON spx.Datetime = vix.Datetime AND vix.Datetime > '{data_start_date}'
+    LEFT JOIN
+        VVIX_full_1day AS vvix ON spx.Datetime = vvix.Datetime AND vvix.Datetime > '{data_start_date}'
+    WHERE
+        spx.Datetime > '{data_start_date}'
+    '''
+    data = pd.read_sql_query(sql=query, con=engine.connect())
+    data['Datetime'] = pd.to_datetime(data['Datetime'])
+    data = data.set_index('Datetime', drop=True)
+
+    # Get incremental date: pull anything newer than the DB from yfinance
+    last_date = data.index.date[-1]
+    last_date = last_date + BDay(1)
+
+    prices_vix = vix.history(start=last_date, interval='1d')
+    prices_vvix = vvix.history(start=last_date, interval='1d')
+    prices_spx = spx.history(start=last_date, interval='1d')
+
+    if len(prices_spx) > 0:
+        # Normalize each yfinance index to a plain date-based DatetimeIndex
+        prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+        prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+        prices_spx.index = prices_spx['index']
+        prices_spx = prices_spx.drop(columns='index')
+        prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+        prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+        prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+        prices_vix.index = prices_vix['index']
+        prices_vix = prices_vix.drop(columns='index')
+        prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+        prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
+        prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
+        prices_vvix.index = prices_vvix['index']
+        prices_vvix = prices_vvix.drop(columns='index')
+        prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
+
+        data1 = prices_spx.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+        data1 = data1.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+        data = pd.concat([data, data1])
+    else:
+        data = data.copy()
+
+    if mode == 'intra':
+        from getIntraData import get_intra
+        df_intra = get_intra(periods_30m)
+        data = data.merge(df_intra, left_index=True, right_index=True)
+    else:
+        data = data.copy()
+
+    # Features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
+
+    data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
+    data['VVIX5Day_n1'] = data['VVIX5Day'].astype(bool)
+
+    data['VIXOpen'] = data['Open_VIX'] > data['Close_VIX'].shift(1)
+    data['VVIXOpen'] = data['Open_VVIX'] > data['Close_VVIX'].shift(1)
+    data['VIXOpen'] = data['VIXOpen'].astype(bool)
+    data['VVIXOpen'] = data['VVIXOpen'].astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+    data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)  # NB: same 1-day lag as _n1; shift(2) was likely intended
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGapHist'] = data['CurrentGap'].copy()
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['DayOfWeek'] = data['DayOfWeek'].dt.day  # overwritten with .dt.weekday below
+
+    # Target -- next day's OHLC4 return relative to today's close
+    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+    data['Target'] = data['Target'].shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Target for clf -- whether tomorrow will close above or below today's close
+    data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['Target_clf'] = data['Target_clf'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+    # Calculate up: prior day's open-to-high move as a % of prior close
+    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
+
+    # Calculate upSD
+    data['upSD'] = data['up'].rolling(30).std(ddof=0)
+
+    # Calculate aveUp
+    data['aveUp'] = data['up'].rolling(30).mean()
+    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
+    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
+    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
+    data['downSD'] = data['down'].rolling(30).std(ddof=0)
+    data['aveDown'] = data['down'].rolling(30).mean()
+    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
+    data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
+
+    data = data.assign(
+        L1Touch = lambda x: x['Low'] < x['L1'],
+        L2Touch = lambda x: x['Low'] < x['L2'],
+        H1Touch = lambda x: x['High'] > x['H1'],
+        H2Touch = lambda x: x['High'] > x['H2'],
+        L1Break = lambda x: x['Close'] < x['L1'],
+        L1TouchRed = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['PrevClose']),  # NB: the touch is tested against L2 here
+        L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
+        L2Break = lambda x: x['Close'] < x['L2'],
+        H1Break = lambda x: x['Close'] > x['H1'],
+        H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
+        H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
+        H2Break = lambda x: x['Close'] > x['H2'],
+        OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
+        OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
+        OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
+        OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0)
+    )
+
+    data['OpenL1'] = data['OpenL1'].shift(-1)
+    data['OpenL2'] = data['OpenL2'].shift(-1)
+    data['OpenH1'] = data['OpenH1'].shift(-1)
+    data['OpenH2'] = data['OpenH2'].shift(-1)
+
+    level_cols = [
+        'L1Touch',
+        'L2Touch',
+        'H1Touch',
+        'H2Touch',
+        'L1Break',
+        'L2Break',
+        'H1Break',
+        'H2Break'
+    ]
+
+    for col in level_cols:
+        data[col+'Pct'] = data[col].rolling(100).mean()
+        # data[col+'Pct'] = data[col+'Pct'].shift(-1)
+
+    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+    data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+    data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+    data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+
+    data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+    data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+
+    if mode == 'intra':
+        # Intraday features
+        data['CurrentOpen30'] = data['Open30'].shift(-1)
+        data['CurrentHigh30'] = data['High30'].shift(-1)
+        data['CurrentLow30'] = data['Low30'].shift(-1)
+        data['CurrentClose30'] = data['Close30'].shift(-1)
+        data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].max(axis=1)
+        data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
+        data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
+        data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
+        data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
+        data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
+
+        data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
+
+        # Open to High
+        data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+        data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+        data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+        data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+        data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+        data['CloseL1'] = np.where(data['Close30'] < data['L1'], 1, 0)
+        data['CloseL2'] = np.where(data['Close30'] < data['L2'], 1, 0)
+        data['CloseH1'] = np.where(data['Close30'] > data['H1'], 1, 0)
+        data['CloseH2'] = np.where(data['Close30'] > data['H2'], 1, 0)
+        data['CloseL1'] = data['CloseL1'].shift(-1)
+        data['CloseL2'] = data['CloseL2'].shift(-1)
+        data['CloseH1'] = data['CloseH1'].shift(-1)
+        data['CloseH2'] = data['CloseH2'].shift(-1)
+
+        def get_quintiles(df, col_name, q):
+            return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
+
+        probas = []
+        # Given the current price level, estimate P(green) from the matching historical decile
+        for i, pct in enumerate(data['CurrentClose30toClose']):
+            try:
+                # Split history into deciles of the 30m close-to-prev-close move
+                df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
+                for q in df_q.index:
+                    if q.left <= pct <= q.right:
+                        p = df_q[q]
+            except:
+                p = None
+
+            probas.append(p)
+
+        data['GreenProbas'] = probas
+
+    engine = create_engine(
+        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+    )
+
+    df_releases = pd.read_sql_query('select * from releases', con=engine)
+    df_releases = df_releases.set_index('Datetime')
+    data = data.merge(df_releases, how='left', left_index=True, right_index=True)
+
+    for n in tqdm(df_releases.columns, desc='Merging econ data'):
+        # Create a column that shifts the release indicator up by 1 (i.e. "release tomorrow")
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    consec_green = cumul_sum(data['GreenDay'].values)
+    consec_red = cumul_sum(data['RedDay'].values)
+
+    data['DaysGreen'] = consec_green
+    data['DaysRed'] = consec_red
+
+    final_row = data.index[-2]
+
+    if mode == 'daily':
+        from dailyCols import model_cols
+    elif mode == 'intra':
+        from intraCols import model_cols
+
+    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+    df_final = df_final.dropna(subset=['Target','Target_clf'])
+    # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row
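A minimal usage sketch (not part of the commit) of how get_daily might be driven for the daily model, assuming the DATABASE_* variables are available in .env and the tables referenced above exist:

# Hypothetical driver for get_daily in daily mode
from getDailyData import get_daily

data, df_final, final_row = get_daily(mode='daily')
X = df_final.drop(columns=['Target', 'Target_clf'])   # feature matrix (the dailyCols model_cols)
y = df_final['Target_clf'].astype(bool)               # next-day green/red label
print(X.shape, y.mean())                              # sample count and base rate of green days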
getData.py
ADDED
@@ -0,0 +1,466 @@
+import pandas as pd
+import pandas_datareader as pdr
+import numpy as np
+import yfinance as yf
+import requests
+from bs4 import BeautifulSoup
+from typing import List
+from tqdm import tqdm
+import os
+import datetime
+from datasets import load_dataset  # used below; this import was missing in the original
+
+data_start_date = '2018-07-01'  # referenced below; defined in getDailyData.py but never imported here in the original
+
+model_cols = [
+    'BigNewsDay',
+    'Quarter',
+    'Perf5Day',
+    'Perf5Day_n1',
+    'DaysGreen',
+    'DaysRed',
+    'CurrentHigh30toClose',
+    'CurrentLow30toClose',
+    'CurrentClose30toClose',
+    'CurrentRange30',
+    'GapFill30',
+    'CurrentGap',
+    'RangePct',
+    'RangePct_n1',
+    'RangePct_n2',
+    'OHLC4_VIX',
+    'OHLC4_VIX_n1',
+    'OHLC4_VIX_n2',
+    'OHLC4_Current_Trend',
+    'OHLC4_Trend',
+    'CurrentVIXTrend',
+    'SPX30IntraPerf',
+    'VIX30IntraPerf',
+    'VVIX30IntraPerf',
+    # 'OpenL1',
+    # 'OpenL2',
+    # 'OpenH1',
+    # 'OpenH2',
+    'L1TouchPct',
+    'L2TouchPct',
+    'H1TouchPct',
+    'H2TouchPct',
+    'L1BreakPct',
+    'L2BreakPct',
+    'H1BreakPct',
+    'H2BreakPct',
+    'GreenProbas',
+    'H1BreakTouchPct',
+    'H2BreakTouchPct',
+    'L1BreakTouchPct',
+    'L2BreakTouchPct',
+    'H1BreakH2TouchPct',
+    'L1BreakL2TouchPct',
+    'H1TouchGreenPct',
+    'L1TouchRedPct'
+    # 'GapFillGreenProba'
+]
+
+def get_data(periods_30m = 1):
+    # f = open('settings.json')
+    # j = json.load(f)
+    # API_KEY_FRED = j["API_KEY_FRED"]
+
+    API_KEY_FRED = os.getenv('API_KEY_FRED')
+
+    def parse_release_dates(release_id: str) -> List[str]:
+        release_dates_url = f'https://api.stlouisfed.org/fred/release/dates?release_id={release_id}&realtime_start=2015-01-01&include_release_dates_with_no_data=true&api_key={API_KEY_FRED}'
+        r = requests.get(release_dates_url)
+        text = r.text
+        soup = BeautifulSoup(text, 'xml')
+        dates = []
+        for release_date_tag in soup.find_all('release_date', {'release_id': release_id}):
+            dates.append(release_date_tag.text)
+        return dates
+
+    econ_dfs = {}
+
+    econ_tickers = [
+        'WALCL',
+        'NFCI',
+        'WRESBAL'
+    ]
+
+    for et in tqdm(econ_tickers, desc='getting econ tickers'):
+        df = pdr.get_data_fred(et)
+        df.index = df.index.rename('ds')
+        econ_dfs[et] = df
+
+    release_ids = [
+        "10",   # "Consumer Price Index"
+        "46",   # "Producer Price Index"
+        "50",   # "Employment Situation"
+        "53",   # "Gross Domestic Product"
+        "103",  # "Discount Rate Meeting Minutes"
+        "180",  # "Unemployment Insurance Weekly Claims Report"
+        "194",  # "ADP National Employment Report"
+        "323"   # "Trimmed Mean PCE Inflation Rate"
+    ]
+
+    release_names = [
+        "CPI",
+        "PPI",
+        "NFP",
+        "GDP",
+        "FOMC",
+        "UNEMP",
+        "ADP",
+        "PCE"
+    ]
+
+    releases = {}
+
+    for rid, n in tqdm(zip(release_ids, release_names), total=len(release_ids), desc='Getting release dates'):
+        releases[rid] = {}
+        releases[rid]['dates'] = parse_release_dates(rid)
+        releases[rid]['name'] = n
+
+    # Create a DF that has all dates with the name of the col as 1.
+    # Once merged on the main dataframe, days with econ events will be 1 or None. Fill NA with 0.
+    # This column serves as the true/false indicator of whether there was economic data released that day.
+    for rid in tqdm(release_ids, desc='Making indicators'):
+        releases[rid]['df'] = pd.DataFrame(
+            index=releases[rid]['dates'],
+            data={
+                releases[rid]['name']: 1
+            })
+        releases[rid]['df'].index = pd.DatetimeIndex(releases[rid]['df'].index)
+
+    vix = yf.Ticker('^VIX')
+    vvix = yf.Ticker('^VVIX')
+    spx = yf.Ticker('^GSPC')
+
+    # Pull in data
+    data_files = {"spx": "SPX_full_30min.txt", "vix": "VIX_full_30min.txt", "vvix": 'VVIX_full_30min.txt'}
+    data = load_dataset("boomsss/spx_intra", data_files=data_files)
+    dfs = []
+    for ticker in data.keys():
+        rows = [d['text'] for d in data[ticker]]
+        rows = [x.split(',') for x in rows]
+
+        fr = pd.DataFrame(columns=[
+            'Datetime','Open','High','Low','Close'
+        ], data=rows)
+
+        fr['Datetime'] = pd.to_datetime(fr['Datetime'])
+        fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
+        fr = fr.set_index('Datetime')
+        fr['Open'] = pd.to_numeric(fr['Open'])
+        fr['High'] = pd.to_numeric(fr['High'])
+        fr['Low'] = pd.to_numeric(fr['Low'])
+        fr['Close'] = pd.to_numeric(fr['Close'])
+        dfs.append(fr)
+
+    df_30m = pd.concat(dfs, axis=1)
+
+    df_30m.columns = [
+        'Open30',
+        'High30',
+        'Low30',
+        'Close30',
+        'Open_VIX30',
+        'High_VIX30',
+        'Low_VIX30',
+        'Close_VIX30',
+        'Open_VVIX30',
+        'High_VVIX30',
+        'Low_VVIX30',
+        'Close_VVIX30'
+    ]
+
+    # Get incremental date
+    last_date = df_30m.index.date[-1]
+    last_date = last_date + datetime.timedelta(days=1)
+
+    # Get incremental data for each index
+    spx1 = yf.Ticker('^GSPC')
+    vix1 = yf.Ticker('^VIX')
+    vvix1 = yf.Ticker('^VVIX')
+    yfp = spx1.history(start=last_date, interval='30m')
+    yf_vix = vix1.history(start=last_date, interval='30m')
+    yf_vvix = vvix1.history(start=last_date, interval='30m')
+
+    if len(yfp) > 0:
+        # Convert indexes to EST if not already
+        for _df in [yfp, yf_vix, yf_vvix]:
+            if _df.index.tz.zone != 'America/New_York':
+                _df['Datetime'] = pd.to_datetime(_df.index)
+                _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
+                _df.set_index('Datetime', inplace=True)
+        # Concat them
+        df_inc = pd.concat([
+            yfp[['Open','High','Low','Close']],
+            yf_vix[['Open','High','Low','Close']],
+            yf_vvix[['Open','High','Low','Close']]
+        ], axis=1)
+        df_inc.columns = df_30m.columns
+        df_inc = df_inc.loc[
+            (df_inc.index.time >= datetime.time(9,30)) & (df_inc.index.time < datetime.time(16,00))
+        ]
+        df_30m = pd.concat([df_30m, df_inc])
+    else:
+        df_30m = df_30m.copy()
+
+    # Keep regular-session bars only, then the first `periods_30m` bars of each day
+    df_30m = df_30m.loc[
+        (df_30m.index.time >= datetime.time(9,30)) & (df_30m.index.time < datetime.time(16,00))
+    ]
+    df_30m['dt'] = df_30m.index.date
+    df_30m = df_30m.groupby('dt').head(periods_30m)
+    df_30m = df_30m.set_index('dt', drop=True)
+    df_30m.index.name = 'Datetime'
+
+    df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
+    df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
+    df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
+
+    # Collapse the kept bars to one row per session
+    opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
+    highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
+    lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
+    closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
+    spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
+    vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
+    vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
+
+    df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
+
+    prices_vix = vix.history(start=data_start_date, interval='1d')
+    prices_vvix = vvix.history(start=data_start_date, interval='1d')
+    prices_spx = spx.history(start=data_start_date, interval='1d')
+
+    prices_spx['index'] = [str(x).split()[0] for x in prices_spx.index]
+    prices_spx['index'] = pd.to_datetime(prices_spx['index']).dt.date
+    prices_spx.index = prices_spx['index']
+    prices_spx = prices_spx.drop(columns='index')
+    prices_spx.index = pd.DatetimeIndex(prices_spx.index)
+
+    prices_vix['index'] = [str(x).split()[0] for x in prices_vix.index]
+    prices_vix['index'] = pd.to_datetime(prices_vix['index']).dt.date
+    prices_vix.index = prices_vix['index']
+    prices_vix = prices_vix.drop(columns='index')
+    prices_vix.index = pd.DatetimeIndex(prices_vix.index)
+
+    prices_vvix['index'] = [str(x).split()[0] for x in prices_vvix.index]
+    prices_vvix['index'] = pd.to_datetime(prices_vvix['index']).dt.date
+    prices_vvix.index = prices_vvix['index']
+    prices_vvix = prices_vvix.drop(columns='index')
+    prices_vvix.index = pd.DatetimeIndex(prices_vvix.index)
+
+    data = prices_spx.merge(df_intra, left_index=True, right_index=True)
+    data = data.merge(prices_vix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VIX'])
+    data = data.merge(prices_vvix[['Open','High','Low','Close']], left_index=True, right_index=True, suffixes=['','_VVIX'])
+
+    # Features
+    data['PrevClose'] = data['Close'].shift(1)
+    data['Perf5Day'] = data['Close'] > data['Close'].shift(5)
+    data['Perf5Day_n1'] = data['Perf5Day'].shift(1)
+    data['Perf5Day_n1'] = data['Perf5Day_n1'].astype(bool)
+    data['GreenDay'] = (data['Close'] > data['PrevClose']) * 1
+    data['RedDay'] = (data['Close'] <= data['PrevClose']) * 1
+
+    data['VIX5Day'] = data['Close_VIX'] > data['Close_VIX'].shift(5)
+    data['VIX5Day_n1'] = data['VIX5Day'].astype(bool)
+
+    data['VVIX5Day'] = data['Close_VVIX'] > data['Close_VVIX'].shift(5)
+    data['VVIX5Day_n1'] = data['VVIX5Day'].astype(bool)
+
+    data['Range'] = data[['Open','High']].max(axis=1) - data[['Low','Open']].min(axis=1)  # Current day range in points
+    data['RangePct'] = data['Range'] / data['Close']
+    data['VIXLevel'] = pd.qcut(data['Close_VIX'], 4)
+    data['OHLC4_VIX'] = data[['Open_VIX','High_VIX','Low_VIX','Close_VIX']].mean(axis=1)
+    data['OHLC4'] = data[['Open','High','Low','Close']].mean(axis=1)
+    data['OHLC4_Trend'] = data['OHLC4'] > data['OHLC4'].shift(1)
+    data['OHLC4_Trend'] = data['OHLC4_Trend'].astype(bool)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend'].shift(1)
+    data['OHLC4_Trend_n1'] = data['OHLC4_Trend_n1'].astype(float)
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend'].shift(1)  # NB: same 1-day lag as _n1; shift(2) was likely intended
+    data['OHLC4_Trend_n2'] = data['OHLC4_Trend_n2'].astype(float)
+    data['RangePct_n1'] = data['RangePct'].shift(1)
+    data['RangePct_n2'] = data['RangePct'].shift(2)
+    data['OHLC4_VIX_n1'] = data['OHLC4_VIX'].shift(1)
+    data['OHLC4_VIX_n2'] = data['OHLC4_VIX'].shift(2)
+    data['CurrentGap'] = (data['Open'] - data['PrevClose']) / data['PrevClose']
+    data['CurrentGapHist'] = data['CurrentGap'].copy()
+    data['CurrentGap'] = data['CurrentGap'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['DayOfWeek'] = data['DayOfWeek'].dt.day  # overwritten with .dt.weekday below
+
+    # Intraday features
+    data['CurrentOpen30'] = data['Open30'].shift(-1)
+    data['CurrentHigh30'] = data['High30'].shift(-1)
+    data['CurrentLow30'] = data['Low30'].shift(-1)
+    data['CurrentClose30'] = data['Close30'].shift(-1)
+    data['CurrentOHLC430'] = data[['CurrentOpen30','CurrentHigh30','CurrentLow30','CurrentClose30']].max(axis=1)
+    data['OHLC4_Current_Trend'] = data['CurrentOHLC430'] > data['OHLC4']
+    data['OHLC4_Current_Trend'] = data['OHLC4_Current_Trend'].astype(bool)
+    data['HistClose30toPrevClose'] = (data['Close30'] / data['PrevClose']) - 1
+
+    data['CurrentCloseVIX30'] = data['Close_VIX30'].shift(-1)
+    data['CurrentOpenVIX30'] = data['Open_VIX30'].shift(-1)
+
+    data['CurrentVIXTrend'] = data['CurrentCloseVIX30'] > data['Close_VIX']
+
+    # Open to High
+    data['CurrentHigh30toClose'] = (data['CurrentHigh30'] / data['Close']) - 1
+    data['CurrentLow30toClose'] = (data['CurrentLow30'] / data['Close']) - 1
+    data['CurrentClose30toClose'] = (data['CurrentClose30'] / data['Close']) - 1
+    data['CurrentRange30'] = (data['CurrentHigh30'] - data['CurrentLow30']) / data['Close']
+    data['GapFill30'] = [low <= prev_close if gap > 0 else high >= prev_close for high, low, prev_close, gap in zip(data['CurrentHigh30'], data['CurrentLow30'], data['Close'], data['CurrentGap'])]
+
+    # Target -- next day's OHLC4 return relative to today's close
+    data['Target'] = (data['OHLC4'] / data['PrevClose']) - 1
+    data['Target'] = data['Target'].shift(-1)
+    # data['Target'] = data['RangePct'].shift(-1)
+
+    # Target for clf -- whether tomorrow will close above or below today's close
+    data['Target_clf'] = data['Close'] > data['PrevClose']
+    data['Target_clf'] = data['Target_clf'].shift(-1)
+    data['DayOfWeek'] = pd.to_datetime(data.index)
+    data['Quarter'] = data['DayOfWeek'].dt.quarter
+    data['DayOfWeek'] = data['DayOfWeek'].dt.weekday
+
+    # Calculate up
+    data['up'] = 100 * (data['High'].shift(1) - data['Open'].shift(1)) / data['Close'].shift(1)
+
+    # Calculate upSD
+    data['upSD'] = data['up'].rolling(30).std(ddof=0)
+
+    # Calculate aveUp
+    data['aveUp'] = data['up'].rolling(30).mean()
+    data['H1'] = data['Open'] + (data['aveUp'] / 100) * data['Open']
+    data['H2'] = data['Open'] + ((data['aveUp'] + data['upSD']) / 100) * data['Open']
+    data['down'] = 100 * (data['Open'].shift(1) - data['Low'].shift(1)) / data['Close'].shift(1)
+    data['downSD'] = data['down'].rolling(30).std(ddof=0)
+    data['aveDown'] = data['down'].rolling(30).mean()
+    data['L1'] = data['Open'] - (data['aveDown'] / 100) * data['Open']
+    data['L2'] = data['Open'] - ((data['aveDown'] + data['downSD']) / 100) * data['Open']
+
+    data = data.assign(
+        L1Touch = lambda x: x['Low'] < x['L1'],
+        L2Touch = lambda x: x['Low'] < x['L2'],
+        H1Touch = lambda x: x['High'] > x['H1'],
+        H2Touch = lambda x: x['High'] > x['H2'],
+        L1Break = lambda x: x['Close'] < x['L1'],
+        L1TouchRed = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['PrevClose']),  # NB: the touch is tested against L2 here
+        L2TouchL1Break = lambda x: (x['Low'] < x['L2']) & (x['Close'] < x['L1']),
+        L2Break = lambda x: x['Close'] < x['L2'],
+        H1Break = lambda x: x['Close'] > x['H1'],
+        H1TouchGreen = lambda x: (x['High'] > x['H1']) & (x['Close'] > x['PrevClose']),
+        H2TouchH1Break = lambda x: (x['High'] > x['H2']) & (x['Close'] > x['H1']),
+        H2Break = lambda x: x['Close'] > x['H2'],
+        OpenL1 = lambda x: np.where(x['Open'] < x['L1'], 1, 0),
+        OpenL2 = lambda x: np.where(x['Open'] < x['L2'], 1, 0),
+        OpenH1 = lambda x: np.where(x['Open'] > x['H1'], 1, 0),
+        OpenH2 = lambda x: np.where(x['Open'] > x['H2'], 1, 0),
+        CloseL1 = lambda x: np.where(x['Close30'] < x['L1'], 1, 0),
+        CloseL2 = lambda x: np.where(x['Close30'] < x['L2'], 1, 0),
+        CloseH1 = lambda x: np.where(x['Close30'] > x['H1'], 1, 0),
+        CloseH2 = lambda x: np.where(x['Close30'] > x['H2'], 1, 0)
+    )
+
+    data['OpenL1'] = data['OpenL1'].shift(-1)
+    data['OpenL2'] = data['OpenL2'].shift(-1)
+    data['OpenH1'] = data['OpenH1'].shift(-1)
+    data['OpenH2'] = data['OpenH2'].shift(-1)
+    data['CloseL1'] = data['CloseL1'].shift(-1)
+    data['CloseL2'] = data['CloseL2'].shift(-1)
+    data['CloseH1'] = data['CloseH1'].shift(-1)
+    data['CloseH2'] = data['CloseH2'].shift(-1)
+
+    level_cols = [
+        'L1Touch',
+        'L2Touch',
+        'H1Touch',
+        'H2Touch',
+        'L1Break',
+        'L2Break',
+        'H1Break',
+        'H2Break'
+    ]
+
+    for col in level_cols:
+        data[col+'Pct'] = data[col].rolling(100).mean()
+        # data[col+'Pct'] = data[col+'Pct'].shift(-1)
+
+    data['H1BreakTouchPct'] = data['H1Break'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+    data['H2BreakTouchPct'] = data['H2Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+    data['L1BreakTouchPct'] = data['L1Break'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+    data['L2BreakTouchPct'] = data['L2Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+    data['L1TouchRedPct'] = data['L1TouchRed'].rolling(100).sum() / data['L1Touch'].rolling(100).sum()
+    data['H1TouchGreenPct'] = data['H1TouchGreen'].rolling(100).sum() / data['H1Touch'].rolling(100).sum()
+
+    data['H1BreakH2TouchPct'] = data['H2TouchH1Break'].rolling(100).sum() / data['H2Touch'].rolling(100).sum()
+    data['L1BreakL2TouchPct'] = data['L2TouchL1Break'].rolling(100).sum() / data['L2Touch'].rolling(100).sum()
+
+    def get_quintiles(df, col_name, q):
+        return df.groupby(pd.qcut(df[col_name], q))['GreenDay'].mean()
+
+    probas = []
+    # Given the current price level, estimate P(green) from the matching historical decile
+    for i, pct in enumerate(data['CurrentClose30toClose']):
+        try:
+            df_q = get_quintiles(data.iloc[:i], 'HistClose30toPrevClose', 10)
+            for q in df_q.index:
+                if q.left <= pct <= q.right:
+                    p = df_q[q]
+        except:
+            p = None
+
+        probas.append(p)
+
+    # gapfills = []
+    # for i, pct in enumerate(data['CurrentGap']):
+    #     try:
+    #         df_q = get_quintiles(data.iloc[:i], 'CurrentGapHist', 5)
+    #         for q in df_q.index:
+    #             if q.left <= pct <= q.right:
+    #                 p = df_q[q]
+    #     except:
+    #         p = None
+    #     gapfills.append(p)
+
+    data['GreenProbas'] = probas
+    # data['GapFillGreenProba'] = gapfills
+
+    for rid in tqdm(release_ids, desc='Merging econ data'):
+        # Get the name of the release
+        n = releases[rid]['name']
+        # Merge the corresponding DF of the release
+        data = data.merge(releases[rid]['df'], how='left', left_index=True, right_index=True)
+        # Create a column that shifts the value in the merged column up by 1
+        data[f'{n}_shift'] = data[n].shift(-1)
+        # Fill the rest with zeroes
+        data[n] = data[n].fillna(0)
+        data[f'{n}_shift'] = data[f'{n}_shift'].fillna(0)
+
+    data['BigNewsDay'] = data[[x for x in data.columns if '_shift' in x]].max(axis=1)
+
+    def cumul_sum(col):
+        nums = []
+        s = 0
+        for x in col:
+            if x == 1:
+                s += 1
+            elif x == 0:
+                s = 0
+            nums.append(s)
+        return nums
+
+    consec_green = cumul_sum(data['GreenDay'].values)
+    consec_red = cumul_sum(data['RedDay'].values)
+
+    data['DaysGreen'] = consec_green
+    data['DaysRed'] = consec_red
+
+    final_row = data.index[-2]
+
+    exp_row = data.index[-1]  # assigned but unused
+
+    df_final = data.loc[:final_row, model_cols + ['Target', 'Target_clf']]
+    df_final = df_final.dropna(subset=['Target','Target_clf'])
+    # df_final = df_final.dropna(subset=['Target','Target_clf','Perf5Day_n1'])
+    return data, df_final, final_row
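The GreenProbas loop above is an expanding-window empirical probability: for each day, the history so far is split into deciles of HistClose30toPrevClose, and the mean GreenDay rate of the bucket containing today's value becomes the estimate. A self-contained sketch of that idea on synthetic data (the column names 'signal' and 'green' are illustrative stand-ins, not from the repo):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'signal': rng.normal(size=500),         # stand-in for HistClose30toPrevClose
    'green': rng.integers(0, 2, size=500),  # stand-in for GreenDay
})

probas = []
p = None  # carried forward when no bucket matches, mirroring the original loop
for i, x in enumerate(df['signal']):
    try:
        hist = df.iloc[:i]  # history only -- no lookahead
        buckets = hist.groupby(pd.qcut(hist['signal'], 10))['green'].mean()
        for q in buckets.index:
            if q.left <= x <= q.right:
                p = buckets[q]
    except Exception:
        p = None  # not enough history yet to form 10 buckets
    probas.append(p)

df['proba'] = probas
print(df['proba'].tail())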
getIntraData.py
ADDED
@@ -0,0 +1,139 @@
+import pandas as pd
+import yfinance as yf
+import datetime
+# from datasets import load_dataset
+from sqlalchemy import create_engine
+import os
+from getDailyData import data_start_date
+from dotenv import load_dotenv
+
+# Load environment variables from the .env file
+load_dotenv()
+
+def get_intra(periods_30m = 1):
+    '''
+    Get historical 30 minute data and append live data to it, if it exists.
+    '''
+    engine = create_engine(
+        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+    )
+
+    query = f'''SELECT
+        spx30.Datetime AS Datetime,
+        spx30.Open AS Open30,
+        spx30.High AS High30,
+        spx30.Low AS Low30,
+        spx30.Close AS Close30,
+        vix30.Open AS Open_VIX30,
+        vix30.High AS High_VIX30,
+        vix30.Low AS Low_VIX30,
+        vix30.Close AS Close_VIX30,
+        vvix30.Open AS Open_VVIX30,
+        vvix30.High AS High_VVIX30,
+        vvix30.Low AS Low_VVIX30,
+        vvix30.Close AS Close_VVIX30
+    FROM
+        SPX_full_30min AS spx30
+    LEFT JOIN
+        VIX_full_30min AS vix30 ON spx30.Datetime = vix30.Datetime AND vix30.Datetime > '{data_start_date}'
+    LEFT JOIN
+        VVIX_full_30min AS vvix30 ON spx30.Datetime = vvix30.Datetime AND vvix30.Datetime > '{data_start_date}'
+    WHERE
+        spx30.Datetime > '{data_start_date}'
+    '''
+    # spx30 = pd.read_sql_query(f'SELECT * FROM SPX_full_30min WHERE Datetime > {data_start_date}', con=engine)
+    # vix30 = pd.read_sql_query(f'SELECT * FROM VIX_full_30min WHERE Datetime > {data_start_date}', con=engine)
+    # vvix30 = pd.read_sql_query(f'SELECT * FROM VVIX_full_30min WHERE Datetime > {data_start_date}', con=engine)
+    # dfs = []
+
+    df_30m = pd.read_sql_query(sql=query, con=engine.connect())
+    df_30m['Datetime'] = df_30m['Datetime'].dt.tz_localize('America/New_York')
+    df_30m = df_30m.set_index('Datetime', drop=True)
+
+    # for fr in [spx30, vix30, vvix30]:
+    #     # fr['Datetime'] = fr['Datetime'].apply(lambda x: datetime.datetime.strptime(x[:-6], dt_format))
+    #     fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
+    #     fr = fr.set_index('Datetime')
+    #     fr['Open'] = pd.to_numeric(fr['Open'])
+    #     fr['High'] = pd.to_numeric(fr['High'])
+    #     fr['Low'] = pd.to_numeric(fr['Low'])
+    #     fr['Close'] = pd.to_numeric(fr['Close'])
+    #     dfs.append(fr[['Open','High','Low','Close']])
+
+    # df_30m = pd.concat(dfs, axis=1)
+
+    # df_30m.columns = [
+    #     'Open30', 'High30', 'Low30', 'Close30',
+    #     'Open_VIX30', 'High_VIX30', 'Low_VIX30', 'Close_VIX30',
+    #     'Open_VVIX30', 'High_VVIX30', 'Low_VVIX30', 'Close_VVIX30'
+    # ]
+
+    # Get incremental date
+    last_date = df_30m.index.date[-1]
+    last_date = last_date + datetime.timedelta(days=1)
+
+    # Get incremental data for each index
+    spx1 = yf.Ticker('^GSPC')
+    vix1 = yf.Ticker('^VIX')
+    vvix1 = yf.Ticker('^VVIX')
+    yfp = spx1.history(start=last_date, interval='30m')
+    yf_vix = vix1.history(start=last_date, interval='30m')
+    yf_vvix = vvix1.history(start=last_date, interval='30m')
+
+    if len(yfp) > 0:
+        # Convert indexes to EST if not already
+        for _df in [yfp, yf_vix, yf_vvix]:
+            if (_df.index.tz.zone != 'America/New_York') or (type(_df.index) != pd.DatetimeIndex):
+                _df['Datetime'] = pd.to_datetime(_df.index)
+                _df['Datetime'] = _df['Datetime'].dt.tz_convert('America/New_York')
+                _df.set_index('Datetime', inplace=True)
+        # Concat them
+        df_inc = pd.concat([
+            yfp[['Open','High','Low','Close']],
+            yf_vix[['Open','High','Low','Close']],
+            yf_vvix[['Open','High','Low','Close']]
+        ], axis=1)
+        df_inc.columns = df_30m.columns
+        df_inc = df_inc.loc[
+            (df_inc.index.time >= datetime.time(9,30)) & (df_inc.index.time < datetime.time(16,00))
+        ]
+        df_30m = pd.concat([df_30m, df_inc])
+    else:
+        df_30m = df_30m.copy()
+
+    # Keep regular-session bars only, then the first `periods_30m` bars of each day
+    df_30m = df_30m.loc[
+        (df_30m.index.time >= datetime.time(9,30)) & (df_30m.index.time < datetime.time(16,00))
+    ]
+    df_30m['dt'] = df_30m.index.date
+    df_30m = df_30m.groupby('dt').head(periods_30m)
+    df_30m = df_30m.set_index('dt', drop=True)
+    df_30m.index.name = 'Datetime'
+
+    df_30m['SPX30IntraPerf'] = (df_30m['Close30'] / df_30m['Close30'].shift(1)) - 1
+    df_30m['VIX30IntraPerf'] = (df_30m['Close_VIX30'] / df_30m['Close_VIX30'].shift(1)) - 1
+    df_30m['VVIX30IntraPerf'] = (df_30m['Close_VVIX30'] / df_30m['Close_VVIX30'].shift(1)) - 1
+
+    # Collapse the kept bars to one row per session
+    opens_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Open' in c]].head(1)
+    highs_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'High' in c]].max()
+    lows_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Low' in c]].min()
+    closes_intra = df_30m.groupby('Datetime')[[c for c in df_30m.columns if 'Close' in c]].tail(1)
+    spx_intra = df_30m.groupby('Datetime')['SPX30IntraPerf'].tail(1)
+    vix_intra = df_30m.groupby('Datetime')['VIX30IntraPerf'].tail(1)
+    vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
+
+    df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
+    return df_intra
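A small sketch (not part of the commit) of calling get_intra directly, assuming the same database setup; the returned frame has one row per session, aggregated over the first periods_30m half-hour bars:

# Hypothetical direct use of get_intra; assumes DB credentials in .env
from getIntraData import get_intra

df_intra = get_intra(periods_30m=2)  # aggregate the first hour of each session
print(df_intra[['Open30', 'High30', 'Low30', 'Close30', 'SPX30IntraPerf']].tail())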
intraCols.py
ADDED
@@ -0,0 +1,48 @@
+model_cols = [
+    'BigNewsDay',
+    'Quarter',
+    'Perf5Day',
+    'Perf5Day_n1',
+    'DaysGreen',
+    'DaysRed',
+    'CurrentHigh30toClose',
+    'CurrentLow30toClose',
+    'CurrentClose30toClose',
+    'CurrentRange30',
+    'GapFill30',
+    'CurrentGap',
+    'RangePct',
+    'RangePct_n1',
+    'RangePct_n2',
+    'OHLC4_VIX',
+    'OHLC4_VIX_n1',
+    'OHLC4_VIX_n2',
+    'OHLC4_Current_Trend',
+    'OHLC4_Trend',
+    'CurrentVIXTrend',
+    'SPX30IntraPerf',
+    'VIX30IntraPerf',
+    'VVIX30IntraPerf',
+    # 'OpenL1',
+    # 'OpenL2',
+    # 'OpenH1',
+    # 'OpenH2',
+    'L1TouchPct',
+    'L2TouchPct',
+    'H1TouchPct',
+    'H2TouchPct',
+    'L1BreakPct',
+    'L2BreakPct',
+    'H1BreakPct',
+    'H2BreakPct',
+    'GreenProbas',
+    'H1BreakTouchPct',
+    'H2BreakTouchPct',
+    'L1BreakTouchPct',
+    'L2BreakTouchPct',
+    'H1BreakH2TouchPct',
+    'L1BreakL2TouchPct',
+    'H1TouchGreenPct',
+    'L1TouchRedPct'
+    # 'GapFillGreenProba'
+]
lambda_function.py
ADDED
@@ -0,0 +1,17 @@
+# Function should get the data and run the whole model, return a single prediction based on the time
+from getDailyData import get_daily
+from model_intra_v3 import walk_forward_validation
+import json
+
+def lambda_handler(periods_30m):
+    data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
+    res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
+    return json.loads(json.dumps({
+        'date': str(res.index[-1]),
+        'prediction': res['Predicted'].iloc[-1],
+        'time': periods_30m
+    }))
+
+if __name__ == '__main__':
+    j = lambda_handler(1)
+    print(j)
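Note that lambda_handler takes periods_30m directly rather than the (event, context) signature AWS Lambda invokes handlers with. If this were deployed as-is, a thin adapter along these lines would be needed; the 'periods_30m' event key is an assumption, not something defined in this commit:

# Hypothetical AWS-style adapter around the handler above
from lambda_function import lambda_handler

def aws_handler(event, context):
    # 'periods_30m' as an event key is an assumption
    periods_30m = int(event.get('periods_30m', 1))
    return lambda_handler(periods_30m)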
model_day_v2.py
ADDED
@@ -0,0 +1,112 @@
+import pandas as pd
+from tqdm import tqdm
+from sklearn import linear_model
+import lightgbm as lgb
+from dailyCols import model_cols
+
+def walk_forward_validation(df, target_column, num_training_rows, num_periods):
+
+    # Create a linear regression model (an XGBRegressor was used previously)
+    # model = xgb.XGBRegressor(n_estimators=100, objective='reg:squarederror', random_state=42)
+    model = linear_model.LinearRegression()
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='LR Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column, axis=1).iloc[:i]
+        y_train = df[target_column].iloc[:i]
+        X_test = df.drop(target_column, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model.predict(X_test)
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+    # model.save_model('model_lr.bin')
+    # Return the true and predicted values, and fitted model
+    return df_results, model
+
+def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_training_rows, num_periods):
+
+    # Run the regression model first; its output becomes a feature for the classifier
+    res, model1 = walk_forward_validation(df.drop(columns=[target_column_clf]).dropna(), target_column_regr, num_training_rows, num_periods)
+    # joblib.dump(model1, 'model1.bin')
+
+    # Merge the result df back on the df for feeding into the classifier
+    for_merge = res[['Predicted']].copy()
+    for_merge.columns = ['RegrModelOut']
+    for_merge['RegrModelOut'] = for_merge['RegrModelOut'] > 0
+    df = df.merge(for_merge, left_index=True, right_index=True)
+    df = df.drop(columns=[target_column_regr])
+    df = df[model_cols + ['RegrModelOut', target_column_clf]]
+
+    df[target_column_clf] = df[target_column_clf].astype(bool)
+    df['RegrModelOut'] = df['RegrModelOut'].astype(bool)
+
+    # Create the classifier (XGBClassifier / LogisticRegression were used previously)
+    # model2 = xgb.XGBClassifier(n_estimators=10, random_state=42)
+    model2 = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+    # model = linear_model.LogisticRegression(max_iter=1500)
+
+    overall_results = []
+    # Iterate over the rows in the DataFrame, one step at a time
+    for i in tqdm(range(num_training_rows, df.shape[0] - num_periods + 1), desc='CLF Model'):
+        # Split the data into training and test sets
+        X_train = df.drop(target_column_clf, axis=1).iloc[:i]
+        y_train = df[target_column_clf].iloc[:i]
+        X_test = df.drop(target_column_clf, axis=1).iloc[i:i+num_periods]
+        y_test = df[target_column_clf].iloc[i:i+num_periods]
+
+        # Fit the model to the training data
+        model2.fit(X_train, y_train)
+
+        # Make a prediction on the test data
+        predictions = model2.predict_proba(X_test)[:,-1]
+
+        # Create a DataFrame to store the true and predicted values
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+
+        overall_results.append(result_df)
+
+    df_results = pd.concat(overall_results)
+
+    # Calibrate probabilities against the realized hit rate in each predicted-probability bucket
+    def get_quantiles(df, col_name, q):
+        return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+    greenprobas = []
+    meanprobas = []
+    p = None  # initialized so the first iterations cannot raise NameError
+    c = None
+    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
+        try:
+            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+            for q in df_q.index:
+                if q.left <= pct <= q.right:
+                    p = df_q[q]
+                    c = (q.left + q.right) / 2
+        except:
+            p = None
+            c = None
+
+        greenprobas.append(p)
+        meanprobas.append(c)
+
+    df_results['CalibPredicted'] = greenprobas  # meanprobas is computed but not attached
+
+    return df_results, model1, model2
+
+def seq_predict_proba(df, trained_reg_model, trained_clf_model):
+    regr_pred = trained_reg_model.predict(df)
+    regr_pred = regr_pred > 0
+    new_df = df.copy()
+    new_df['RegrModelOut'] = regr_pred
+    clf_pred_proba = trained_clf_model.predict_proba(new_df[model_cols + ['RegrModelOut']])[:,-1]
+    return clf_pred_proba
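A sketch (not part of the commit) of how the two-stage day model might be driven end to end, assuming df_final comes from get_daily in daily mode; the regressor's sign feeds the classifier as RegrModelOut, and the final row would need to be NaN-free for the linear stage:

# Hypothetical end-to-end run of the sequential day model
from getDailyData import get_daily
from dailyCols import model_cols
from model_day_v2 import walk_forward_validation_seq, seq_predict_proba

data, df_final, final_row = get_daily(mode='daily')
res, model1, model2 = walk_forward_validation_seq(
    df_final.dropna(), 'Target_clf', 'Target', num_training_rows=100, num_periods=1
)
print(res[['True', 'Predicted', 'CalibPredicted']].tail())

# Score the most recent, not-yet-resolved row
latest = data[model_cols].iloc[[-1]]
proba = seq_predict_proba(latest, model1, model2)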
model_intra_v2.py
ADDED
@@ -0,0 +1,74 @@
+import pandas as pd
+from tqdm import tqdm
+import lightgbm as lgb
+from sklearn.model_selection import TimeSeriesSplit
+from intraCols import model_cols
+
+def walk_forward_validation(df, target_column, num_periods, mode='full'):
+
+    df = df[model_cols + [target_column]].copy()
+    df[target_column] = df[target_column].astype(bool)
+
+    tscv = TimeSeriesSplit(n_splits=len(df)-1, max_train_size=None, test_size=num_periods)  # n_splits is the number of splits you want
+
+    if mode == 'full':
+        overall_results = []
+        # Iterate over the rows in the DataFrame, one step at a time,
+        # splitting the time series data using TimeSeriesSplit
+        for train_index, test_index in tqdm(tscv.split(df), total=tscv.n_splits):
+            # Extract the training and testing data for the current split
+            X_train = df.drop(target_column, axis=1).iloc[train_index]
+            y_train = df[target_column].iloc[train_index]
+            X_test = df.drop(target_column, axis=1).iloc[test_index]
+            y_test = df[target_column].iloc[test_index]
+
+            y_train = y_train.astype(bool)
+            model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+            model.fit(X_train, y_train)
+            # Make a prediction on the test data
+            predictions = model.predict_proba(X_test)[:,-1]
+
+            # Create a DataFrame to store the true and predicted values
+            result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+            overall_results.append(result_df)
+        df_results = pd.concat(overall_results)
+
+        # Calibrate probabilities against the realized hit rate per predicted bucket
+        def get_quantiles(df, col_name, q):
+            return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+
+        greenprobas = []
+        p = None  # initialized so the first iterations cannot raise NameError
+        for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
+            try:
+                df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+                for q in df_q.index:
+                    if q.left <= pct <= q.right:
+                        p = df_q[q]
+            except:
+                p = None
+
+            greenprobas.append(p)
+
+        df_results['CalibPredicted'] = greenprobas
+
+        return df_results, model
+
+    elif mode == 'single':
+        X_train = df.drop(target_column, axis=1).iloc[:-1]
+        y_train = df[target_column].iloc[:-1]
+        X_test = df.drop(target_column, axis=1).iloc[-1]
+        y_test = df[target_column].iloc[-1]
+        y_train = y_train.astype(bool)
+        model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+        model.fit(X_train, y_train)
+        predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
+        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+
+        return result_df, model
+
+def seq_predict_proba(df, trained_clf_model):
+    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
+    return clf_pred_proba
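A sketch (not part of the commit) of the two modes, assuming df_final comes from get_daily(mode='intra', ...): 'full' walks forward through history and calibrates, while 'single' fits once and scores only the final row:

# Hypothetical driver for the intra model in both modes
from getDailyData import get_daily
from model_intra_v2 import walk_forward_validation

data, df_final, final_row = get_daily(mode='intra', periods_30m=1)
clf_df = df_final.drop(columns=['Target']).dropna()

# Full walk-forward backtest with calibrated probabilities (slow: one model per split)
res_full, model = walk_forward_validation(clf_df, 'Target_clf', num_periods=1, mode='full')
# One-shot fit that scores only the most recent row (the path the Lambda handler uses via v3)
res_one, model = walk_forward_validation(clf_df, 'Target_clf', num_periods=1, mode='single')
print(res_one[['True', 'Predicted']])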
model_intra_v3.py
ADDED
@@ -0,0 +1,24 @@
+import pandas as pd
+import lightgbm as lgb
+from intraCols import model_cols
+
+def walk_forward_validation(df, target_column, num_periods, mode='full'):
+
+    df = df[model_cols + [target_column]].copy()
+    df[target_column] = df[target_column].astype(bool)
+
+    # Fit once on all but the final row, then score only the final row
+    X_train = df.drop(target_column, axis=1).iloc[:-1]
+    y_train = df[target_column].iloc[:-1]
+    X_test = df.drop(target_column, axis=1).iloc[-1]
+    y_test = df[target_column].iloc[-1]
+    y_train = y_train.astype(bool)
+    model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
+    model.fit(X_train, y_train)
+    predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
+    result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+
+    return result_df, model
+
+def seq_predict_proba(df, trained_clf_model):
+    clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
+    return clf_pred_proba
requirements.txt
ADDED
@@ -0,0 +1,15 @@
+pandas
+numpy
+yfinance==0.2.28
+requests
+typing_extensions
+lightgbm
+tqdm
+fastjsonschema
+json5
+jsonschema
+holidays
+pytz
+mysqlclient
+sqlalchemy<2.0
+python-dotenv
+scikit-learn        # imported (as sklearn) by model_day_v2.py and model_intra_v2.py but missing from the original list
+beautifulsoup4      # imported (as bs4) by getData.py but missing from the original list
+pandas_datareader   # imported by getData.py but missing from the original list
+datasets            # used by getData.py but missing from the original list