wnstnb committed
Commit c219cd7 · 1 Parent(s): 637fab8

DB functions and automatic time-of-day handling

Files changed (6)
  1. dbConn.py +42 -0
  2. getDailyData.py +1 -17
  3. getIntraData.py +2 -46
  4. lambda_function.py +66 -9
  5. model_day_v2.py +12 -7
  6. model_intra_v2.py +17 -11
dbConn.py ADDED
@@ -0,0 +1,42 @@
+from dotenv import load_dotenv
+from sqlalchemy import create_engine
+import pandas as pd
+import os
+import MySQLdb
+load_dotenv()
+
+engine = create_engine(
+    f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
+    f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
+    f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
+)
+
+connection = MySQLdb.connect(
+    host=os.getenv("DATABASE_HOST"),
+    user=os.getenv("DATABASE_USERNAME"),
+    passwd=os.getenv("DATABASE_PASSWORD"),
+    db=os.getenv("DATABASE"),
+    autocommit=True,
+    ssl_mode="VERIFY_IDENTITY",
+    ssl={"ca": "ca-certificates.crt"}
+)
+
+# Upsert a dataframe into a table: insert each row, or update it when the key already exists
+def insert_dataframe_to_sql(table_name, dataframe, cursor):
+    # Build the parameterized INSERT statement from the dataframe's columns
+    placeholders = ', '.join(['%s'] * len(dataframe.columns))
+    columns = ', '.join(dataframe.columns)
+
+    # ON DUPLICATE KEY UPDATE clause so re-runs overwrite rows instead of erroring
+    update_columns = ', '.join([f"{col} = VALUES({col})" for col in dataframe.columns])
+
+    sql = f"""INSERT INTO {table_name} ({columns}) VALUES ({placeholders})
+              ON DUPLICATE KEY UPDATE {update_columns}"""
+
+    # Convert the dataframe to a list of tuples, replacing NaN with None so MySQL stores NULL
+    data = [tuple(row) if not any(pd.isna(val) for val in row)
+            else tuple(None if pd.isna(val) else val for val in row)
+            for row in dataframe.values]
+
+    # executemany runs the statement once per row of parameters
+    cursor.executemany(sql, data)
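
For context, a minimal usage sketch of the new upsert helper as the rest of this commit uses it. The frame contents below are made up, and the target table is assumed to have a primary or unique key so the ON DUPLICATE KEY UPDATE branch can fire:

```python
import pandas as pd
from dbConn import connection, insert_dataframe_to_sql

# Hypothetical one-row result; column names must match the target table's columns
df_write = pd.DataFrame({'Datetime': ['2023-11-01 10:00:00'],
                         'Predicted': [0.62],
                         'ModelNum': [1]})

cursor = connection.cursor()
insert_dataframe_to_sql('results', df_write, cursor)  # INSERT ... ON DUPLICATE KEY UPDATE
cursor.close()
```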
getDailyData.py CHANGED
@@ -2,11 +2,8 @@ import pandas as pd
 import numpy as np
 import yfinance as yf
 from tqdm import tqdm
-import os
 from pandas.tseries.offsets import BDay
-from sqlalchemy import create_engine
-from dotenv import load_dotenv
-load_dotenv()
+from dbConn import engine
 
 data_start_date = '2018-07-01'
 
@@ -21,13 +18,6 @@ def get_daily(mode='daily', periods_30m=None):
     vvix = yf.Ticker('^VVIX')
     spx = yf.Ticker('^GSPC')
 
-    # Grab data from db
-    engine = create_engine(
-        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
-        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
-        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
-    )
-
     query = f'''SELECT
         spx.Datetime AS Datetime,
         spx.Open AS Open,
@@ -266,12 +256,6 @@ def get_daily(mode='daily', periods_30m=None):
 
     data['GreenProbas'] = probas
 
-    engine = create_engine(
-        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
-        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
-        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
-    )
-
     df_releases = pd.read_sql_query('select * from releases', con=engine)
     df_releases = df_releases.set_index('Datetime')
     data = data.merge(df_releases, how = 'left', left_index=True, right_index=True)
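
The read that survives at the bottom of this file left-joins the `releases` table onto the daily frame by its Datetime index. A toy sketch of that merge pattern, with invented column names and values:

```python
import pandas as pd

# Stand-ins for `data` and the `releases` table
data = pd.DataFrame({'Close': [4500.0, 4510.0]},
                    index=pd.to_datetime(['2023-11-01', '2023-11-02']))
df_releases = pd.DataFrame({'CPI': [1]},
                           index=pd.to_datetime(['2023-11-02']))

# how='left' keeps every trading day; days without a release get NaN flags
merged = data.merge(df_releases, how='left', left_index=True, right_index=True)
```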
getIntraData.py CHANGED
@@ -1,24 +1,13 @@
 import pandas as pd
 import yfinance as yf
 import datetime
-# from datasets import load_dataset
-from sqlalchemy import create_engine
-import os
 from getDailyData import data_start_date
-from dotenv import load_dotenv
-
-# Load environment variables from the .env file
-load_dotenv()
+from dbConn import engine
 
 def get_intra(periods_30m = 1):
     '''
     Method to get historical 30 minute data and append live data to it, if exists.
     '''
-    engine = create_engine(
-        f"mysql+mysqldb://{os.getenv('DATABASE_USERNAME')}:" \
-        f"{os.getenv('DATABASE_PASSWORD')}@{os.getenv('DATABASE_HOST')}/" \
-        f"{os.getenv('DATABASE')}?ssl_ca=ca-certificates.crt&ssl_mode=VERIFY_IDENTITY"
-    )
 
     query = f'''SELECT
         spx30.Datetime AS Datetime,
@@ -44,42 +33,11 @@ def get_intra(periods_30m = 1):
         spx30.Datetime > '{data_start_date}'
 
    '''
-    # spx30 = pd.read_sql_query(f'SELECT * FROM SPX_full_30min WHERE Datetime > {data_start_date}', con=engine)
-    # vix30 = pd.read_sql_query(f'SELECT * FROM VIX_full_30min WHERE Datetime > {data_start_date}', con=engine)
-    # vvix30 = pd.read_sql_query(f'SELECT * FROM VVIX_full_30min WHERE Datetime > {data_start_date}', con=engine)
-    # dfs = []
 
     df_30m = pd.read_sql_query(sql=query, con=engine.connect())
     df_30m['Datetime'] = df_30m['Datetime'].dt.tz_localize('America/New_York')
     df_30m = df_30m.set_index('Datetime',drop=True)
 
-    # for fr in [spx30, vix30, vvix30]:
-    #     # fr['Datetime'] = fr['Datetime'].apply(lambda x: datetime.datetime.strptime(x[:-6], dt_format))
-    #     fr['Datetime'] = fr['Datetime'].dt.tz_localize('America/New_York')
-    #     fr = fr.set_index('Datetime')
-    #     fr['Open'] = pd.to_numeric(fr['Open'])
-    #     fr['High'] = pd.to_numeric(fr['High'])
-    #     fr['Low'] = pd.to_numeric(fr['Low'])
-    #     fr['Close'] = pd.to_numeric(fr['Close'])
-    #     dfs.append(fr[['Open','High','Low','Close']])
-
-    # df_30m = pd.concat(dfs, axis=1)
-
-    # df_30m.columns = [
-    #     'Open30',
-    #     'High30',
-    #     'Low30',
-    #     'Close30',
-    #     'Open_VIX30',
-    #     'High_VIX30',
-    #     'Low_VIX30',
-    #     'Close_VIX30',
-    #     'Open_VVIX30',
-    #     'High_VVIX30',
-    #     'Low_VVIX30',
-    #     'Close_VVIX30'
-    # ]
-
     # Get incremental date
     last_date = df_30m.index.date[-1]
     last_date = last_date + datetime.timedelta(days=1)
@@ -134,6 +92,4 @@ def get_intra(periods_30m = 1):
     vvix_intra = df_30m.groupby('Datetime')['VVIX30IntraPerf'].tail(1)
 
     df_intra = pd.concat([opens_intra, highs_intra, lows_intra, closes_intra, spx_intra, vix_intra, vvix_intra], axis=1)
-    return df_intra
-
-
+    return df_intra
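Worth noting on the kept lines: the DB returns naive timestamps, which get localized to America/New_York before the function works out the next date to fetch live bars for. A minimal sketch of that localize-then-increment step, with invented data:

```python
import datetime
import pandas as pd

# Naive half-hour bars as they come back from the DB
df_30m = pd.DataFrame(
    {'Close30': [4500.0, 4505.0]},
    index=pd.to_datetime(['2023-11-01 09:30:00', '2023-11-01 10:00:00']),
)
df_30m.index = df_30m.index.tz_localize('America/New_York')

# First date not yet covered by the stored history
last_date = df_30m.index.date[-1] + datetime.timedelta(days=1)
print(last_date)  # 2023-11-02
```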
lambda_function.py CHANGED
@@ -1,17 +1,74 @@
 # Function should get the data and run the whole model, return a single prediction based on the time
 from getDailyData import get_daily
 from model_intra_v3 import walk_forward_validation
+from model_day_v2 import walk_forward_validation_seq as walk_forward_validation_daily
+from datetime import datetime, time, timedelta
+import pandas as pd
 import json
+from dbConn import connection, engine, insert_dataframe_to_sql
+import numpy as np
+from pandas.tseries.offsets import BDay
 
 def lambda_handler(periods_30m):
-    data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
-    res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
-    return json.loads(json.dumps({
-        'date': str(res.index[-1]),
-        'prediction': res['Predicted'].iloc[-1],
-        'time':periods_30m
-    }))
+    if periods_30m > 0:
+        # Intraday model: features include the first `periods_30m` half-hour bars
+        data, df_final, final_row = get_daily(mode='intra', periods_30m=periods_30m)
+        res, _ = walk_forward_validation(df_final.drop(columns=['Target']).dropna(), 'Target_clf', 1, mode='single')
+
+    elif periods_30m == 0:
+        # Daily model: run with no intraday bars
+        data, df_final, final_row = get_daily()
+        res, _, _ = walk_forward_validation_daily(df_final.dropna(), 'Target_clf', 'Target', 200, 1)
+
+    # Pull this model's historical results to calibrate the probability and compute a p-value
+    df_results = pd.read_sql_query(f'select * from results where ModelNum = {str(periods_30m)}', con=engine)
+
+    # Calibrated probability: empirical hit rate of the decile this prediction falls into
+    def get_quantiles(df, col_name, q):
+        return df.groupby(pd.cut(df[col_name], q))['IsTrue'].mean()
+
+    pct = res['Predicted'].iloc[-1]
+
+    p = None  # stays None if pct falls outside every historical bin
+    df_q = get_quantiles(df_results, 'Predicted', 10)
+    for q in df_q.index:
+        if q.left <= pct <= q.right:
+            p = df_q[q]
+
+    # P-value: share of past predictions at least as far from 0.5 as this one
+    calib_scores = np.abs(df_results['Predicted'].iloc[:-1] - 0.5)
+    score = abs(pct - 0.5)
+    pv = np.mean(calib_scores >= score)
+    asof = datetime.combine(data.index[-1], time(9, 30)) + (periods_30m * timedelta(minutes=30))
+
+    blob = {
+        'Datetime': str(res.index[-1]),
+        'IsTrue': df_final['Target_clf'].iloc[-1],
+        'Predicted': pct,
+        'CalibPredicted': p,
+        'Pvalue': pv,
+        'ModelNum': periods_30m,
+        'AsOf': str(asof)
+    }
+
+    # Write to DB
+    df_write = pd.DataFrame.from_dict({k: [v] for k, v in blob.items()})
+    cursor = connection.cursor()
+    insert_dataframe_to_sql('results', df_write, cursor)
+    cursor.close()
+    connection.close()
+
+    return json.loads(json.dumps(blob))
 
 if __name__ == '__main__':
-    j = lambda_handler(1)
-    print(j)
+    # Based on the time of day, pick which intraday model (1-12) to run
+    now = datetime.now()
+    morning_start = datetime.combine(now.date(), time(6, 30))
+    delta = now - morning_start
+    intervals = int(max(1, min(delta.total_seconds() // (60 * 30), 12)))
+    print(f'running for {intervals}')
+    j = lambda_handler(intervals)
+    # times_list = np.arange(0,13)
+    # for i in times_list:
+    #     j = lambda_handler(i)
+    #     print(j)
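
A quick check of the new scheduling arithmetic (the clock reading is illustrative, and this assumes the process clock sits in the timezone the 6:30 anchor was written for): 215 minutes after 6:30 is seven completed half-hour intervals, so the seventh intraday model runs.

```python
from datetime import datetime, time

now = datetime(2023, 11, 1, 10, 5)  # hypothetical run at 10:05
morning_start = datetime.combine(now.date(), time(6, 30))
delta = now - morning_start

# Completed 30-minute intervals since 6:30, clamped to the 1..12 model range
intervals = int(max(1, min(delta.total_seconds() // (60 * 30), 12)))
assert intervals == 7
```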
model_day_v2.py CHANGED
@@ -3,6 +3,7 @@ from tqdm import tqdm
 from sklearn import linear_model
 import lightgbm as lgb
 from dailyCols import model_cols
+import numpy as np
 
 def walk_forward_validation(df, target_column, num_training_rows, num_periods):
 
@@ -84,22 +85,26 @@ def walk_forward_validation_seq(df, target_column_clf, target_column_regr, num_t
         return df.groupby(pd.cut(df[col_name], q))['True'].mean()
 
     greenprobas = []
-    meanprobas = []
-    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas'):
+    pvals = []
+    for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas', total=len(df_results)):
         try:
-            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+            df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 10)
             for q in df_q.index:
                 if q.left <= pct <= q.right:
                     p = df_q[q]
-                    c = (q.left + q.right) / 2
+
+            calib_scores = np.abs(df_results['Predicted'].iloc[:i] - 0.5)
+            score = abs(df_results['Predicted'].iloc[i] - 0.5)
+            pv = np.mean(calib_scores >= score)
         except:
             p = None
-            c = None
+            pv = None
 
         greenprobas.append(p)
-        meanprobas.append(c)
-
+        pvals.append(pv)
+
     df_results['CalibPredicted'] = greenprobas
+    df_results['Pvalue'] = pvals
 
     return df_results, model1, model2
 
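The `Pvalue` column added in both model files is an empirical tail probability: the share of earlier predictions at least as far from the 0.5 coin-flip line as the current one, so small values flag unusually confident predictions. A toy computation with invented numbers:

```python
import numpy as np

past_preds = np.array([0.52, 0.61, 0.45, 0.70, 0.58])
current = 0.64

calib_scores = np.abs(past_preds - 0.5)  # distances from 0.5: .02, .11, .05, .20, .08
score = abs(current - 0.5)               # 0.14
pv = np.mean(calib_scores >= score)      # 0.2: only one past prediction was as extreme
```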
model_intra_v2.py CHANGED
@@ -1,4 +1,5 @@
 import pandas as pd
+import numpy as np
 from tqdm import tqdm
 import lightgbm as lgb
 from sklearn.model_selection import TimeSeriesSplit
@@ -29,29 +30,36 @@ def walk_forward_validation(df, target_column, num_periods, mode='full'):
             predictions = model.predict_proba(X_test)[:,-1]
 
             # Create a DataFrame to store the true and predicted values
-            result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=y_test.index)
+            result_df = pd.DataFrame({'IsTrue': y_test, 'Predicted': predictions}, index=y_test.index)
             overall_results.append(result_df)
-        df_results = pd.concat(overall_results)
 
+        df_results = pd.concat(overall_results)
+
         # Calibrate Probabilities
         def get_quantiles(df, col_name, q):
-            return df.groupby(pd.cut(df[col_name], q))['True'].mean()
+            return df.groupby(pd.cut(df[col_name], q))['IsTrue'].mean()
 
         greenprobas = []
+        pvals = []
         for i, pct in tqdm(enumerate(df_results['Predicted']), desc='Calibrating Probas',total=len(df_results)):
             try:
-                df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 7)
+                df_q = get_quantiles(df_results.iloc[:i], 'Predicted', 10)
                 for q in df_q.index:
                     if q.left <= pct <= q.right:
                         p = df_q[q]
+
+                calib_scores = np.abs(df_results['Predicted'].iloc[:i] - 0.5)
+                score = abs(df_results['Predicted'].iloc[i] - 0.5)
+                pv = np.mean(calib_scores >= score)
             except:
                 p = None
+                pv = None
 
             greenprobas.append(p)
-
+            pvals.append(pv)
+
         df_results['CalibPredicted'] = greenprobas
-
-        return df_results, model
+        df_results['Pvalue'] = pvals
 
     elif mode == 'single':
         X_train = df.drop(target_column, axis=1).iloc[:-1]
@@ -62,13 +70,11 @@ def walk_forward_validation(df, target_column, num_periods, mode='full'):
         model = lgb.LGBMClassifier(n_estimators=10, random_state=42, verbosity=-1)
         model.fit(X_train, y_train)
         predictions = model.predict_proba(X_test.values.reshape(1, -1))[:,-1]
-        result_df = pd.DataFrame({'True': y_test, 'Predicted': predictions}, index=[df.index[-1]])
+        df_results = pd.DataFrame({'IsTrue': y_test, 'Predicted': predictions}, index=[df.index[-1]])
 
-        return result_df, model
+    return df_results, model
 
 
-
-
 def seq_predict_proba(df, trained_clf_model):
     clf_pred_proba = trained_clf_model.predict_proba(df[model_cols])[:,-1]
     return clf_pred_proba
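
And the calibration itself, which these diffs move from 7 to 10 bins: raw classifier outputs are replaced by the historical hit rate of the bin they fall into. A self-contained toy version on synthetic data:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame({'Predicted': rng.uniform(0, 1, 500)})
# Synthetic outcomes that loosely track the predicted probability
df['IsTrue'] = (rng.uniform(0, 1, 500) < df['Predicted']).astype(int)

# Empirical hit rate within each of 10 equal-width prediction bins
calib = df.groupby(pd.cut(df['Predicted'], 10))['IsTrue'].mean()

# Map a raw prediction onto its bin's historical hit rate
pct = 0.63
for q in calib.index:
    if q.left <= pct <= q.right:
        calibrated = calib[q]
```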