Anupam202224 committed on
Commit
08a171c
·
verified ·
1 Parent(s): f627cf8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +345 -0
app.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.preprocessing import StandardScaler
5
+ from sklearn.ensemble import RandomForestRegressor
6
+ from sklearn.metrics import mean_squared_error, r2_score
7
+ import tensorflow as tf
8
+ from tensorflow.keras.models import Sequential
9
+ from tensorflow.keras.layers import LSTM, Dense, Dropout
10
+ import gradio as gr
11
+ import plotly.graph_objects as go
12
+ from datetime import datetime, timedelta
13
+ import warnings
14
+ import logging
15
+ import traceback
16
+ import yfinance as yf
17
+
18
# Set up logging: configure the root logger at INFO and create the
# module-level logger that every function in this file reports through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
21
+
22
class PredictiveSystem:
    """Regression pipeline that averages a Random Forest and a small LSTM.

    The instance holds the fitted scaler and both models, so the methods are
    meant to be called in order: ``validate_data`` -> ``preprocess_data`` ->
    ``feature_selection`` -> ``train_models`` -> ``generate_predictions``.
    Every method logs failures through the module-level ``logger`` and then
    re-raises the original exception.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.rf_model = None  # set by train_models
        self.lstm_model = None  # set by train_models
        self.feature_importance = None  # set by feature_selection

    def convert_dates(self, df):
        """Replace a ``'date'`` column with numeric calendar features.

        Works on a copy; the caller's frame is never modified. When there is
        no ``'date'`` column the copy is returned unchanged. Values that cannot
        be parsed are coerced to ``NaT``.

        The derived columns (``month``, ``day``, ``day_of_week``,
        ``is_weekend``) are inserted at the position the ``'date'`` column
        occupied instead of being appended, so a trailing target column is
        still the last column afterwards.

        Args:
            df: Input DataFrame.

        Returns:
            A new DataFrame without ``'date'`` and with the derived columns.
        """
        try:
            df = df.copy()
            if 'date' not in df.columns:
                return df

            date_pos = list(df.columns).index('date')
            dates = pd.to_datetime(df['date'], errors='coerce')

            # Drop original date column
            df = df.drop('date', axis=1)

            # Extract datetime features
            derived = {
                'month': dates.dt.month,
                'day': dates.dt.day,
                'day_of_week': dates.dt.dayofweek,
                'is_weekend': dates.dt.dayofweek.isin([5, 6]).astype(int),
            }

            insert_at = date_pos
            for name, values in derived.items():
                if name in df.columns:
                    # A same-named input column is overwritten in place,
                    # exactly as plain assignment would do.
                    df[name] = values
                else:
                    df.insert(insert_at, name, values)
                    insert_at += 1

            return df
        except Exception as e:
            logger.error(f"Error converting dates: {str(e)}")
            raise

    def validate_data(self, df):
        """Check that ``df`` is usable for training.

        Args:
            df: Raw DataFrame as read from the upload.

        Returns:
            ``True`` when every check passes.

        Raises:
            ValueError: If the frame is empty, has fewer than 30 rows, has
                fewer than 2 columns, or still contains non-numeric columns
                after the ``'date'`` column has been expanded.
        """
        try:
            # Check if dataframe is empty
            if df.empty:
                raise ValueError("The uploaded file contains no data")

            # Check minimum number of rows
            if len(df) < 30:
                raise ValueError("Dataset must contain at least 30 rows of data")

            # Check for minimum number of columns
            if len(df.columns) < 2:
                raise ValueError("Dataset must contain at least 2 columns (features and target)")

            # First convert date columns
            df = self.convert_dates(df)

            # Now check for remaining non-numeric columns
            non_numeric_cols = df.select_dtypes(exclude=['number']).columns
            if len(non_numeric_cols) > 0:
                raise ValueError(f"Non-numeric columns found after date processing: {', '.join(non_numeric_cols)}. Please ensure all features are numeric.")

            return True

        except Exception as e:
            logger.error(f"Data validation error: {str(e)}")
            raise

    def preprocess_data(self, df):
        """Expand dates, fill gaps and keep only numeric columns.

        Missing values are forward-filled and then back-filled (the back-fill
        covers gaps at the very start of a column).

        Args:
            df: Raw DataFrame.

        Returns:
            A new, numeric-only DataFrame.
        """
        try:
            logger.info("Starting data preprocessing...")

            # Convert dates first
            df_processed = self.convert_dates(df)

            # Handle missing values
            missing_count = df_processed.isnull().sum().sum()
            if missing_count > 0:
                logger.info(f"Handling {missing_count} missing values")
                # .ffill()/.bfill() replace the deprecated fillna(method=...)
                df_processed = df_processed.ffill().bfill()

            # Remove any remaining non-numeric columns
            numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
            df_processed = df_processed[numeric_cols]

            logger.info("Data preprocessing completed successfully")
            return df_processed

        except Exception as e:
            logger.error(f"Error in preprocessing data: {str(e)}")
            raise

    def feature_selection(self, X, y):
        """Keep the (at most) 10 features a Random Forest ranks highest.

        Fits a throwaway ``RandomForestRegressor`` on ``X``/``y`` and stores
        the full importance ranking in ``self.feature_importance``.

        Args:
            X: Feature DataFrame.
            y: Target values aligned with ``X``.

        Returns:
            ``X`` restricted to the selected columns, most important first.
        """
        try:
            logger.info("Starting feature selection...")

            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(X, y)

            self.feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': rf.feature_importances_
            }).sort_values('importance', ascending=False)

            selected_features = self.feature_importance['feature'].head(
                min(10, len(X.columns))
            )

            logger.info(f"Selected {len(selected_features)} features")
            return X[selected_features]

        except Exception as e:
            logger.error(f"Error in feature selection: {str(e)}")
            raise

    def train_models(self, X, y):
        """Fit the scaler, the Random Forest and the LSTM; report test metrics.

        The data is split 80/20 with a fixed seed, and the scaler is fitted on
        the training part only. Each row is fed to the LSTM as a sequence of
        length 1.

        Args:
            X: Feature DataFrame.
            y: Target values aligned with ``X``.

        Returns:
            Dict with keys ``rf_rmse``, ``rf_r2``, ``lstm_rmse`` and
            ``lstm_r2`` computed on the held-out 20 %.
        """
        try:
            logger.info("Starting model training...")

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Scale data
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)

            # Train Random Forest
            logger.info("Training Random Forest model...")
            self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            self.rf_model.fit(X_train_scaled, y_train)

            # Train LSTM: reshape to (samples, timesteps=1, features)
            logger.info("Training LSTM model...")
            X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))

            self.lstm_model = Sequential([
                LSTM(50, activation='relu', input_shape=(1, X_train_scaled.shape[1]), return_sequences=True),
                Dropout(0.2),
                LSTM(50, activation='relu'),
                Dense(1)
            ])

            self.lstm_model.compile(optimizer='adam', loss='mse')

            # Use early stopping; it watches the training loss because no
            # validation data is passed to fit().
            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='loss',
                patience=5,
                restore_best_weights=True
            )

            self.lstm_model.fit(
                X_train_lstm,
                y_train,
                epochs=50,
                batch_size=32,
                verbose=0,
                callbacks=[early_stopping]
            )

            # Calculate metrics
            rf_pred = self.rf_model.predict(X_test_scaled)
            lstm_pred = self.lstm_model.predict(
                X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))
            )

            metrics = {
                'rf_rmse': np.sqrt(mean_squared_error(y_test, rf_pred)),
                'rf_r2': r2_score(y_test, rf_pred),
                'lstm_rmse': np.sqrt(mean_squared_error(y_test, lstm_pred)),
                'lstm_r2': r2_score(y_test, lstm_pred)
            }

            logger.info("Model training completed successfully")
            return metrics

        except Exception as e:
            logger.error(f"Error in model training: {str(e)}")
            raise

    def generate_predictions(self, X):
        """Predict with both models and return their plain average.

        Args:
            X: Features with the same columns that ``train_models`` was
                fitted on.

        Returns:
            1-D array: the mean of the Random Forest and LSTM predictions.

        Raises:
            RuntimeError: If ``train_models`` has not been run yet.
        """
        try:
            if self.rf_model is None or self.lstm_model is None:
                raise RuntimeError(
                    "Models have not been trained yet; call train_models() first"
                )

            X_scaled = self.scaler.transform(X)

            rf_pred = self.rf_model.predict(X_scaled)
            lstm_pred = self.lstm_model.predict(
                X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
            )

            # Combine predictions (ensemble)
            final_pred = (rf_pred + lstm_pred.flatten()) / 2

            return final_pred

        except Exception as e:
            logger.error(f"Error generating predictions: {str(e)}")
            raise
213
+
214
def fetch_real_time_data(ticker):
    """Fetch the latest one-day price history for ``ticker`` via yfinance.

    Args:
        ticker: Stock symbol such as ``"AAPL"``. Surrounding whitespace is
            ignored.

    Returns:
        The DataFrame returned by ``yf.Ticker(...).history(period="1d")``.

    Raises:
        ValueError: If ``ticker`` is blank or not a string, or if the lookup
            returns no rows (callers index the last row, so an empty frame
            would only fail later with a less helpful error).
        Exception: Any error raised by the lookup is logged and re-raised.
    """
    symbol = ticker.strip() if isinstance(ticker, str) else ""
    if not symbol:
        raise ValueError("A stock ticker symbol is required (e.g., AAPL)")

    try:
        stock = yf.Ticker(symbol)
        data = stock.history(period="1d")
        if data.empty:
            raise ValueError(f"No market data returned for ticker '{symbol}'")
        return data
    except Exception as e:
        logger.error(f"Error fetching real-time data for {ticker}: {str(e)}")
        raise
223
+
224
def create_gradio_interface(predictor):
    """Build the Gradio UI around ``predictor``.

    Args:
        predictor: Object providing ``validate_data``, ``preprocess_data``,
            ``feature_selection``, ``train_models`` and
            ``generate_predictions`` (a ``PredictiveSystem``).

    Returns:
        A ``gr.Interface`` taking a CSV upload and a ticker symbol and
        producing a Plotly figure and a text report.
    """

    def process_and_predict(file, ticker):
        """Run the whole pipeline for one upload.

        Returns ``(figure, report_text)`` on success. On any failure the
        exception is logged and ``(None, error_text)`` is returned, so the UI
        shows the problem instead of crashing.
        """
        try:
            if file is None:
                raise ValueError("No file was uploaded")

            # Read data
            logger.info("Reading uploaded file...")
            # The upload may arrive as an object exposing the temp path via
            # ``.name`` or as a plain path string, depending on the Gradio
            # version/configuration; accept both.
            csv_path = getattr(file, "name", file)
            df = pd.read_csv(csv_path)

            # Show initial data info
            logger.info(f"Columns in uploaded file: {', '.join(str(col) for col in df.columns)}")
            logger.info(f"Data types: {df.dtypes}")

            # Validate and process data
            predictor.validate_data(df)
            df_processed = predictor.preprocess_data(df)

            # Separate features and target. The target is the last column of
            # the *uploaded* file, looked up by name: preprocessing can add
            # date-derived columns, so the last processed column is not
            # guaranteed to be the target.
            target_col = df.columns[-1]
            if target_col not in df_processed.columns:
                raise ValueError(
                    f"Target column '{target_col}' (the last column) must be numeric "
                    "and must not be the 'date' column"
                )
            y = df_processed[target_col]
            X = df_processed.drop(columns=[target_col])

            # Feature selection and model training
            X_selected = predictor.feature_selection(X, y)
            metrics = predictor.train_models(X_selected, y)

            # Generate predictions
            predictions = predictor.generate_predictions(X_selected)

            # Fetch real-time stock data
            real_time_data = fetch_real_time_data(ticker)
            if real_time_data.empty:
                # The report below reads the last close, so fail clearly here.
                raise ValueError(f"No real-time data returned for ticker '{ticker}'")

            # Create visualization
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=y, name='Actual', line=dict(color='blue')))
            fig.add_trace(go.Scatter(y=predictions, name='Predicted', line=dict(color='red')))
            fig.add_trace(go.Scatter(y=real_time_data['Close'], name='Real-Time Data', line=dict(color='green')))
            fig.update_layout(
                title='Actual vs Predicted vs Real-Time Values',
                xaxis_title='Time',
                yaxis_title='Value',
                template='plotly_white'
            )

            # Format output
            output = f"""
            Model Performance Metrics:
            Random Forest RMSE: {metrics['rf_rmse']:.4f}
            Random Forest R²: {metrics['rf_r2']:.4f}
            LSTM RMSE: {metrics['lstm_rmse']:.4f}
            LSTM R²: {metrics['lstm_r2']:.4f}

            Data Processing Summary:
            - Total records processed: {len(df)}
            - Features selected: {len(X_selected.columns)}
            - Date features created: month, day, day_of_week, is_weekend
            - Training completed successfully

            Real-Time Data Summary:
            - Ticker: {ticker}
            - Last Close Price: {real_time_data['Close'].iloc[-1]:.2f}
            """

            logger.info("Analysis completed successfully")
            return fig, output

        except Exception as e:
            error_msg = f"""
            Error occurred during processing:
            {str(e)}

            Please ensure your data:
            1. Is in CSV format
            2. Contains a 'date' column (will be automatically processed)
            3. Contains numeric feature columns
            4. Has at least 30 rows of data
            5. Has both feature columns and a target column
            6. Has no corrupted values

            Technical details for debugging:
            {traceback.format_exc()}
            """
            logger.error(f"Process failed: {str(e)}")
            return None, error_msg

    interface = gr.Interface(
        fn=process_and_predict,
        inputs=[
            gr.File(label="Upload CSV file"),
            gr.Textbox(label="Stock Ticker (e.g., AAPL)")
        ],
        outputs=[
            gr.Plot(label="Predictions Visualization"),
            gr.Textbox(label="Analysis Results", lines=10)
        ],
        title="Predictive & Prescriptive Analytics System",
        description="""
        Upload your CSV file containing historical data and enter a stock ticker to fetch real-time data.
        Required format: Furtur Any contact Anupam Joshi 91-9878255748 @ joshianupam32@gmail.com
        - A 'date' column in any standard date format
        - Numeric feature columns
        - A target column (last column)
        - At least 30 rows of data

        The system will automatically:
        - Process the date column into useful features
        - Handle any missing values
        - Select the most important features
        - Train and evaluate the models
        - Fetch and display real-time stock data
        """,
        examples=[["sample_sales_data.csv", "AAPL"]]
    )

    return interface
336
+
337
# Initialize and launch: only when executed as a script, so importing this
# module never starts a server.
if __name__ == "__main__":
    try:
        predictor = PredictiveSystem()
        interface = create_gradio_interface(predictor)
        # share=True asks Gradio to also create a public share link.
        interface.launch(share=True)
    except Exception as e:
        logger.error(f"Failed to launch interface: {str(e)}")
        raise