import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import gradio as gr
import plotly.graph_objects as go
import logging
import traceback
import yfinance as yf

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class PredictiveSystem:
    def __init__(self):
        self.scaler = StandardScaler()
        self.rf_model = None
        self.lstm_model = None
        self.feature_importance = None

    def convert_dates(self, df):
        """Convert the 'date' column to datetime and derive calendar features."""
        try:
            df = df.copy()
            # Try to convert the 'date' column to datetime
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df['date'], errors='coerce')
                # Extract datetime features
                df['month'] = df['date'].dt.month
                df['day'] = df['date'].dt.day
                df['day_of_week'] = df['date'].dt.dayofweek
                df['is_weekend'] = df['date'].dt.dayofweek.isin([5, 6]).astype(int)
                # Drop the original date column
                df = df.drop('date', axis=1)
            return df
        except Exception as e:
            logger.error(f"Error converting dates: {str(e)}")
            raise

    def validate_data(self, df):
        """Validate input data structure and contents."""
        try:
            # Check if the dataframe is empty
            if df.empty:
                raise ValueError("The uploaded file contains no data")
            # Check minimum number of rows
            if len(df) < 30:
                raise ValueError("Dataset must contain at least 30 rows of data")
            # Check minimum number of columns
            if len(df.columns) < 2:
                raise ValueError("Dataset must contain at least 2 columns (features and target)")
            # Convert date columns first, then check for remaining non-numeric columns
            df = self.convert_dates(df)
            non_numeric_cols = df.select_dtypes(exclude=['number']).columns
            if len(non_numeric_cols) > 0:
                raise ValueError(
                    f"Non-numeric columns found after date processing: "
                    f"{', '.join(non_numeric_cols)}. Please ensure all features are numeric."
                )
            return True
        except Exception as e:
            logger.error(f"Data validation error: {str(e)}")
            raise
    def preprocess_data(self, df):
        """Clean and preprocess the data with error handling."""
        try:
            logger.info("Starting data preprocessing...")
            # Convert dates first
            df_processed = self.convert_dates(df)
            # Handle missing values (fillna(method=...) is deprecated in recent pandas)
            missing_count = df_processed.isnull().sum().sum()
            if missing_count > 0:
                logger.info(f"Handling {missing_count} missing values")
                df_processed = df_processed.ffill().bfill()
            # Remove any remaining non-numeric columns
            numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
            df_processed = df_processed[numeric_cols]
            logger.info("Data preprocessing completed successfully")
            return df_processed
        except Exception as e:
            logger.error(f"Error in preprocessing data: {str(e)}")
            raise

    def feature_selection(self, X, y):
        """Select the most important features using a Random Forest."""
        try:
            logger.info("Starting feature selection...")
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
            rf.fit(X, y)
            self.feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': rf.feature_importances_
            }).sort_values('importance', ascending=False)
            # Keep at most the 10 most important features
            selected_features = self.feature_importance['feature'].head(
                min(10, len(X.columns))
            ).tolist()
            logger.info(f"Selected {len(selected_features)} features")
            return X[selected_features]
        except Exception as e:
            logger.error(f"Error in feature selection: {str(e)}")
            raise

    def train_models(self, X, y):
        """Train both Random Forest and LSTM models with error handling."""
        try:
            logger.info("Starting model training...")
            # Split and scale the data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)

            # Train Random Forest
            logger.info("Training Random Forest model...")
            self.rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            self.rf_model.fit(X_train_scaled, y_train)

            # Train LSTM: each sample is treated as a sequence of length 1
            logger.info("Training LSTM model...")
            X_train_lstm = X_train_scaled.reshape(
                (X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
            )
            self.lstm_model = Sequential([
                LSTM(50, activation='relu', input_shape=(1, X_train_scaled.shape[1]),
                     return_sequences=True),
                Dropout(0.2),
                LSTM(50, activation='relu'),
                Dense(1)
            ])
            self.lstm_model.compile(optimizer='adam', loss='mse')
            # Use early stopping on the training loss
            early_stopping = tf.keras.callbacks.EarlyStopping(
                monitor='loss', patience=5, restore_best_weights=True
            )
            self.lstm_model.fit(
                X_train_lstm, y_train,
                epochs=50, batch_size=32, verbose=0,
                callbacks=[early_stopping]
            )

            # Calculate held-out metrics for both models
            rf_pred = self.rf_model.predict(X_test_scaled)
            lstm_pred = self.lstm_model.predict(
                X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))
            ).flatten()
            metrics = {
                'rf_rmse': np.sqrt(mean_squared_error(y_test, rf_pred)),
                'rf_r2': r2_score(y_test, rf_pred),
                'lstm_rmse': np.sqrt(mean_squared_error(y_test, lstm_pred)),
                'lstm_r2': r2_score(y_test, lstm_pred)
            }
            logger.info("Model training completed successfully")
            return metrics
        except Exception as e:
            logger.error(f"Error in model training: {str(e)}")
            raise

    def generate_predictions(self, X):
        """Generate ensemble predictions by averaging both models."""
        try:
            X_scaled = self.scaler.transform(X)
            rf_pred = self.rf_model.predict(X_scaled)
            lstm_pred = self.lstm_model.predict(
                X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
            )
            # Combine predictions (simple average ensemble)
            final_pred = (rf_pred + lstm_pred.flatten()) / 2
            return final_pred
        except Exception as e:
            logger.error(f"Error generating predictions: {str(e)}")
            raise
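
# A minimal usage sketch (not part of the app itself): exercising PredictiveSystem
# directly on a synthetic DataFrame, without the Gradio UI. The column names,
# row count, and random data here are assumptions for illustration only.
def _demo_predictive_system():
    rng = np.random.default_rng(42)
    demo = pd.DataFrame({
        'date': pd.date_range('2023-01-01', periods=100, freq='D'),
        'feature_a': rng.normal(size=100),
        'feature_b': rng.normal(size=100),
        'target': rng.normal(size=100).cumsum(),
    })
    system = PredictiveSystem()
    system.validate_data(demo)
    processed = system.preprocess_data(demo)
    # The target is the last column of the *input* data; the derived date
    # features are appended after it, so select it by name rather than position.
    target_col = demo.columns[-1]
    y = processed[target_col]
    X = processed.drop(columns=[target_col])
    X_selected = system.feature_selection(X, y)
    metrics = system.train_models(X_selected, y)
    predictions = system.generate_predictions(X_selected)
    return metrics, predictions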
def fetch_real_time_data(ticker):
    """Fetch recent stock data using yfinance (period='1d' returns the latest trading day)."""
    try:
        stock = yf.Ticker(ticker)
        data = stock.history(period="1d")
        return data
    except Exception as e:
        logger.error(f"Error fetching real-time data for {ticker}: {str(e)}")
        raise


def create_gradio_interface(predictor):
    def process_and_predict(file, ticker):
        try:
            # Read data
            logger.info("Reading uploaded file...")
            df = pd.read_csv(file.name)

            # Show initial data info
            logger.info(f"Columns in uploaded file: {', '.join(df.columns)}")
            logger.info(f"Data types: {df.dtypes}")

            # Validate and process data
            predictor.validate_data(df)
            df_processed = predictor.preprocess_data(df)

            # Separate features and target. The target is the last column of the
            # uploaded file; select it by name, because preprocessing appends
            # derived date features after it.
            target_col = df.columns[-1]
            y = df_processed[target_col]
            X = df_processed.drop(columns=[target_col])

            # Feature selection and model training
            X_selected = predictor.feature_selection(X, y)
            metrics = predictor.train_models(X_selected, y)

            # Generate predictions
            predictions = predictor.generate_predictions(X_selected)

            # Fetch real-time stock data
            real_time_data = fetch_real_time_data(ticker)

            # Create visualization
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=y, name='Actual', line=dict(color='blue')))
            fig.add_trace(go.Scatter(y=predictions, name='Predicted', line=dict(color='red')))
            fig.add_trace(go.Scatter(y=real_time_data['Close'], name='Real-Time Data',
                                     line=dict(color='green')))
            fig.update_layout(
                title='Actual vs Predicted vs Real-Time Values',
                xaxis_title='Time',
                yaxis_title='Value',
                template='plotly_white'
            )

            # Format output
            output = f"""
Model Performance Metrics:
Random Forest RMSE: {metrics['rf_rmse']:.4f}
Random Forest R²: {metrics['rf_r2']:.4f}
LSTM RMSE: {metrics['lstm_rmse']:.4f}
LSTM R²: {metrics['lstm_r2']:.4f}

Data Processing Summary:
- Total records processed: {len(df)}
- Features selected: {len(X_selected.columns)}
- Date features created: month, day, day_of_week, is_weekend
- Training completed successfully

Real-Time Data Summary:
- Ticker: {ticker}
- Last Close Price: {real_time_data['Close'].iloc[-1]:.2f}
"""
            logger.info("Analysis completed successfully")
            return fig, output
        except Exception as e:
            error_msg = f"""
Error occurred during processing: {str(e)}

Please ensure your data:
1. Is in CSV format
2. Contains a 'date' column (it will be processed automatically)
3. Contains numeric feature columns
4. Has at least 30 rows of data
5. Has both feature columns and a target column
6. Has no corrupted values

Technical details for debugging:
{traceback.format_exc()}
"""
            logger.error(f"Process failed: {str(e)}")
            return None, error_msg

    interface = gr.Interface(
        fn=process_and_predict,
        inputs=[
            gr.File(label="Upload CSV file"),
            gr.Textbox(label="Stock Ticker (e.g., AAPL)")
        ],
        outputs=[
            gr.Plot(label="Predictions Visualization"),
            gr.Textbox(label="Analysis Results", lines=10)
        ],
        title="Predictive & Prescriptive Analytics System",
        description="""
Upload your CSV file containing historical data and enter a stock ticker to fetch real-time data.

Required format:
- A 'date' column in any standard date format
- Numeric feature columns
- A target column (last column)
- At least 30 rows of data

The system will automatically:
- Process the date column into useful features
- Handle any missing values
- Select the most important features
- Train and evaluate the models
- Fetch and display real-time stock data
""",
        examples=[["sample_sales_data.csv", "AAPL"]]
    )
    return interface
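
# Hypothetical convenience helper (an assumption, not part of the original app):
# writes a small CSV in the format the interface expects, matching the
# "sample_sales_data.csv" file referenced in the Gradio examples. The column
# names are illustrative; the last column is the prediction target. Call it
# manually if you need a test file before launching.
def make_sample_csv(path="sample_sales_data.csv", rows=60):
    rng = np.random.default_rng(0)
    sample = pd.DataFrame({
        "date": pd.date_range("2023-01-01", periods=rows, freq="D"),
        "units_sold": rng.integers(10, 200, size=rows),
        "ad_spend": rng.uniform(100.0, 1000.0, size=rows).round(2),
        "revenue": rng.uniform(1000.0, 9000.0, size=rows).round(2),  # target column
    })
    sample.to_csv(path, index=False)
    return path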

# Initialize and launch
if __name__ == "__main__":
    try:
        predictor = PredictiveSystem()
        interface = create_gradio_interface(predictor)
        interface.launch(share=True)
    except Exception as e:
        logger.error(f"Failed to launch interface: {str(e)}")
        raise