Laura Cabayol Garcia committed
Commit 1253e28 · 1 Parent(s): 3eaa4ae
Setup training and validation
Files changed:
- VitalMetrics/config.py +14 -11
- VitalMetrics/features.py +84 -17
- VitalMetrics/modeling/predict.py +50 -14
- VitalMetrics/modeling/train.py +77 -16
VitalMetrics/config.py
CHANGED
@@ -12,21 +12,24 @@ logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
 
 DATA_DIR = PROJ_ROOT / "data"
 RAW_DATA_DIR = DATA_DIR / "raw"
-INTERIM_DATA_DIR = DATA_DIR / "interim"
 PROCESSED_DATA_DIR = DATA_DIR / "processed"
+PREDICTED_DATA_DIR = DATA_DIR / "predictions"
 
 MODELS_DIR = PROJ_ROOT / "models"
 
 REPORTS_DIR = PROJ_ROOT / "reports"
 FIGURES_DIR = REPORTS_DIR / "figures"
 
-# If tqdm is installed, configure loguru with tqdm.write
-# https://github.com/Delgan/loguru/issues/135
-try:
-    from tqdm import tqdm
+# Model parameters for different classifiers
+MODEL_PARAMS = {
+    'n_estimators': 100,
+    'max_depth': 5,
+    'learning_rate': 0.1,
+    'solver': 'lbfgs',
+    'max_iter': 200,
+    'C': 1.0,
+    'kernel': 'rbf',
+    'gamma': 'scale',
+    'n_neighbors': 5,
+    'random_state': 42
+}
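The commit routes predictions to a new directory (PREDICTED_DATA_DIR) but none of the changed files create it. A minimal, hedged setup sketch, assuming the directory has to exist before predict.py writes into it:

# Assumed setup step, not part of this commit: create data/predictions so
# predict.py can write test_predictions.csv into it.
from VitalMetrics.config import PREDICTED_DATA_DIR

PREDICTED_DATA_DIR.mkdir(parents=True, exist_ok=True)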
VitalMetrics/features.py
CHANGED
@@ -1,28 +1,95 @@
 from pathlib import Path
+import pandas as pd
 import typer
 from loguru import logger
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.model_selection import train_test_split
-from VitalMetrics.config import PROCESSED_DATA_DIR
+from VitalMetrics.config import RAW_DATA_DIR, PROCESSED_DATA_DIR
 
 app = typer.Typer()
 
 @app.command()
 def main(
+    # Paths for input and output files
+    input_path: Path = RAW_DATA_DIR / "palmer-penguins-dataset-for-eda/penguins.csv",
+    train_output_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
+    test_output_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    test_size: float = 0.2,  # Size of the test set (20% by default)
+    random_state: int = 42,  # Reproducibility
+    verbose: bool = True  # Option to print out progress
 ):
+    """Function to load dataset, split into train/test, generate features, and save to output files."""
+
+    # Load dataset
+    logger.info(f"Loading dataset from {input_path}...")
+    try:
+        df = pd.read_csv(input_path)
+        logger.success("Dataset loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load dataset: {e}")
+        return
+
+    # Split the dataset into train and test sets
+    logger.info(f"Splitting the dataset into train/test with test size {test_size}...")
+    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
+    if verbose:
+        logger.info(f"Train set: {len(train_df)} samples, Test set: {len(test_df)} samples.")
+
+    # Feature engineering (encoding and scaling)
+    train_df, test_df = feature_engineering(train_df, test_df, verbose)
+
+    # Save the transformed features
+    logger.info(f"Saving processed training features to {train_output_path}...")
+    try:
+        train_df.to_csv(train_output_path, index=False)
+        logger.success("Training features saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save training features: {e}")
+
+    logger.info(f"Saving processed test features to {test_output_path}...")
+    try:
+        test_df.to_csv(test_output_path, index=False)
+        logger.success("Test features saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save test features: {e}")
+
+
+def feature_engineering(train_df: pd.DataFrame, test_df: pd.DataFrame, verbose: bool = True) -> (pd.DataFrame, pd.DataFrame):
+    """Performs feature engineering on the train and test datasets."""
+
+    if verbose:
+        logger.info("Starting feature engineering...")
+
+    # Drop rows with missing values
+    train_df.dropna(inplace=True)
+    test_df.dropna(inplace=True)
+
+    # Define columns to delete, encode, and scale
+    columns_to_delete = ['sex', 'year']
+    columns_to_encode = ['species', 'island']
+    features_to_scale = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']  # Numeric features to scale
+
+    # Delete columns
+    train_df.drop(columns=columns_to_delete, inplace=True)
+    test_df.drop(columns=columns_to_delete, inplace=True)
+
+    # Encoding categorical columns (using LabelEncoder)
+    logger.info("Applying label encoding on categorical features...")
+    LabEnc_mapping = {}
+    for col in columns_to_encode:
+        label_encoder = LabelEncoder()
+        train_df[col] = label_encoder.fit_transform(train_df[col].values)
+        test_df[col] = label_encoder.transform(test_df[col].values)
+        LabEnc_mapping[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
+
+    # Scaling numerical columns (fit only on the training set)
+    scaler = StandardScaler()
+    train_df[features_to_scale] = scaler.fit_transform(train_df[features_to_scale])
+    test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])  # Transform the test set using the same scaler
+
+    if verbose:
+        logger.info("Feature engineering completed.")
+
+    return train_df, test_df
 
 
 if __name__ == "__main__":
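A note on the new feature_engineering(): the LabelEncoder is fitted on the training split only and reused on the test split, and LabelEncoder.transform raises a ValueError for any category the training split never contained. A minimal sketch of that behaviour with toy labels (not data from this commit):

# Illustration only, assumed toy labels: an encoder fitted on the train split
# cannot transform a category it has never seen.
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
enc.fit(["Adelie", "Gentoo"])                # categories seen during training
print(enc.transform(["Gentoo", "Adelie"]))   # [1 0]
try:
    enc.transform(["Chinstrap"])             # unseen category
except ValueError as err:
    print(f"ValueError: {err}")

With a random 80/20 split of the Palmer penguins data, every species and island normally appears in both splits, so this is a failure mode to watch only if the raw CSV changes.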
VitalMetrics/modeling/predict.py
CHANGED
@@ -1,30 +1,66 @@
 from pathlib import Path
 import typer
+import pandas as pd
+import mlflow
+import mlflow.sklearn
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
 from loguru import logger
 from tqdm import tqdm
 
-from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR
+from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR
 
 app = typer.Typer()
 
 @app.command()
 def main(
+    # ---- Define input and output paths ----
+    features_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    model_name: str = "PENGUINS_CLASSIFIER",
+    model_version: int = 16,
+    predictions_path: Path = PREDICTED_DATA_DIR / "test_predictions.csv",
     # -----------------------------------------
 ):
+    # Load the model from MLflow
+    logger.info(f"Loading model '{model_name}' version {model_version}...")
+    model_uri = f"models:/{model_name}/{model_version}"
+
+    # Start MLflow run for logging (optional)
+    mlflow.start_run()
+
+    # Load the features for prediction
+    logger.info(f"Loading features from {features_path}...")
+    try:
+        df = pd.read_csv(features_path)
+        logger.success("Features loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load features: {e}")
+        return
+
+    # Make predictions
+    logger.info("Making predictions...")
+    # Load the model using MLflow
+    model = mlflow.sklearn.load_model(model_uri)
+
+    # Prepare features for prediction (drop unnecessary columns)
+    features = df.drop(columns=['id', 'species'])
+
+    # Make predictions with the logged model
+    predictions = model.predict(features)
+    accuracy = model.score(features.values, df.species)
+    logger.info(f"The accuracy in the predictions is {accuracy}")
+
+    # Save predictions to a CSV file
+    logger.info(f"Saving predictions to {predictions_path}...")
+    try:
+        pd.DataFrame(predictions, columns=["predicted_species"]).to_csv(predictions_path, index=False)
+        logger.success("Predictions saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save predictions: {e}")
 
+    # Log predictions as an artifact
+    mlflow.log_artifact(predictions_path)
+    # End MLflow run
+    mlflow.end_run()
 
 if __name__ == "__main__":
     app()
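Because features.py label-encodes the 'species' column before the test features are written, the predictions saved by predict.py are integer class codes rather than species names. A hedged decoding sketch; the mapping below assumes LabelEncoder's alphabetical ordering of the three Palmer penguins species and is not something this commit saves to disk:

# Assumed mapping (LabelEncoder sorts classes alphabetically): 0/1/2 are
# expected to correspond to Adelie/Chinstrap/Gentoo.
import pandas as pd

species_mapping = {0: "Adelie", 1: "Chinstrap", 2: "Gentoo"}

preds = pd.read_csv("data/predictions/test_predictions.csv")
preds["predicted_species_name"] = preds["predicted_species"].map(species_mapping)
print(preds.head())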
VitalMetrics/modeling/train.py
CHANGED
@@ -1,30 +1,91 @@
 from pathlib import Path
 import typer
 from loguru import logger
+import pandas as pd
+import pickle
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+import mlflow
+import mlflow.sklearn
 
+# Set the tracking URI
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
 
+from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, MODEL_PARAMS, FIGURES_DIR
+from VitalMetrics.classifier import Classifier
 
+app = typer.Typer()
 
 @app.command()
 def main(
+    features_train_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
+    features_test_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    model_type: str = "RandomForest",
-    model_path: Path = MODELS_DIR / "model.pkl",
-    # -----------------------------------------
 ):
+    # Load data
+    logger.info(f"Loading training features from {features_train_path} and test from {features_test_path}...")
+    try:
+        df_train = pd.read_csv(features_train_path, header=0, sep=',')
+        df_test = pd.read_csv(features_test_path)
+        logger.success("Data loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load data: {e}")
+        return
+
+    # Split data
+    logger.info("Splitting the data into training features and labels...")
+    X_train, y_train = df_train.drop(columns=['id', 'species']).values, df_train.species.values
+    X_test, y_test = df_test.drop(columns=['id', 'species']).values, df_test.species.values
+
+    # MLflow Tracking
+    with mlflow.start_run() as run:
+
+        logger.info(f"Training {model_type} model...")
+        # Initialize the classifier
+        classifier = Classifier(model_type=model_type)
+
+        # Train the model
+        classifier.train(X_train, y_train)
+
+        # Evaluate the model
+        accuracy = classifier.score(X_test, y_test)
+        logger.success(f"Model training complete. {model_type} accuracy: {accuracy:.4f}")
+
+        # Evaluate and save confusion matrix
+        predictions = classifier.predict(X_test)
+        cm = confusion_matrix(y_test, predictions)
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
+        disp.plot()  # render the matrix so the saved figure is not empty
+        confusion_matrix_path = FIGURES_DIR / f'confusion_matrix_{model_type}.png'
+        plt.savefig(confusion_matrix_path)
+
+        # Log model parameters
+        mlflow.log_param("model_type", model_type)
+        if model_type == "RandomForest":
+            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
+            mlflow.log_param("max_depth", MODEL_PARAMS["max_depth"])
+        elif model_type == "GradientBoosting":
+            mlflow.log_param("learning_rate", MODEL_PARAMS["learning_rate"])
+            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
+        elif model_type == "SVM":
+            mlflow.log_param("C", MODEL_PARAMS["C"])
+            mlflow.log_param("kernel", MODEL_PARAMS["kernel"])
+        elif model_type == "KNN":
+            mlflow.log_param("n_neighbors", MODEL_PARAMS["n_neighbors"])
+
+        # Log accuracy metric
+        mlflow.log_metric("accuracy", accuracy)
+
+        # Log and register the model with MLflow
+        mlflow.sklearn.log_model(
+            sk_model=classifier,
+            artifact_path="model",
+            registered_model_name='PENGUINS_CLASSIFIER'
+        )
 
+        # Save the model locally (optional)
+        filename = f"{model_type}_model.pkl"
+        pickle.dump(classifier, open(MODELS_DIR / filename, "wb"))
 
 if __name__ == "__main__":
     app()
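train.py imports Classifier from VitalMetrics.classifier, a module that is not part of this commit. For orientation, a hypothetical sketch of the interface the script relies on (a constructor keyed on model_type plus train, predict, and score methods), drawing its hyperparameters from MODEL_PARAMS; the actual implementation in the repository may differ:

# Hypothetical sketch of VitalMetrics/classifier.py (not included in this
# commit): a thin wrapper that picks the relevant keys out of the flat
# MODEL_PARAMS dict for each estimator type.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from VitalMetrics.config import MODEL_PARAMS


class Classifier:
    def __init__(self, model_type: str = "RandomForest"):
        p = MODEL_PARAMS
        if model_type == "RandomForest":
            self.model = RandomForestClassifier(
                n_estimators=p["n_estimators"],
                max_depth=p["max_depth"],
                random_state=p["random_state"],
            )
        elif model_type == "GradientBoosting":
            self.model = GradientBoostingClassifier(
                n_estimators=p["n_estimators"],
                learning_rate=p["learning_rate"],
                random_state=p["random_state"],
            )
        elif model_type == "SVM":
            self.model = SVC(C=p["C"], kernel=p["kernel"], gamma=p["gamma"])
        elif model_type == "KNN":
            self.model = KNeighborsClassifier(n_neighbors=p["n_neighbors"])
        else:
            raise ValueError(f"Unknown model_type: {model_type}")

    def train(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        return self.model.score(X, y)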