"""Train a penguin-species classifier, track the run with MLflow, and save artifacts."""

from pathlib import Path

import pickle

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd
import typer
from loguru import logger
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, MODEL_PARAMS, FIGURES_DIR
from VitalMetrics.classifier import Classifier

# Point MLflow at the local tracking server
mlflow.set_tracking_uri("http://127.0.0.1:5000")

app = typer.Typer()


@app.command()
def main(
    features_train_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
    features_test_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
    model_type: str = "RandomForest",
):
    # Load the pre-processed feature tables
    logger.info(
        f"Loading training features from {features_train_path} "
        f"and test features from {features_test_path}..."
    )
    try:
        df_train = pd.read_csv(features_train_path)
        df_test = pd.read_csv(features_test_path)
        logger.success("Data loaded successfully.")
    except Exception as e:
        logger.error(f"Failed to load data: {e}")
        return

    # Split each frame into a feature matrix and a label vector
    logger.info("Splitting the data into features and labels...")
    X_train, y_train = df_train.drop(columns=["id", "species"]).values, df_train.species.values
    X_test, y_test = df_test.drop(columns=["id", "species"]).values, df_test.species.values

    # MLflow tracking
    with mlflow.start_run():
        logger.info(f"Training {model_type} model...")

        # Initialize and train the classifier
        classifier = Classifier(model_type=model_type)
        classifier.train(X_train, y_train)

        # Evaluate the model
        accuracy = classifier.score(X_test, y_test)
        logger.success(f"Model training complete. {model_type} accuracy: {accuracy:.4f}")

        # Plot and save the confusion matrix
        predictions = classifier.predict(X_test)
        cm = confusion_matrix(y_test, predictions)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        confusion_matrix_path = FIGURES_DIR / f"confusion_matrix_{model_type}.png"
        plt.savefig(confusion_matrix_path)
        plt.close()

        # Log model parameters
        mlflow.log_param("model_type", model_type)
        if model_type == "RandomForest":
            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
            mlflow.log_param("max_depth", MODEL_PARAMS["max_depth"])
        elif model_type == "GradientBoosting":
            mlflow.log_param("learning_rate", MODEL_PARAMS["learning_rate"])
            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
        elif model_type == "SVM":
            mlflow.log_param("C", MODEL_PARAMS["C"])
            mlflow.log_param("kernel", MODEL_PARAMS["kernel"])
        elif model_type == "KNN":
            mlflow.log_param("n_neighbors", MODEL_PARAMS["n_neighbors"])

        # Log accuracy metric
        mlflow.log_metric("accuracy", accuracy)

        # Log and register the model with MLflow
        mlflow.sklearn.log_model(
            sk_model=classifier,
            artifact_path="model",
            registered_model_name="PENGUINS_CLASSIFIER",
        )

        # Save the model locally (optional)
        filename = f"{model_type}_model.pkl"
        with open(MODELS_DIR / filename, "wb") as f:
            pickle.dump(classifier, f)


if __name__ == "__main__":
    app()