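"""Train and evaluate a penguin species classifier and track the run with MLflow.

Loads the pre-processed train/test feature tables, fits the selected model,
saves a confusion-matrix figure, logs parameters, accuracy, and the fitted
model to the MLflow tracking server, and pickles the model locally.
"""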
from pathlib import Path
import pickle

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd
import typer
from loguru import logger
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from VitalMetrics.classifier import Classifier
from VitalMetrics.config import FIGURES_DIR, MODELS_DIR, MODEL_PARAMS, PROCESSED_DATA_DIR

# Set the tracking URI (assumes a local MLflow tracking server is already running,
# e.g. started with `mlflow server --host 127.0.0.1 --port 5000`).
mlflow.set_tracking_uri("http://127.0.0.1:5000")

app = typer.Typer()
@app.command()
def main(
    features_train_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
    features_test_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
    model_type: str = "RandomForest",
):
    # Load data
    logger.info(
        f"Loading training features from {features_train_path} and test features from {features_test_path}..."
    )
    try:
        df_train = pd.read_csv(features_train_path)
        df_test = pd.read_csv(features_test_path)
        logger.success("Data loaded successfully.")
    except Exception as e:
        logger.error(f"Failed to load data: {e}")
        return

    # Separate features from labels for both splits
    logger.info("Splitting the data into features and labels...")
    X_train, y_train = df_train.drop(columns=["id", "species"]).values, df_train.species.values
    X_test, y_test = df_test.drop(columns=["id", "species"]).values, df_test.species.values
    # MLflow tracking
    with mlflow.start_run():
        logger.info(f"Training {model_type} model...")

        # Initialize the classifier
        classifier = Classifier(model_type=model_type)

        # Train the model
        classifier.train(X_train, y_train)

        # Evaluate the model
        accuracy = classifier.score(X_test, y_test)
        logger.success(f"Model training complete. {model_type} accuracy: {accuracy:.4f}")

        # Plot and save the confusion matrix
        predictions = classifier.predict(X_test)
        cm = confusion_matrix(y_test, predictions)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()  # render the matrix onto a figure before saving
        confusion_matrix_path = FIGURES_DIR / f"confusion_matrix_{model_type}.png"
        plt.savefig(confusion_matrix_path)
        plt.close()

        # Log model parameters
        mlflow.log_param("model_type", model_type)
        if model_type == "RandomForest":
            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
            mlflow.log_param("max_depth", MODEL_PARAMS["max_depth"])
        elif model_type == "GradientBoosting":
            mlflow.log_param("learning_rate", MODEL_PARAMS["learning_rate"])
            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
        elif model_type == "SVM":
            mlflow.log_param("C", MODEL_PARAMS["C"])
            mlflow.log_param("kernel", MODEL_PARAMS["kernel"])
        elif model_type == "KNN":
            mlflow.log_param("n_neighbors", MODEL_PARAMS["n_neighbors"])

        # Log accuracy metric
        mlflow.log_metric("accuracy", accuracy)

        # Log and register the model with MLflow
        mlflow.sklearn.log_model(
            sk_model=classifier,
            artifact_path="model",
            registered_model_name="PENGUINS_CLASSIFIER",
        )

        # Save the model locally (optional)
        filename = f"{model_type}_model.pkl"
        with open(MODELS_DIR / filename, "wb") as f:
            pickle.dump(classifier, f)
if __name__ == "__main__":
    app()
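# Example invocation (assuming this script is saved as train.py; Typer exposes
# the keyword argument as a --model-type option):
#   python train.py --model-type GradientBoosting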