Laura Cabayol Garcia committed
Commit 1253e28 · 1 Parent(s): 3eaa4ae
Setup training and validation
Files changed:
- VitalMetrics/config.py +14 -11
- VitalMetrics/features.py +84 -17
- VitalMetrics/modeling/predict.py +50 -14
- VitalMetrics/modeling/train.py +77 -16
VitalMetrics/config.py
CHANGED
@@ -12,21 +12,24 @@ logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")
 
 DATA_DIR = PROJ_ROOT / "data"
 RAW_DATA_DIR = DATA_DIR / "raw"
-INTERIM_DATA_DIR = DATA_DIR / "interim"
 PROCESSED_DATA_DIR = DATA_DIR / "processed"
+PREDICTED_DATA_DIR = DATA_DIR / "predictions"
 
 MODELS_DIR = PROJ_ROOT / "models"
 
 REPORTS_DIR = PROJ_ROOT / "reports"
 FIGURES_DIR = REPORTS_DIR / "figures"
 
-# If tqdm is installed, configure loguru with tqdm.write
-# https://github.com/Delgan/loguru/issues/135
-try:
-    from tqdm import tqdm
+# Model parameters for different classifiers
+MODEL_PARAMS = {
+    'n_estimators': 100,
+    'max_depth': 5,
+    'learning_rate': 0.1,
+    'solver': 'lbfgs',
+    'max_iter': 200,
+    'C': 1.0,
+    'kernel': 'rbf',
+    'gamma': 'scale',
+    'n_neighbors': 5,
+    'random_state': 42
+}
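The commit routes predictions to a new directory (PREDICTED_DATA_DIR) but none of the changed files create it. A minimal, hedged setup sketch, assuming the directory has to exist before predict.py writes into it:

# Assumed setup step, not part of this commit: create data/predictions so
# predict.py can write test_predictions.csv into it.
from VitalMetrics.config import PREDICTED_DATA_DIR

PREDICTED_DATA_DIR.mkdir(parents=True, exist_ok=True)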
VitalMetrics/features.py
CHANGED
@@ -1,28 +1,95 @@
 from pathlib import Path
+import pandas as pd
 import typer
 from loguru import logger
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.model_selection import train_test_split
-from VitalMetrics.config import PROCESSED_DATA_DIR
+from VitalMetrics.config import RAW_DATA_DIR, PROCESSED_DATA_DIR
 
 app = typer.Typer()
 
 @app.command()
 def main(
+    # Paths for input and output files
+    input_path: Path = RAW_DATA_DIR / "palmer-penguins-dataset-for-eda/penguins.csv",
+    train_output_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
+    test_output_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    test_size: float = 0.2,  # Size of the test set (20% by default)
+    random_state: int = 42,  # Reproducibility
+    verbose: bool = True  # Option to print out progress
 ):
+    """Function to load dataset, split into train/test, generate features, and save to output files."""
+
+    # Load dataset
+    logger.info(f"Loading dataset from {input_path}...")
+    try:
+        df = pd.read_csv(input_path)
+        logger.success("Dataset loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load dataset: {e}")
+        return
+
+    # Split the dataset into train and test sets
+    logger.info(f"Splitting the dataset into train/test with test size {test_size}...")
+    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
+    if verbose:
+        logger.info(f"Train set: {len(train_df)} samples, Test set: {len(test_df)} samples.")
+
+    # Feature engineering (encoding and scaling)
+    train_df, test_df = feature_engineering(train_df, test_df, verbose)
+
+    # Save the transformed features
+    logger.info(f"Saving processed training features to {train_output_path}...")
+    try:
+        train_df.to_csv(train_output_path, index=False)
+        logger.success("Training features saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save training features: {e}")
+
+    logger.info(f"Saving processed test features to {test_output_path}...")
+    try:
+        test_df.to_csv(test_output_path, index=False)
+        logger.success("Test features saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save test features: {e}")
+
+
+def feature_engineering(train_df: pd.DataFrame, test_df: pd.DataFrame, verbose: bool = True) -> (pd.DataFrame, pd.DataFrame):
+    """Performs feature engineering on the train and test datasets."""
+
+    if verbose:
+        logger.info("Starting feature engineering...")
+
+    # Drop rows with missing values
+    train_df.dropna(inplace=True)
+    test_df.dropna(inplace=True)
+
+    # Define columns to delete, encode, and scale
+    columns_to_delete = ['sex', 'year']
+    columns_to_encode = ['species', 'island']
+    features_to_scale = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']  # Numeric features to scale
+
+    # Delete columns
+    train_df.drop(columns=columns_to_delete, inplace=True)
+    test_df.drop(columns=columns_to_delete, inplace=True)
+
+    # Encoding categorical columns (using LabelEncoder)
+    logger.info("Applying label encoding on categorical features...")
+    LabEnc_mapping = {}
+    for col in columns_to_encode:
+        label_encoder = LabelEncoder()
+        train_df[col] = label_encoder.fit_transform(train_df[col].values)
+        test_df[col] = label_encoder.transform(test_df[col].values)
+        LabEnc_mapping[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
+
+    # Scaling numerical columns (fit only on the training set)
+    scaler = StandardScaler()
+    train_df[features_to_scale] = scaler.fit_transform(train_df[features_to_scale])
+    test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])  # Transform the test set using the same scaler
+
+    if verbose:
+        logger.info("Feature engineering completed.")
+
+    return train_df, test_df
 
 
 if __name__ == "__main__":
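A note on the new feature_engineering(): the LabelEncoder is fitted on the training split only and reused on the test split, and LabelEncoder.transform raises a ValueError for any category the training split never contained. A minimal sketch of that behaviour with toy labels (not data from this commit):

# Illustration only, assumed toy labels: an encoder fitted on the train split
# cannot transform a category it has never seen.
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
enc.fit(["Adelie", "Gentoo"])                # categories seen during training
print(enc.transform(["Gentoo", "Adelie"]))   # [1 0]
try:
    enc.transform(["Chinstrap"])             # unseen category
except ValueError as err:
    print(f"ValueError: {err}")

With a random 80/20 split of the Palmer penguins data, every species and island normally appears in both splits, so this is a failure mode to watch only if the raw CSV changes.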
VitalMetrics/modeling/predict.py
CHANGED
@@ -1,30 +1,66 @@
 from pathlib import Path
 import typer
+import pandas as pd
+import mlflow
+import mlflow.sklearn
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
 from loguru import logger
 from tqdm import tqdm
 
-from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR
+from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR
 
 app = typer.Typer()
 
 @app.command()
 def main(
+    # ---- Define input and output paths ----
+    features_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    model_name: str = "PENGUINS_CLASSIFIER",
+    model_version: int = 16,
+    predictions_path: Path = PREDICTED_DATA_DIR / "test_predictions.csv",
     # -----------------------------------------
 ):
+    # Load the model from MLflow
+    logger.info(f"Loading model '{model_name}' version {model_version}...")
+    model_uri = f"models:/{model_name}/{model_version}"
+
+    # Start MLflow run for logging (optional)
+    mlflow.start_run()
+
+    # Load the features for prediction
+    logger.info(f"Loading features from {features_path}...")
+    try:
+        df = pd.read_csv(features_path)
+        logger.success("Features loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load features: {e}")
+        return
+
+    # Make predictions
+    logger.info("Making predictions...")
+    # Load the model using MLflow
+    model = mlflow.sklearn.load_model(model_uri)
+
+    # Prepare features for prediction (drop unnecessary columns)
+    features = df.drop(columns=['id', 'species'])
+
+    # Make predictions with the logged model
+    predictions = model.predict(features)
+    accuracy = model.score(features.values, df.species)
+    logger.info(f"The accuracy in the predictions is {accuracy}")
+
+    # Save predictions to a CSV file
+    logger.info(f"Saving predictions to {predictions_path}...")
+    try:
+        pd.DataFrame(predictions, columns=["predicted_species"]).to_csv(predictions_path, index=False)
+        logger.success("Predictions saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save predictions: {e}")
 
+    # Log predictions as an artifact
+    mlflow.log_artifact(predictions_path)
+    # End MLflow run
+    mlflow.end_run()
 
 if __name__ == "__main__":
     app()
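Because features.py label-encodes the 'species' column before the test features are written, the predictions saved by predict.py are integer class codes rather than species names. A hedged decoding sketch; the mapping below assumes LabelEncoder's alphabetical ordering of the three Palmer penguins species and is not something this commit saves to disk:

# Assumed mapping (LabelEncoder sorts classes alphabetically): 0/1/2 are
# expected to correspond to Adelie/Chinstrap/Gentoo.
import pandas as pd

species_mapping = {0: "Adelie", 1: "Chinstrap", 2: "Gentoo"}

preds = pd.read_csv("data/predictions/test_predictions.csv")
preds["predicted_species_name"] = preds["predicted_species"].map(species_mapping)
print(preds.head())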
VitalMetrics/modeling/train.py
CHANGED
@@ -1,30 +1,91 @@
 from pathlib import Path
 import typer
 from loguru import logger
+import pandas as pd
+import pickle
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+import mlflow
+import mlflow.sklearn
 
+# Set the tracking URI
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
 
+from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, MODEL_PARAMS, FIGURES_DIR
+from VitalMetrics.classifier import Classifier
 
+app = typer.Typer()
 
 @app.command()
 def main(
+    features_train_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
+    features_test_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    model_type: str = "RandomForest",
-    model_path: Path = MODELS_DIR / "model.pkl",
-    # -----------------------------------------
 ):
+    # Load data
+    logger.info(f"Loading training features from {features_train_path} and test from {features_test_path}...")
+    try:
+        df_train = pd.read_csv(features_train_path, header=0, sep=',')
+        df_test = pd.read_csv(features_test_path)
+        logger.success("Data loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load data: {e}")
+        return
+
+    # Split data
+    logger.info("Splitting the data into training features and labels...")
+    X_train, y_train = df_train.drop(columns=['id', 'species']).values, df_train.species.values
+    X_test, y_test = df_test.drop(columns=['id', 'species']).values, df_test.species.values
+
+    # MLflow Tracking
+    with mlflow.start_run() as run:
+
+        logger.info(f"Training {model_type} model...")
+        # Initialize the classifier
+        classifier = Classifier(model_type=model_type)
+
+        # Train the model
+        classifier.train(X_train, y_train)
+
+        # Evaluate the model
+        accuracy = classifier.score(X_test, y_test)
+        logger.success(f"Model training complete. {model_type} accuracy: {accuracy:.4f}")
+
+        # Evaluate and save confusion matrix
+        predictions = classifier.predict(X_test)
+        cm = confusion_matrix(y_test, predictions)
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
+        disp.plot()  # render the matrix so the saved figure is not empty
+        confusion_matrix_path = FIGURES_DIR / f'confusion_matrix_{model_type}.png'
+        plt.savefig(confusion_matrix_path)
+
+        # Log model parameters
+        mlflow.log_param("model_type", model_type)
+        if model_type == "RandomForest":
+            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
+            mlflow.log_param("max_depth", MODEL_PARAMS["max_depth"])
+        elif model_type == "GradientBoosting":
+            mlflow.log_param("learning_rate", MODEL_PARAMS["learning_rate"])
+            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
+        elif model_type == "SVM":
+            mlflow.log_param("C", MODEL_PARAMS["C"])
+            mlflow.log_param("kernel", MODEL_PARAMS["kernel"])
+        elif model_type == "KNN":
+            mlflow.log_param("n_neighbors", MODEL_PARAMS["n_neighbors"])
+
+        # Log accuracy metric
+        mlflow.log_metric("accuracy", accuracy)
+
+        # Log and register the model with MLflow
+        mlflow.sklearn.log_model(
+            sk_model=classifier,
+            artifact_path="model",
+            registered_model_name='PENGUINS_CLASSIFIER'
+        )
 
+        # Save the model locally (optional)
+        filename = f"{model_type}_model.pkl"
+        pickle.dump(classifier, open(MODELS_DIR / filename, "wb"))
 
 if __name__ == "__main__":
     app()
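train.py imports Classifier from VitalMetrics.classifier, a module that is not part of this commit. For orientation, a hypothetical sketch of the interface the script relies on (a constructor keyed on model_type plus train, predict, and score methods), drawing its hyperparameters from MODEL_PARAMS; the actual implementation in the repository may differ:

# Hypothetical sketch of VitalMetrics/classifier.py (not included in this
# commit): a thin wrapper that picks the relevant keys out of the flat
# MODEL_PARAMS dict for each estimator type.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from VitalMetrics.config import MODEL_PARAMS


class Classifier:
    def __init__(self, model_type: str = "RandomForest"):
        p = MODEL_PARAMS
        if model_type == "RandomForest":
            self.model = RandomForestClassifier(
                n_estimators=p["n_estimators"],
                max_depth=p["max_depth"],
                random_state=p["random_state"],
            )
        elif model_type == "GradientBoosting":
            self.model = GradientBoostingClassifier(
                n_estimators=p["n_estimators"],
                learning_rate=p["learning_rate"],
                random_state=p["random_state"],
            )
        elif model_type == "SVM":
            self.model = SVC(C=p["C"], kernel=p["kernel"], gamma=p["gamma"])
        elif model_type == "KNN":
            self.model = KNeighborsClassifier(n_neighbors=p["n_neighbors"])
        else:
            raise ValueError(f"Unknown model_type: {model_type}")

    def train(self, X, y):
        self.model.fit(X, y)

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        return self.model.score(X, y)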