Laura Cabayol Garcia committed on
Commit 1253e28 · Parent: 3eaa4ae

Set up training and validation

VitalMetrics/config.py CHANGED
@@ -12,21 +12,24 @@ logger.info(f"PROJ_ROOT path is: {PROJ_ROOT}")

 DATA_DIR = PROJ_ROOT / "data"
 RAW_DATA_DIR = DATA_DIR / "raw"
-INTERIM_DATA_DIR = DATA_DIR / "interim"
 PROCESSED_DATA_DIR = DATA_DIR / "processed"
-EXTERNAL_DATA_DIR = DATA_DIR / "external"
+PREDICTED_DATA_DIR = DATA_DIR / "predictions"

 MODELS_DIR = PROJ_ROOT / "models"
-
 REPORTS_DIR = PROJ_ROOT / "reports"
 FIGURES_DIR = REPORTS_DIR / "figures"

-# If tqdm is installed, configure loguru with tqdm.write
-# https://github.com/Delgan/loguru/issues/135
-try:
-    from tqdm import tqdm
-
-    logger.remove(0)
-    logger.add(lambda msg: tqdm.write(msg, end=""), colorize=True)
-except ModuleNotFoundError:
-    pass
+
+# Model parameters for different classifiers
+MODEL_PARAMS = {
+    'n_estimators': 100,
+    'max_depth': 5,
+    'learning_rate': 0.1,
+    'solver': 'lbfgs',
+    'max_iter': 200,
+    'C': 1.0,
+    'kernel': 'rbf',
+    'gamma': 'scale',
+    'n_neighbors': 5,
+    'random_state': 42
+}
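Editor's note: the flat MODEL_PARAMS dict mixes hyperparameters for several classifier families, so each consumer has to pick out the keys its model actually accepts. A minimal sketch of one way to do that; the params_for helper and the per-family key lists are illustrative assumptions, not part of this commit:

from VitalMetrics.config import MODEL_PARAMS

# Hypothetical per-family key lists; adjust to match the real Classifier wrapper.
RELEVANT_KEYS = {
    "RandomForest": ("n_estimators", "max_depth", "random_state"),
    "GradientBoosting": ("n_estimators", "learning_rate", "random_state"),
    "SVM": ("C", "kernel", "gamma"),
    "KNN": ("n_neighbors",),
    "LogisticRegression": ("solver", "max_iter", "C", "random_state"),
}

def params_for(model_type: str) -> dict:
    """Return only the MODEL_PARAMS entries the given model family uses."""
    return {key: MODEL_PARAMS[key] for key in RELEVANT_KEYS[model_type]}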
VitalMetrics/features.py CHANGED
@@ -1,28 +1,95 @@
 from pathlib import Path
-
+import pandas as pd
 import typer
 from loguru import logger
-from tqdm import tqdm
-
-from VitalMetrics.config import PROCESSED_DATA_DIR
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+from sklearn.model_selection import train_test_split
+from VitalMetrics.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

 app = typer.Typer()
-
-
 @app.command()
 def main(
-    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
-    input_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
-    output_path: Path = PROCESSED_DATA_DIR / "features.csv",
-    # -----------------------------------------
+    # Paths for input and output files
+    input_path: Path = RAW_DATA_DIR / "palmer-penguins-dataset-for-eda/penguins.csv",
+    train_output_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
+    test_output_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    test_size: float = 0.2,  # Size of the test set (20% by default)
+    random_state: int = 42,  # Seed for reproducible splits
+    verbose: bool = True,  # Option to print out progress
 ):
-    # ---- REPLACE THIS WITH YOUR OWN CODE ----
-    logger.info("Generating features from dataset...")
-    for i in tqdm(range(10), total=10):
-        if i == 5:
-            logger.info("Something happened for iteration 5.")
-    logger.success("Features generation complete.")
-    # -----------------------------------------
+    """Load the dataset, split it into train/test, generate features, and save them to the output files."""
+
+    # Load dataset
+    logger.info(f"Loading dataset from {input_path}...")
+    try:
+        df = pd.read_csv(input_path)
+        logger.success("Dataset loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load dataset: {e}")
+        return
+
+    # Split the dataset into train and test sets
+    logger.info(f"Splitting the dataset into train/test with test size {test_size}...")
+    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
+    if verbose:
+        logger.info(f"Train set: {len(train_df)} samples, Test set: {len(test_df)} samples.")
+
+    # Feature engineering (encoding and scaling)
+    train_df, test_df = feature_engineering(train_df, test_df, verbose)
+
+    # Save the transformed features
+    logger.info(f"Saving processed training features to {train_output_path}...")
+    try:
+        train_df.to_csv(train_output_path, index=False)
+        logger.success("Training features saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save training features: {e}")
+
+    logger.info(f"Saving processed test features to {test_output_path}...")
+    try:
+        test_df.to_csv(test_output_path, index=False)
+        logger.success("Test features saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save test features: {e}")
+
+
+def feature_engineering(train_df: pd.DataFrame, test_df: pd.DataFrame, verbose: bool = True) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Perform feature engineering on the train and test datasets."""
+
+    if verbose:
+        logger.info("Starting feature engineering...")
+
+    # Drop rows with missing values
+    train_df.dropna(inplace=True)
+    test_df.dropna(inplace=True)
+
+    # Define columns to delete, encode, and scale
+    columns_to_delete = ['sex', 'year']
+    columns_to_encode = ['species', 'island']
+    features_to_scale = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']  # Numeric features to scale
+
+    # Delete columns
+    train_df.drop(columns=columns_to_delete, inplace=True)
+    test_df.drop(columns=columns_to_delete, inplace=True)
+
+    # Encode categorical columns: fit on train, reuse on test
+    # (assumes the test split contains no categories unseen in training)
+    logger.info("Applying label encoding on categorical features...")
+    LabEnc_mapping = {}
+    for col in columns_to_encode:
+        label_encoder = LabelEncoder()
+        train_df[col] = label_encoder.fit_transform(train_df[col].values)
+        test_df[col] = label_encoder.transform(test_df[col].values)
+        LabEnc_mapping[col] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
+
+    # Scale numerical columns (fit only on the training set)
+    scaler = StandardScaler()
+    train_df[features_to_scale] = scaler.fit_transform(train_df[features_to_scale])
+    test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])  # Transform the test set using the same scaler
+
+    if verbose:
+        logger.info("Feature engineering completed.")
+
+    return train_df, test_df


 if __name__ == "__main__":
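Editor's note: feature_engineering builds LabEnc_mapping but never returns or persists it, so downstream code cannot map the integer-encoded species back to names. A minimal sketch of persisting and inverting the mapping, assuming a hypothetical label_mapping.json file that is not part of this commit:

import json
from VitalMetrics.config import PROCESSED_DATA_DIR

mapping_path = PROCESSED_DATA_DIR / "label_mapping.json"  # assumed filename

# Inside feature_engineering(), after the encoding loop:
with open(mapping_path, "w") as f:
    json.dump({col: {str(cls): int(code) for cls, code in m.items()}
               for col, m in LabEnc_mapping.items()}, f)

# Later (e.g. in predict.py), invert it to decode integer predictions:
with open(mapping_path) as f:
    mapping = json.load(f)
code_to_species = {code: cls for cls, code in mapping["species"].items()}
decoded = [code_to_species[p] for p in predictions]  # `predictions` as in predict.py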
VitalMetrics/modeling/predict.py CHANGED
@@ -1,30 +1,66 @@
 from pathlib import Path
-
 import typer
+import pandas as pd
+import mlflow
+import mlflow.sklearn
+mlflow.set_tracking_uri("http://127.0.0.1:5000")
 from loguru import logger
 from tqdm import tqdm

-from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR
+from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR

 app = typer.Typer()

-
 @app.command()
 def main(
-    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
-    features_path: Path = PROCESSED_DATA_DIR / "test_features.csv",
-    model_path: Path = MODELS_DIR / "model.pkl",
-    predictions_path: Path = PROCESSED_DATA_DIR / "test_predictions.csv",
+    # ---- Define input and output paths ----
+    features_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    model_name: str = "PENGUINS_CLASSIFIER",
+    model_version: int = 16,
+    predictions_path: Path = PREDICTED_DATA_DIR / "test_predictions.csv",
     # -----------------------------------------
 ):
-    # ---- REPLACE THIS WITH YOUR OWN CODE ----
-    logger.info("Performing inference for model...")
-    for i in tqdm(range(10), total=10):
-        if i == 5:
-            logger.info("Something happened for iteration 5.")
-    logger.success("Inference complete.")
-    # -----------------------------------------
+    # Build the MLflow registry URI for the requested model version
+    logger.info(f"Loading model '{model_name}' version {model_version}...")
+    model_uri = f"models:/{model_name}/{model_version}"
+
+    # Start an MLflow run for logging (optional)
+    mlflow.start_run()
+
+    # Load the features for prediction
+    logger.info(f"Loading features from {features_path}...")
+    try:
+        df = pd.read_csv(features_path)
+        logger.success("Features loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load features: {e}")
+        mlflow.end_run()  # close the run before bailing out
+        return
+
+    # Load the model from the registry and make predictions
+    logger.info("Making predictions...")
+    model = mlflow.sklearn.load_model(model_uri)
+
+    # Prepare features for prediction (drop identifier and target columns)
+    features = df.drop(columns=['id', 'species'])
+
+    # Make predictions with the registered model
+    predictions = model.predict(features)
+    accuracy = model.score(features, df.species)
+    logger.info(f"Prediction accuracy: {accuracy:.4f}")
+
+    # Save predictions to a CSV file
+    logger.info(f"Saving predictions to {predictions_path}...")
+    try:
+        pd.DataFrame(predictions, columns=["predicted_species"]).to_csv(predictions_path, index=False)
+        logger.success("Predictions saved successfully.")
+    except Exception as e:
+        logger.error(f"Failed to save predictions: {e}")

+    # Log the predictions file as an artifact and end the MLflow run
+    mlflow.log_artifact(str(predictions_path))
+    mlflow.end_run()

 if __name__ == "__main__":
     app()
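Editor's note: pinning model_version = 16 means the default breaks as soon as a newer version is registered. The models:/ URI scheme can also resolve a registry stage (or, in newer MLflow releases, an alias), which keeps the script pointed at whatever is currently promoted. A sketch; the stage and alias names below are examples, not part of this commit:

import mlflow.sklearn

# Resolve by stage instead of a pinned version (the stage must be set in the registry).
model = mlflow.sklearn.load_model("models:/PENGUINS_CLASSIFIER/Production")

# Or, with MLflow releases that support registry aliases:
model = mlflow.sklearn.load_model("models:/PENGUINS_CLASSIFIER@champion")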
VitalMetrics/modeling/train.py CHANGED
@@ -1,30 +1,91 @@
 from pathlib import Path
-
 import typer
 from loguru import logger
-from tqdm import tqdm
+import pandas as pd
+import pickle
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
+import mlflow
+import mlflow.sklearn

-from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR
+# Set the tracking URI
+mlflow.set_tracking_uri("http://127.0.0.1:5000")

-app = typer.Typer()
+from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR, MODEL_PARAMS, FIGURES_DIR
+from VitalMetrics.classifier import Classifier

+app = typer.Typer()

 @app.command()
 def main(
-    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
-    features_path: Path = PROCESSED_DATA_DIR / "features.csv",
-    labels_path: Path = PROCESSED_DATA_DIR / "labels.csv",
-    model_path: Path = MODELS_DIR / "model.pkl",
-    # -----------------------------------------
+    features_train_path: Path = PROCESSED_DATA_DIR / "penguin_train_features.csv",
+    features_test_path: Path = PROCESSED_DATA_DIR / "penguin_test_features.csv",
+    model_type: str = "RandomForest",
 ):
-    # ---- REPLACE THIS WITH YOUR OWN CODE ----
-    logger.info("Training some model...")
-    for i in tqdm(range(10), total=10):
-        if i == 5:
-            logger.info("Something happened for iteration 5.")
-    logger.success("Modeling training complete.")
-    # -----------------------------------------
+    # Load data
+    logger.info(f"Loading training features from {features_train_path} and test features from {features_test_path}...")
+    try:
+        df_train = pd.read_csv(features_train_path)
+        df_test = pd.read_csv(features_test_path)
+        logger.success("Data loaded successfully.")
+    except Exception as e:
+        logger.error(f"Failed to load data: {e}")
+        return
+
+    # Split data into features and labels
+    logger.info("Splitting the data into training features and labels...")
+    X_train, y_train = df_train.drop(columns=['id', 'species']).values, df_train.species.values
+    X_test, y_test = df_test.drop(columns=['id', 'species']).values, df_test.species.values
+
+    # MLflow tracking
+    with mlflow.start_run():
+        logger.info(f"Training {model_type} model...")
+        # Initialize the classifier
+        classifier = Classifier(model_type=model_type)
+
+        # Train the model
+        classifier.train(X_train, y_train)
+
+        # Evaluate the model
+        accuracy = classifier.score(X_test, y_test)
+        logger.success(f"Model training complete. {model_type} accuracy: {accuracy:.4f}")
+
+        # Evaluate and save the confusion matrix
+        predictions = classifier.predict(X_test)
+        cm = confusion_matrix(y_test, predictions)
+        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
+        disp.plot()  # render the matrix first, otherwise savefig writes an empty figure
+        confusion_matrix_path = FIGURES_DIR / f'confusion_matrix_{model_type}.png'
+        plt.savefig(confusion_matrix_path)
+
+        # Log model parameters
+        mlflow.log_param("model_type", model_type)
+        if model_type == "RandomForest":
+            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
+            mlflow.log_param("max_depth", MODEL_PARAMS["max_depth"])
+        elif model_type == "GradientBoosting":
+            mlflow.log_param("learning_rate", MODEL_PARAMS["learning_rate"])
+            mlflow.log_param("n_estimators", MODEL_PARAMS["n_estimators"])
+        elif model_type == "SVM":
+            mlflow.log_param("C", MODEL_PARAMS["C"])
+            mlflow.log_param("kernel", MODEL_PARAMS["kernel"])
+        elif model_type == "KNN":
+            mlflow.log_param("n_neighbors", MODEL_PARAMS["n_neighbors"])
+
+        # Log accuracy metric
+        mlflow.log_metric("accuracy", accuracy)
+
+        # Log and register the model with MLflow
+        mlflow.sklearn.log_model(
+            sk_model=classifier,
+            artifact_path="model",
+            registered_model_name='PENGUINS_CLASSIFIER'
+        )

+        # Save the model locally (optional)
+        filename = f"{model_type}_model.pkl"
+        with open(MODELS_DIR / filename, "wb") as f:
+            pickle.dump(classifier, f)

 if __name__ == "__main__":
     app()
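Editor's note: the locally pickled model can be reloaded for a quick sanity check outside MLflow. A minimal sketch following the defaults above; it assumes the Classifier wrapper exposes predict(), as train.py uses it:

import pickle
import pandas as pd
from VitalMetrics.config import MODELS_DIR, PROCESSED_DATA_DIR

# Reload the locally saved model (default model_type is "RandomForest")
with open(MODELS_DIR / "RandomForest_model.pkl", "rb") as f:
    classifier = pickle.load(f)

# Rebuild the test matrix exactly as train.py does
df_test = pd.read_csv(PROCESSED_DATA_DIR / "penguin_test_features.csv")
X_test = df_test.drop(columns=["id", "species"]).values
print(classifier.predict(X_test)[:5])

Since main() is a Typer command, other classifier families can be trained without editing the file, e.g. python -m VitalMetrics.modeling.train --model-type GradientBoosting (assuming the package is run as a module).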