"""Train and evaluate a simple multiple instance learning (MIL) classifier on risk data.

Loads pre-split bags from an .npz file, trains a Keras model with class-weighted
binary cross-entropy, and writes the trained model, per-split predictions, and
evaluation metrics to the save directory.
"""
import argparse
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from tensorflow.keras import layers, Model


def create_simple_model(instance_shape, max_length):
    """Build a simple MIL classifier: a shared per-instance MLP followed by mean pooling over the bag."""
    inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
    # Apply the same dense stack to every instance in the bag.
    flatten = layers.TimeDistributed(layers.Flatten())(inputs)
    dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
    dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
    dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
    dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
    # Aggregate the instance embeddings into a single bag embedding by mean pooling.
    aggregated = layers.GlobalAveragePooling1D()(dropout_2)
    norm_1 = layers.LayerNormalization()(aggregated)
    output = layers.Dense(1, activation="sigmoid")(norm_1)
    return Model(inputs, output)


def compute_class_weights(labels):
    """Return inverse-frequency class weights, scaled so the average weight over the dataset is 1."""
    negative_count = int(np.sum(labels == 0))
    positive_count = int(np.sum(labels == 1))
    total_count = negative_count + positive_count
    return {0: (1 / negative_count) * (total_count / 2), 1: (1 / positive_count) * (total_count / 2)}


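# Illustrative check of the weighting above (hypothetical counts): with 90 negative and
# 10 positive bags, weight(0) = (1/90) * (100/2) ≈ 0.56 and weight(1) = (1/10) * (100/2) = 5.0,
# so positives contribute proportionally more to the loss while the mean weight stays exactly 1.

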
def data_generator(data, labels, batch_size=1):
    """Yield (bags, labels, sample_weights) batches indefinitely, weighting each bag by its class weight."""
    class_weights = compute_class_weights(labels)
    while True:
        for i in range(0, len(data), batch_size):
            batch_data = np.array(data[i:i + batch_size], dtype=np.float32)
            batch_labels = np.array(labels[i:i + batch_size], dtype=np.float32)
            batch_weights = np.array([class_weights[int(label)] for label in batch_labels], dtype=np.float32)
            yield batch_data, batch_labels, batch_weights


def lr_scheduler(epoch, lr):
    """Step decay: multiply the learning rate by 0.1 every 10 epochs."""
    decay_rate = 0.1
    decay_step = 10
    if epoch % decay_step == 0 and epoch:
        return lr * decay_rate
    return lr


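# Illustrative trace of the step decay (assuming Adam's default initial learning rate of 1e-3):
# epochs 0-9 train at 1e-3, epoch 10 drops the rate to 1e-4, epoch 20 to 1e-5, and so on;
# LearningRateScheduler passes the current rate back in, so it stays flat between decay steps.

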
def train(train_data, train_labels, val_data, val_labels, model, save_dir, epochs=50):
    """Compile and fit the model, checkpointing the weights with the best validation loss."""
    model_path = os.path.join(save_dir, "best_model.h5")
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        model_path, monitor="val_loss", verbose=1, mode="min",
        save_best_only=True, save_weights_only=False)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, mode="min")
    lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy", "AUC"])
    train_gen = data_generator(train_data, train_labels)
    val_gen = data_generator(val_data, val_labels)
    # batch_size must not be passed to fit() when the data comes from a generator;
    # the generator already yields one bag per step.
    model.fit(train_gen, steps_per_epoch=len(train_data), validation_data=val_gen,
              validation_steps=len(val_data), epochs=epochs,
              callbacks=[early_stopping, model_checkpoint, lr_callback], verbose=1)
    return model


def compute_additional_metrics(X, Y, model):
    """Compute AUC, precision, recall, and F1 from the model's sigmoid outputs (0.5 threshold)."""
    predictions = model.predict(X).flatten()
    predictions_binary = (predictions > 0.5).astype(int)
    auc = roc_auc_score(Y, predictions)
    precision = precision_score(Y, predictions_binary)
    recall = recall_score(Y, predictions_binary)
    f1 = f1_score(Y, predictions_binary)
    return auc, precision, recall, f1, predictions


def evaluate_dataset(model, X, Y, dataset_name, save_dir):
    """Evaluate the model on one split, save its raw predictions, and return a metrics dict."""
    eval_metrics = model.evaluate(X, Y, verbose=0)
    auc, precision, recall, f1, predictions = compute_additional_metrics(X, Y, model)
    # Cast to plain Python floats so the metrics dict is JSON-serializable.
    metrics = {
        'loss': float(eval_metrics[0]),
        'accuracy': float(eval_metrics[1]),
        'auc': float(auc),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1)
    }

    np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=predictions, labels=Y)

    return metrics


def evaluate_all_datasets(model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, save_dir):
    """Evaluate the model on the train, validation, and test splits, print a summary table, and persist the metrics."""
    train_metrics = evaluate_dataset(model, train_X, train_Y, "train", save_dir)
    validate_metrics = evaluate_dataset(model, validate_X, validate_Y, "validate", save_dir)
    test_metrics = evaluate_dataset(model, test_X, test_Y, "test", save_dir)

    metrics = {
        'train': train_metrics,
        'validate': validate_metrics,
        'test': test_metrics
    }

    # One row per split, one column per metric.
    metrics_df = pd.DataFrame(metrics).T
    print(metrics_df.to_string())

    with open(os.path.join(save_dir, 'evaluation_metrics.json'), 'w') as f:
        json.dump(metrics, f, indent=4)

    print("Evaluation metrics saved to evaluation_metrics.json")

    return metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train a multiple instance learning classifier on risk data.')
    parser.add_argument('--data_file', type=str, required=True, help='Path to the saved .npz file with training, validation, and test data.')
    parser.add_argument('--save_dir', type=str, default='./model_save/', help='Directory to save the model and evaluation metrics.')
    parser.add_argument('--epochs', type=int, default=50, help='Number of training epochs.')
    args = parser.parse_args()

    os.makedirs(args.save_dir, exist_ok=True)

    # Load the pre-split bags and labels.
    data = np.load(args.data_file)
    train_X, train_Y = data['train_X'], data['train_Y']
    validate_X, validate_Y = data['validate_X'], data['validate_Y']
    test_X, test_Y = data['test_X'], data['test_Y']

    # Bags are padded to a fixed length: (num_bags, max_length, num_features).
    instance_shape = (train_X.shape[-1],)
    max_length = train_X.shape[1]
    model = create_simple_model(instance_shape, max_length)

    trained_model = train(train_X, train_Y, validate_X, validate_Y, model, args.save_dir, epochs=args.epochs)

    final_model_path = os.path.join(args.save_dir, "risk_classifier_model.h5")
    trained_model.save(final_model_path)
    print(f"Model saved successfully to {final_model_path}")

    metrics = evaluate_all_datasets(trained_model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, args.save_dir)
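
# Illustrative invocation (the script filename and paths below are hypothetical; the .npz must
# contain the train_X/train_Y, validate_X/validate_Y, and test_X/test_Y arrays loaded above):
#
#   python train_mil_classifier.py --data_file bags.npz --save_dir ./model_save/ --epochs 50
#
# Artifacts written to --save_dir: best_model.h5 (best val_loss checkpoint),
# risk_classifier_model.h5 (final model), {train,validate,test}_predictions.npz,
# and evaluation_metrics.json.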