from ML_SLRC import * import os import numpy as np import pandas as pd from torch.utils.data import DataLoader from torch.optim import Adam import gc from torchmetrics import functional as fn import random from tqdm import tqdm from sklearn.metrics import confusion_matrix from sklearn.metrics import roc_curve, auc import ipywidgets as widgets from IPython.display import display, clear_output import matplotlib.pyplot as plt import warnings import torch import time from sklearn.manifold import TSNE from copy import deepcopy import seaborn as sns import matplotlib.pylab as plt import json from pathlib import Path import re from collections import defaultdict # SEED = 2222 # gen_seed = torch.Generator().manual_seed(SEED) # Random seed function def random_seed(value): torch.backends.cudnn.deterministic=True torch.manual_seed(value) torch.cuda.manual_seed(value) np.random.seed(value) random.seed(value) # Tasks for meta-learner def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4): idxs = list(range(0,len(taskset))) if is_shuffle: random.shuffle(idxs) for i in range(0,len(idxs), batch_size): yield [taskset[idxs[i]] for i in range(i, min(i + batch_size,len(taskset)))] # Prepare data to process by Domain-learner def prepare_data(data, batch_size, tokenizer,max_seq_length, input = 'text', output = 'label', train_size_per_class = 5, global_datasets = False, treat_text_fun =None): data = data.reset_index().drop("index", axis=1) if global_datasets: global data_train, data_test # Sample task for training data_train = data.groupby('label').sample(train_size_per_class, replace=False) idex = data.index.isin(data_train.index) # The Test set to label by the model data_test = data # Transform in dataset to model ## Train dataset_train = SLR_DataSet( data = data_train.sample(frac=1), input = input, output = output, tokenizer=tokenizer, max_seq_length =max_seq_length, treat_text =treat_text_fun) ## Test dataset_test = SLR_DataSet( data = data_test, input = input, output = output, tokenizer=tokenizer, max_seq_length =max_seq_length, treat_text =treat_text_fun) # Dataloaders ## Train data_train_loader = DataLoader(dataset_train, shuffle=True, batch_size=batch_size['train'] ) ## Test if len(dataset_test) % batch_size['test'] == 1 : data_test_loader = DataLoader(dataset_test, batch_size=batch_size['test'], drop_last=True) else: data_test_loader = DataLoader(dataset_test, batch_size=batch_size['test'], drop_last=False) return data_train_loader, data_test_loader, data_train, data_test # Meta trainer def meta_train(data, model, device, Info, print_epoch =True, Test_resource =None, treat_text_fun =None): # Meta-learner model learner = Learner(model = model, device = device, **Info) # Testing tasks if isinstance(Test_resource, pd.DataFrame): test = MetaTask(Test_resource, num_task = 0, k_support=10, k_query=10, training=False,treat_text =treat_text_fun, **Info) torch.clear_autocast_cache() gc.collect() torch.cuda.empty_cache() # Meta epoch (Outer epoch) for epoch in tqdm(range(Info['meta_epoch']), desc= "Meta epoch ", ncols=80): # Train tasks train = MetaTask(data, num_task = Info['num_task_train'], k_support=Info['k_qry'], k_query=Info['k_spt'], treat_text =treat_text_fun, **Info) # Batch of train tasks db = create_batch_of_tasks(train, is_shuffle = True, batch_size = Info["outer_batch_size"]) if print_epoch: # Outer loop bach training for step, task_batch in enumerate(db): print("\n-----------------Training Mode","Meta_epoch:", epoch ,"-----------------\n") # meta-feedfoward (outer-feedfoward) acc = learner(task_batch, valid_train= print_epoch) print('Step:', step, '\ttraining Acc:', acc) if isinstance(Test_resource, pd.DataFrame): # Validating Model if ((epoch+1) % 4) + step == 0: random_seed(123) print("\n-----------------Testing Mode-----------------\n") # Batch of test tasks db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1) acc_all_test = [] # Looping testing tasks for test_batch in db_test: acc = learner(test_batch, training = False) acc_all_test.append(acc) print('Test acc:', np.mean(acc_all_test)) del acc_all_test, db_test # Restarting training randomly random_seed(int(time.time() % 10)) else: for step, task_batch in enumerate(db): # meta-feedfoward (outer-feedfoward) acc = learner(task_batch, print_epoch, valid_train= print_epoch) torch.clear_autocast_cache() gc.collect() torch.cuda.empty_cache() def train_loop(data_train_loader, data_test_loader, model, device, epoch = 4, lr = 1, print_info = True, name = 'name', weight_decay = 1): # Start the model's parameters model_meta = deepcopy(model) optimizer = Adam(model_meta.parameters(), lr=lr, weight_decay = weight_decay) model_meta.to(device) model_meta.train() # Task epoch (Inner epoch) for i in range(0, epoch): all_loss = [] # Inner training batch (support set) for inner_step, batch in enumerate(data_train_loader): batch = tuple(t.to(device) for t in batch) input_ids, attention_mask,q_token_type_ids, label_id = batch # Inner Feedfoward loss, _, _ = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze()) # compute grads loss.backward() # update parameters optimizer.step() optimizer.zero_grad() all_loss.append(loss.item()) if (i % 2 == 0) & print_info: print("Loss: ", np.mean(all_loss)) # Test evaluation model_meta.eval() all_loss = [] all_acc = [] features = [] labels = [] predi_logit = [] with torch.no_grad(): # Test's Batch loop for inner_step, batch in enumerate(tqdm(data_test_loader, desc="Test validation | " + name, ncols=80)) : batch = tuple(t.to(device) for t in batch) input_ids, attention_mask,q_token_type_ids, label_id = batch # Predictions _, feature, _ = model_meta(input_ids, attention_mask,q_token_type_ids, labels = label_id.squeeze()) # prediction = prediction.detach().cpu().squeeze() # label_id = label_id.detach().cpu() logit = feature[1].detach().cpu() # feature_lat = feature[0].detach().cpu() # labels.append(label_id.numpy().squeeze()) # features.append(feature_lat.numpy()) predi_logit.append(logit.numpy()) # Accuracy over the test's bach # acc = fn.accuracy(prediction, label_id).item() # all_acc.append(acc) del input_ids, attention_mask, label_id, batch if print_info: print("acc:", np.mean(all_acc)) model_meta.to('cpu') gc.collect() torch.cuda.empty_cache() del model_meta, optimizer logits = np.concatenate(np.array(predi_logit,dtype=object)) logits = torch.tensor(logits.astype(np.float32)).detach().clone() # return features, labels, predi_logit return logits.detach().clone() # Process predictions and map the feature_map in tsne def map_feature_tsne(features, labels, predi_logit): features = np.concatenate(np.array(features,dtype=object)) features = torch.tensor(features.astype(np.float32)).detach().clone() labels = np.concatenate(np.array(labels,dtype=object)) labels = torch.tensor(labels.astype(int)).detach().clone() logits = np.concatenate(np.array(predi_logit,dtype=object)) logits = torch.tensor(logits.astype(np.float32)).detach().clone() # Dimention reduction X_embedded = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(features.detach().clone()) return logits.detach().clone(), X_embedded, labels.detach().clone(), features.detach().clone() def wss_calc(logit, labels, trsh = 0.5): # Prediction label given the threshold predict_trash = torch.sigmoid(logit).squeeze() >= trsh # Compute confusion matrix values CM = confusion_matrix(labels, predict_trash.to(int) ) tn, fp, fne, tp = CM.ravel() P = (tp + fne) N = (tn + fp) recall = tp/(tp+fne) # WSS wss = (tn + fne)/len(labels) -(1- recall) # AWSS awss = (tn/N - fne/P) return { "wss": round(wss,4), "awss": round(awss,4), "R": round(recall,4), "CM": CM } # Compute the metrics def plot(logits, X_embedded, labels, threshold, show = True, namefig = "plot", make_plot = True, print_stats = True, save = True): col = pd.MultiIndex.from_tuples([ ("Predict", "0"), ("Predict", "1") ]) index = pd.MultiIndex.from_tuples([ ("Real", "0"), ("Real", "1") ]) predict = torch.sigmoid(logits).detach().clone() # Roc curve fpr, tpr, thresholds = roc_curve(labels, predict.squeeze()) # Given by a Recall of 95% (threshold avaliation) ## WSS ### Index to recall idx_wss95 = sum(tpr < 0.95) ### threshold thresholds95 = thresholds[idx_wss95] ### Compute the metrics wss95_info = wss_calc(logits,labels, thresholds95 ) acc_wss95 = fn.accuracy(predict, labels, threshold=thresholds95) f1_wss95 = fn.f1_score(predict, labels, threshold=thresholds95) # Given by a threshold (recall avaliation) ### Compute the metrics wss_info = wss_calc(logits,labels, threshold ) acc_wssR = fn.accuracy(predict, labels, threshold=threshold) f1_wssR = fn.f1_score(predict, labels, threshold=threshold) metrics= { # WSS "WSS@95": wss95_info['wss'], "AWSS@95": wss95_info['awss'], "WSS@R": wss_info['wss'], "AWSS@R": wss_info['awss'], # Recall "Recall_WSS@95": wss95_info['R'], "Recall_WSS@R": wss_info['R'], # acc "acc@95": acc_wss95.item(), "acc@R": acc_wssR.item(), # f1 "f1@95": f1_wss95.item(), "f1@R": f1_wssR.item(), # threshold 95 "threshold@95": thresholds95 } # Print stats if print_stats: wss95= f"WSS@95:{wss95_info['wss']}, R: {wss95_info['R']}" wss95_adj= f"ASSWSS@95:{wss95_info['awss']}" print(wss95) print(wss95_adj) print('Acc.:', round(acc_wss95.item(), 4)) print('F1-score:', round(f1_wss95.item(), 4)) print(f"threshold to wss95: {round(thresholds95, 4)}") cm = pd.DataFrame(wss95_info['CM'], index=index, columns=col) print("\nConfusion matrix:") print(cm) print("\n---Metrics with threshold:", threshold, "----\n") wss= f"WSS@R:{wss_info['wss']}, R: {wss_info['R']}" print(wss) wss_adj= f"AWSS@R:{wss_info['awss']}" print(wss_adj) print('Acc.:', round(acc_wssR.item(), 4)) print('F1-score:', round(f1_wssR.item(), 4)) cm = pd.DataFrame(wss_info['CM'], index=index, columns=col) print("\nConfusion matrix:") print(cm) # Plots if make_plot: fig, axes = plt.subplots(1, 4, figsize=(25,10)) alpha = torch.squeeze(predict).numpy() # TSNE p1 = sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=labels, alpha=alpha, ax = axes[0]).set_title('Predictions-TSNE', size=20) # WSS@95 t_wss = predict >= thresholds95 t_wss = t_wss.squeeze().numpy() p2 = sns.scatterplot(x=X_embedded[t_wss, 0], y=X_embedded[t_wss, 1], hue=labels[t_wss], alpha=alpha[t_wss], ax = axes[1]).set_title('WSS@95', size=20) # WSS@R t = predict >= threshold t = t.squeeze().numpy() p3 = sns.scatterplot(x=X_embedded[t, 0], y=X_embedded[t, 1], hue=labels[t], alpha=alpha[t], ax = axes[2]).set_title(f'Predictions-threshold {threshold}', size=20) # ROC-Curve roc_auc = auc(fpr, tpr) lw = 2 axes[3].plot( fpr, tpr, color="darkorange", lw=lw, label="ROC curve (area = %0.2f)" % roc_auc) axes[3].plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--") axes[3].axhline(y=0.95, color='r', linestyle='-') # axes[3].set(xlabel="False Positive Rate", ylabel="True Positive Rate") axes[3].legend(loc="lower right") axes[3].set_title(label= "ROC", size = 20) axes[3].set_ylabel("True Positive Rate", fontsize = 15) axes[3].set_xlabel("False Positive Rate", fontsize = 15) if show: plt.show() if save: fig.savefig(namefig, dpi=fig.dpi) return metrics def auc_plot(logits,labels, color = "darkorange", label = "test"): predict = torch.sigmoid(logits).detach().clone() fpr, tpr, thresholds = roc_curve(labels, predict.squeeze()) roc_auc = auc(fpr, tpr) lw = 2 label = label + str(round(roc_auc,2)) # print(label) plt.plot( fpr, tpr, color=color, lw=lw, label= label ) plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--") plt.axhline(y=0.95, color='r', linestyle='-') # Interface to evaluation class diagnosis(): def __init__(self, names, Valid_resource, batch_size_test, model,Info, device,treat_text_fun=None,start = 0): self.names=names self.Valid_resource=Valid_resource self.batch_size_test=batch_size_test self.model=model self.start=start self.Info = Info self.device = device self.treat_text_fun = treat_text_fun # BOX INPUT self.value_trash = widgets.FloatText( value=0.95, description='threshold', disabled=False ) self.valueb = widgets.IntText( value=10, description='size', disabled=False ) # Buttons self.train_b = widgets.Button(description="Train") self.next_b = widgets.Button(description="Next") self.eval_b = widgets.Button(description="Evaluation") self.hbox = widgets.HBox([self.train_b, self.valueb]) # Click buttons functions self.next_b.on_click(self.Next_button) self.train_b.on_click(self.Train_button) self.eval_b.on_click(self.Evaluation_button) # Next button def Next_button(self,p): clear_output() self.i=self.i+1 # Select the domain data self.domain = self.names[self.i] self.data = self.Valid_resource[self.Valid_resource['domain'] == self.domain] print("Name:", self.domain) print(self.data['label'].value_counts()) display(self.hbox) display(self.next_b) # Train button def Train_button(self, y): clear_output() print(self.domain) # Prepare data for training (domain-learner) self.data_train_loader, self.data_test_loader, self.data_train, self.data_test = prepare_data(self.data, train_size_per_class = self.valueb.value, batch_size = {'train': self.Info['inner_batch_size'], 'test': self.batch_size_test}, max_seq_length = self.Info['max_seq_length'], tokenizer = self.Info['tokenizer'], input = "text", output = "label", treat_text_fun=self.treat_text_fun) # Train the model and predict in the test set self.logits, self.X_embedded, self.labels, self.features = train_loop(self.data_train_loader, self.data_test_loader, self.model, self.device, epoch = self.Info['inner_update_step'], lr=self.Info['inner_update_lr'], print_info=True, name = self.domain) tresh_box = widgets.HBox([self.eval_b, self.value_trash]) display(self.hbox) display(tresh_box) display(self.next_b) # Evaluation button def Evaluation_button(self, te): clear_output() tresh_box = widgets.HBox([self.eval_b, self.value_trash]) print(self.domain) # print("\n") print("-------Train data-------") print(data_train['label'].value_counts()) print("-------Test data-------") print(data_test['label'].value_counts()) # print("\n") display(self.next_b) display(tresh_box) display(self.hbox) # Compute metrics metrics = plot(self.logits, self.X_embedded, self.labels, threshold=self.Info['threshold'], show = True, namefig= 'test', make_plot = True, print_stats = True, save=False) def __call__(self): self.i= self.start-1 clear_output() display(self.next_b) # Simulation attemps of domain learner def pipeline_simulation(Valid_resource, names_to_valid, path_save, model, Info, device, initializer_model, treat_text_fun=None): n_attempt = 5 batch_test = 100 # Create a directory to save informations for name in names_to_valid: name = re.sub("\.csv", "",name) Path(path_save + name + "/img").mkdir(parents=True, exist_ok=True) # Dict to sabe roc curves roc_stats = defaultdict(lambda: defaultdict( lambda: defaultdict( list ) ) ) all_metrics = [] # Loop over a list of domains for name in names_to_valid: # Select a domain dataset data = Valid_resource[Valid_resource['domain'] == name].reset_index().drop("index", axis=1) # Attempts simulation for attempt in range(n_attempt): print("---"*4,"attempt", attempt, "---"*4) # Prepare data to pass to the model data_train_loader, data_test_loader, _ , _ = prepare_data(data, train_size_per_class = Info['k_spt'], batch_size = {'train': Info['inner_batch_size'], 'test': batch_test}, max_seq_length = Info['max_seq_length'], tokenizer = Info['tokenizer'], input = "text", output = "label", treat_text_fun=treat_text_fun) # Train the model and evaluate on the test set of the domain logits, X_embedded, labels, features = train_loop(data_train_loader, data_test_loader, model, device, epoch = Info['inner_update_step'], lr=Info['inner_update_lr'], print_info=False, name = name) name_domain = re.sub("\.csv", "",name) # Compute the metrics metrics = plot(logits, X_embedded, labels, threshold=Info['threshold'], show = False, namefig= path_save + name_domain + "/img/" + str(attempt) + 'plots', make_plot = True, print_stats = False, save = True) # Compute the roc-curve fpr, tpr, _ = roc_curve(labels, torch.sigmoid(logits).squeeze()) # Save the correspoud information of the domain metrics['name'] = name_domain metrics['layer_size'] = Info['bert_layers'] metrics['attempt'] = attempt roc_stats[name_domain][str(Info['bert_layers'])]['fpr'].append(fpr.tolist()) roc_stats[name_domain][str(Info['bert_layers'])]['tpr'].append(tpr.tolist()) all_metrics.append(metrics) # Save the metrics and the roc curve of the attemp pd.DataFrame(all_metrics).to_csv(path_save+ "metrics.csv") roc_path = path_save + "roc_stats.json" with open(roc_path, 'w') as fp: json.dump(roc_stats, fp) del fpr, tpr, logits, X_embedded, labels del features, metrics, _ # Save the information used to evaluate the validation resource save_info = Info.copy() save_info['model'] = initializer_model.tokenizer.name_or_path save_info.pop("tokenizer") save_info.pop("bert_layers") info_path = path_save+"info.json" with open(info_path, 'w') as fp: json.dump(save_info, fp) # Loading dataset statistics def load_data_statistics(paths, names): size = [] pos = [] neg = [] for p in paths: data = pd.read_csv(p) data = data.dropna() # Dataset size size.append(len(data)) # Number of positive labels pos.append(data['labels'].value_counts()[1]) # Number of negative labels neg.append(data['labels'].value_counts()[0]) del data info_load = pd.DataFrame({ "size":size, "pos":pos, "neg":neg, "names":names, "paths": paths }) return info_load # Loading the datasets def load_data(train_info_load): col = ['abstract','title', 'labels', 'domain'] data_train = pd.DataFrame(columns=col) for p in train_info_load['paths']: data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']] data_temp = pd.read_csv(p).loc[:, ['labels', 'title', 'abstract']] data_temp['domain'] = os.path.basename(p) data_train = pd.concat([data_train, data_temp]) data_train['text'] = data_train['title'] + data_train['abstract'].replace(np.nan, '') return( data_train \ .replace({"labels":{0:"negative", 1:'positive'}})\ .rename({"labels":"label"} , axis=1)\ .loc[ :,("text","domain","label")] )