# Viewer residue (not part of the program): "Runtime error", file size 13,370 bytes, revision 9e90264.
# The viewer's gutter line numbers (1-338) that followed carried no content and are omitted.
import logging
import os
import random

import huggingface_hub
import numpy as np
import torch
import wandb
from datasets import load_dataset, Dataset
from dotenv import load_dotenv
from pytorch_lightning import LightningDataModule
from pytorch_lightning.utilities.types import TRAIN_DATALOADERS, EVAL_DATALOADERS
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from torch.utils.data import DataLoader, IterableDataset
# from torchmetrics.classification import BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinaryPrecision, BinaryRecall
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, BertModel
from transformers import TrainingArguments, Trainer
# Logging setup: INFO by default; the root logger is exposed as `timber`.
# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)  # change to level=logging.DEBUG to print more logs...
timber = logging.getLogger()

# ANSI escape sequences that switch the terminal foreground colour.
black = "\u001b[30m"
red = "\u001b[31m"
green = "\u001b[32m"
yellow = "\u001b[33m"
blue = "\u001b[34m"
magenta = "\u001b[35m"
cyan = "\u001b[36m"
white = "\u001b[37m"

# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face Hub id of the pretrained checkpoint loaded by this script.
PRETRAINED_MODEL_NAME: str = "zhihan1996/DNA_bert_6"
def insert_debug_motif_at_random_position(seq, DEBUG_MOTIF):
    """Overwrite a randomly placed window of ``seq`` with ``DEBUG_MOTIF``.

    The motif replaces ``len(DEBUG_MOTIF)`` characters, so the result always
    has the same length as ``seq``.

    Args:
        seq: The sequence to modify.
        DEBUG_MOTIF: The motif to write into ``seq``.

    Returns:
        A copy of ``seq`` with ``DEBUG_MOTIF`` written at a random offset.

    Raises:
        ValueError: If ``DEBUG_MOTIF`` is longer than ``seq``.
    """
    motif_len = len(DEBUG_MOTIF)
    last_start = len(seq) - motif_len
    if last_start < 0:
        raise ValueError(
            f"DEBUG_MOTIF (length {motif_len}) does not fit into a sequence of length {len(seq)}"
        )
    # randrange's stop is exclusive, so the "+ 1" makes the final valid offset
    # reachable and keeps the range non-empty when both lengths are equal.
    rand_pos = random.randrange(last_start + 1)
    return seq[:rand_pos] + DEBUG_MOTIF + seq[rand_pos + motif_len:]
class PagingMQTLDataset(IterableDataset):
    """Iterable dataset that tokenizes rows of an underlying dataset one at a time.

    Each row is read through its ``'sequence'`` and ``'label'`` keys. Rows
    whose sequence length differs from ``seq_len`` are skipped.
    """

    # NOTE(review): the parameter list was missing from the source text. The
    # names below are taken from the assignments in the body; their order and
    # the two default values are assumptions -- confirm against the callers.
    def __init__(self,
                 m_dataset,
                 seq_len,
                 tokenizer,
                 max_length=512,
                 check_if_pipeline_is_ok_by_inserting_debug_motif=False):
        """Store the wrapped dataset and the preprocessing settings.

        Args:
            m_dataset: Iterable of rows; each row is indexed with
                ``'sequence'`` and ``'label'``.
            seq_len: Required sequence length; other rows are dropped.
            tokenizer: Callable whose result is indexed with ``"input_ids"``.
            max_length: Stored on the instance; not read by this class.
            check_if_pipeline_is_ok_by_inserting_debug_motif: When true, rows
                labelled ``1`` get the debug motif written into their sequence.
        """
        super().__init__()
        self.dataset = m_dataset
        self.check_if_pipeline_is_ok_by_inserting_debug_motif = check_if_pipeline_is_ok_by_inserting_debug_motif
        self.debug_motif = "ATCGCCTA"
        self.seq_len = seq_len
        self.bert_tokenizer = tokenizer
        self.max_length = max_length

    def __iter__(self):
        """Yield the preprocessed form of every row that is not skipped."""
        for row in self.dataset:
            processed = self.preprocess(row)
            if processed is not None:
                yield processed

    def preprocess(self, row):
        """Turn one row into ``{"input_ids": tensor, "labels": tensor}``.

        Returns:
            The dict of tensors, or ``None`` when the row's sequence length
            is not ``self.seq_len``.
        """
        sequence = row['sequence']  # Fetch the 'sequence' column
        if len(sequence) != self.seq_len:
            return None  # skip problematic row!
        label = row['label']  # Fetch the 'label' column (or whatever target you use)
        if label == 1 and self.check_if_pipeline_is_ok_by_inserting_debug_motif:
            # Pipeline sanity check: mark positive rows with a fixed motif.
            sequence = insert_debug_motif_at_random_position(seq=sequence, DEBUG_MOTIF=self.debug_motif)
        input_ids = self.bert_tokenizer(sequence)["input_ids"]
        tokenized_tensor = torch.tensor(input_ids)
        label_tensor = torch.tensor(label)
        return {"input_ids": tokenized_tensor, "labels": label_tensor}
class MqtlDataModule(LightningDataModule):
    """Lightning data module wrapping pre-built train/validation/test datasets.

    One ``DataLoader`` per split is created in the constructor and handed
    back unchanged by the ``*_dataloader`` hooks.
    """

    def __init__(self, train_ds, val_ds, test_ds, batch_size=16):
        """Build one loader per split.

        Args:
            train_ds: Dataset used for training.
            val_ds: Dataset used for validation.
            test_ds: Dataset used for testing.
            batch_size: Batch size shared by all three loaders.
        """
        super().__init__()
        self.batch_size = batch_size
        self.train_loader = self._build_loader(train_ds)
        self.validate_loader = self._build_loader(val_ds)
        self.test_loader = self._build_loader(test_ds)

    def _build_loader(self, dataset):
        """Create the non-shuffling loader used for every split."""
        # NOTE(review): the tail of each DataLoader(...) call was missing from
        # the source text; only the arguments that were visible are passed.
        # Confirm whether further arguments (e.g. num_workers) were intended.
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=False,
            # collate_fn=collate_fn,
            # persistent_workers=True
        )

    def prepare_data(self):
        """Nothing to prepare; the datasets are supplied by the caller."""
        # NOTE(review): the original body was missing from the source text.

    def setup(self, stage: str) -> None:
        """Log the stage this hook is called with."""
        timber.info(f"inside setup: {stage = }")

    def train_dataloader(self) -> TRAIN_DATALOADERS:
        """Return the training loader built in the constructor."""
        return self.train_loader

    def val_dataloader(self) -> EVAL_DATALOADERS:
        """Return the validation loader built in the constructor."""
        return self.validate_loader

    def test_dataloader(self) -> EVAL_DATALOADERS:
        """Return the test loader built in the constructor."""
        return self.test_loader
def create_paging_train_val_test_datasets(tokenizer, WINDOW, is_debug, batch_size=1000):
    """Build the streaming train/validation/test datasets for one window size.

    The CSV files are read from the local checkout when it exists; otherwise
    the dataset is streamed from the Hugging Face Hub.

    Args:
        tokenizer: Tokenizer handed to every ``PagingMQTLDataset``.
        WINDOW: Window size selecting the ``*_binned_{WINDOW}`` splits.
        is_debug: Forwarded as the debug-motif switch of the datasets.
        batch_size: Unused here; kept so existing callers keep working.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``.
    """
    local_dir = "/home/soumic/Codes/mqtl-classification/src/inputdata"
    # Same split names and CSV paths as the hand-written table this replaces:
    # small (200), medium (1000) and large (4000) samples.
    data_files = {
        f"{split}_binned_{window}": f"{local_dir}/dataset_{window}_{split}_binned.csv"
        for window in (200, 1000, 4000)
        for split in ("train", "validate", "test")
    }

    is_my_laptop = os.path.isfile(data_files["train_binned_4000"])
    if is_my_laptop:
        dataset_map = load_dataset("csv", data_files=data_files, streaming=True)
    else:
        # Without this `else` the Hub load would always overwrite the local one.
        dataset_map = load_dataset("fahimfarhan/mqtl-classification-datasets", streaming=True)

    def build(split):
        # NOTE(review): the arguments after the dataset were missing from the
        # source text. seq_len=WINDOW and the debug switch are assumptions;
        # confirm them (and whether max_length should be passed) against
        # PagingMQTLDataset.__init__.
        return PagingMQTLDataset(
            m_dataset=dataset_map[f"{split}_binned_{WINDOW}"],
            seq_len=WINDOW,
            tokenizer=tokenizer,
            check_if_pipeline_is_ok_by_inserting_debug_motif=is_debug,
        )

    train_dataset = build("train")
    val_dataset = build("validate")
    test_dataset = build("test")
    # data_module = MqtlDataModule(train_ds=train_dataset, val_ds=val_dataset, test_ds=test_dataset, batch_size=batch_size)
    return train_dataset, val_dataset, test_dataset
def login_inside_huggingface_virtualmachine():
    """Log in to the Hugging Face Hub and to Weights & Biases.

    Credentials are read from the ``HF_TOKEN`` and ``WAND_DB_API_KEY``
    environment variables, after an optional ``.env`` file has been loaded.
    Failures of the login calls themselves are printed, not raised.

    Raises:
        ValueError: If ``HF_TOKEN`` or ``WAND_DB_API_KEY`` is not set.
    """
    # NOTE(review): every `try:` line and both login calls were missing from
    # the source text. The block boundaries below (in particular whether the
    # ValueErrors sit inside a try) are reconstructed -- please confirm.

    # Load the .env file, but don't crash if it's not found (e.g., in Hugging Face Space)
    try:
        load_dotenv()  # Only useful on your laptop if .env exists
        print(".env file loaded successfully.")
    except Exception as e:
        print(f"Warning: Could not load .env file. Exception: {e}")

    # Try to get the token from environment variables
    token = os.getenv("HF_TOKEN")
    if not token:
        raise ValueError("HF_TOKEN not found. Make sure to set it in the environment variables or .env file.")

    # Log in to Hugging Face Hub
    try:
        huggingface_hub.login(token=token)
        print("Logged in to Hugging Face Hub successfully.")
    except Exception as e:
        print(f"Error during Hugging Face login: {e}")
        # Handle the error appropriately (e.g., exit or retry)

    # wand db login
    api_key = os.getenv("WAND_DB_API_KEY")
    # Only report whether the key exists; the key itself must never be logged.
    timber.info("WAND_DB_API_KEY is %s", "set" if api_key else "missing")
    if not api_key:
        raise ValueError("WAND_DB_API_KEY not found. Make sure to set it in the environment variables or .env file.")

    # Log in to wand db
    try:
        wandb.login(key=api_key)
        print("Logged in to wand db successfully.")
    except Exception as e:
        print(f"Error during wand db Face login: {e}")
# use sklearn cz torchmetrics.classification gave array index out of bound exception :/ (whatever it is called in python)
def compute_metrics_using_sklearn(p):
    """Compute binary-classification metrics with scikit-learn.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is indexed as a
            2-D array with one column per class (two classes assumed).

    Returns:
        A dict with the keys ``accuracy``, ``roc_auc``, ``precision``,
        ``recall`` and ``f1``. If anything raises, the error is printed and
        every metric is reported as ``0``.
    """
    # NOTE(review): the `try:` line was missing from the source text; it is
    # placed at the top so that a malformed `p` is also reported as zeros.
    try:
        pred, labels = p

        # Get predicted class labels
        pred_labels = np.argmax(pred, axis=1)
        # Score of the positive class (column 1). These are the raw values in
        # `pred`, not normalised probabilities; they are only used for ranking
        # in roc_auc_score.
        pred_scores = pred[:, 1]

        accuracy = accuracy_score(y_true=labels, y_pred=pred_labels)
        recall = recall_score(y_true=labels, y_pred=pred_labels)
        precision = precision_score(y_true=labels, y_pred=pred_labels)
        f1 = f1_score(y_true=labels, y_pred=pred_labels)
        roc_auc = roc_auc_score(y_true=labels, y_score=pred_scores)

        return {"accuracy": accuracy, "roc_auc": roc_auc, "precision": precision, "recall": recall, "f1": f1}
    except Exception as x:
        print(f"compute_metrics_using_sklearn failed with exception: {x}")
        return {"accuracy": 0, "roc_auc": 0, "precision": 0, "recall": 0, "f1": 0}
def start():
    """Fine-tune the pretrained classifier, evaluate it, save it and push it.

    Steps: build the tokenizer and model from ``PRETRAINED_MODEL_NAME``,
    create the streaming datasets for ``WINDOW``, train with the Hugging Face
    ``Trainer``, evaluate on the test split, save the model locally and push
    it to the Hub. Training and evaluation errors are printed, not raised.
    """
    # NOTE(review): several lines of this function were missing from the
    # source text (closing brackets, `try:` lines, the Trainer arguments and
    # the push call). Reconstructed parts are marked below -- please confirm.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # NOTE(review): assumed call -- the login helper is otherwise never used,
    # and pushing to the Hub below needs credentials.
    login_inside_huggingface_virtualmachine()

    WINDOW = 4000
    model_local_directory = f"my-awesome-model-{WINDOW}"
    model_remote_repository = f"fahimfarhan/dnabert-6-mqtl-classifier-{WINDOW}"
    is_my_laptop = os.path.isfile("/home/soumic/Codes/mqtl-classification/src/inputdata/dataset_4000_train_binned.csv")

    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, trust_remote_code=True)
    classifier_model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=2)

    args = {
        "output_dir": "output_dnabert-6-mqtl_classification",
        "num_train_epochs": 1,
        "max_steps": 20_000,  # train 36k + val 4k = 40k
        # Set the number of steps you expect to train, originally 1000, takes too much time. So I set it to 10 to run faster and check my code/pipeline
        "run_name": "laptop_run_dna-bert-6-mqtl_classification",  # Override run_name here
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 32,
        "gradient_checkpointing": True,
        "learning_rate": 1e-3,
        "save_safetensors": False,  # I added it. this solves the runtime error!
        # not sure if it is a good idea. sklearn may slow down training, causing time loss... if so, disable these 2 lines below
        "evaluation_strategy": "epoch",  # To calculate metrics per epoch
        "logging_strategy": "epoch",  # Extra: to log training data stats for loss
    }
    training_args = TrainingArguments(**args)

    # Example code (kept for reference, not executed):
    #   max_length = 32_000
    #   sequence = 'ACTG' * int(max_length / 4)
    #   # sequence = 'ACTG' * int(1000)  # seq_len = 4000 it works!
    #   sequence = [sequence] * 8  # Create 8 identical samples
    #   tokenized = tokenizer(sequence)["input_ids"]
    #   labels = [0, 1] * 4
    #   # Create a dataset for training
    #   run_the_code_ds = Dataset.from_dict({"input_ids": tokenized, "labels": labels})

    train_ds, val_ds, test_ds = create_paging_train_val_test_datasets(tokenizer, WINDOW=WINDOW, is_debug=False)
    # train_ds, val_ds, test_ds = run_the_code_ds, run_the_code_ds, run_the_code_ds
    # train_ds.set_format("pt")  # doesn't work!

    # NOTE(review): only `compute_metrics` was visible in the source text; the
    # other Trainer arguments are reconstructed from the surrounding names.
    trainer = Trainer(
        model=classifier_model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics_using_sklearn,  # torch_metrics.compute_metrics
    )

    # train, and validate
    # NOTE(review): a training failure is only printed, so the steps below
    # still run and may save/push an untrained model.
    try:
        result = trainer.train()
        print(f"{result = }")
    except Exception as x:
        print(f"{x = }")

    # testing
    try:
        # with torch.no_grad():  # didn't work :/
        test_results = trainer.evaluate(eval_dataset=test_ds)
        print(f"{test_results = }")
    except Exception as oome:
        print(f"{oome = }")

    # save the model
    classifier_model.save_pretrained(save_directory=model_local_directory, safe_serialization=False)

    # push to the hub
    commit_message = f":tada: Push model for window size {WINDOW} from huggingface space"
    if is_my_laptop:
        commit_message = f":tada: Push model for window size {WINDOW} from zephyrus"

    # NOTE(review): the call itself was missing from the source text; only the
    # `commit_message=` argument was visible. safe_serialization=False mirrors
    # the save settings used above -- confirm it is wanted here too.
    classifier_model.push_to_hub(
        repo_id=model_remote_repository,
        # subfolder=f"my-awesome-model-{WINDOW}", subfolder didn't work :/
        commit_message=commit_message,  # f":tada: Push model for window size {WINDOW}"
        safe_serialization=False,
    )
def interprete_demo():
    """Load the pushed classifier from the Hub as a starting point for interpretation.

    Only the model download is implemented; any error is logged, not raised.
    """
    WINDOW = 4000
    model_remote_repository = f"fahimfarhan/dnabert-6-mqtl-classifier-{WINDOW}"

    # NOTE(review): the `try:` line and the body of the `except` were missing
    # from the source text; the logging call below is a reconstruction.
    try:
        classifier_model = AutoModel.from_pretrained(model_remote_repository)
        # todo: use captum / gentech-grelu to interpret the model
    except Exception as x:
        timber.error(f"interprete_demo failed with exception: {x}")
if __name__ == '__main__':