# CELL-E_2-Image_Prediction / dataloader.py
# EmaadKhwaja
# update dataloader
# 0677b2b
import os
import numpy as np
from PIL import Image, ImageSequence
import json
import pandas as pd
import torch
from torch.utils.data import Dataset
from torchvision import transforms
import torchvision.transforms.functional as TF
from celle.utils import replace_outliers
# Fixed symbol table: the gap/pad symbol "-" followed by 26 residue letters.
# A symbol's position in this string is its integer token id.
_SIMPLE_ALPHABET = "-MRHKDESTNQCUGPAVIFYWLOXZBJ"
_SIMPLE_INDEX = {symbol: token for token, symbol in enumerate(_SIMPLE_ALPHABET)}


def simple_conversion(seq):
    """Encode a sequence as integer token ids over a fixed 27-symbol alphabet.

    The alphabet is ``"-"`` (id 0) followed by 26 upper-case letters; each
    symbol is mapped to its position in that table.

    Args:
        seq: Iterable of single-character symbols, typically a ``str``.

    Returns:
        A ``torch.long`` tensor of shape ``(1, len(seq))``.

    Raises:
        ValueError: If ``seq`` contains a symbol that is not in the alphabet.
    """
    try:
        token_ids = [_SIMPLE_INDEX[symbol] for symbol in seq]
    except KeyError as err:
        # Keep ValueError (what ``list.index`` raised) so callers are unaffected.
        raise ValueError(
            f"Unknown sequence character {err.args[0]!r}; "
            f"expected one of {_SIMPLE_ALPHABET!r}"
        ) from None
    # Build the tensor from plain ints: avoids the slow list-of-ndarray path.
    return torch.tensor([token_ids], dtype=torch.long)
class CellLoader(Dataset):
    """Dataset of nucleus/target image pairs with tokenized protein sequences.

    Samples are described by the rows of a CSV file. Columns read by this
    class are ``split`` (when ``split_key`` is "train" or "val"),
    ``protein_sequence`` or ``metadata_path`` (for the sequence), and either
    ``nucleus_image_path``/``target_image_path`` (dataset "HPA") or
    ``image_path`` (dataset "OpenCell"). Image and metadata paths are joined
    onto the directory that contains the CSV.
    """

    # Sequence modes that go through the ESM batch converter.
    _TOKENIZER_MODES = ("embedding", "onehot")
    # Vocabularies for which a tokenizer is built in ``__init__``.
    _ESM_VOCABS = ("esm1b", "esm2")

    def __init__(
        self,
        data_csv=None,
        dataset=None,
        split_key=None,
        resize=600,
        crop_size=600,
        crop_method="random",
        sequence_mode="simple",
        vocab="bert",
        threshold="median",
        text_seq_len=0,
        pad_mode="random",
    ):
        """Store the configuration and, if given, load the CSV index.

        Args:
            data_csv: Path to the CSV index. When falsy no data is loaded, so
                only methods that do not touch ``self.data`` are usable.
            dataset: "HPA" or "OpenCell"; selects how images are read.
            split_key: "train" or "val" keeps only rows whose ``split`` column
                matches; any other value keeps every row.
            resize: Size passed to ``transforms.Resize``.
            crop_size: Size passed to the random/center crop.
            crop_method: "random" (random crop plus random flips) or "center";
                any other value applies no crop.
            sequence_mode: "simple", "center", "alternating", "embedding" or
                "onehot".
            vocab: "esm1b" or "esm2" enable the ESM tokenizer for the
                "embedding"/"onehot" modes.
            threshold: "mean", "median" or "1090_IQR"; any other value returns
                the target image itself as the threshold image.
            text_seq_len: Target sequence length; sequences are padded or
                cropped to it.
            pad_mode: "end" or "random"; any other value disables padding.
        """
        self.data_csv = data_csv
        self.dataset = dataset
        self.image_folders = []
        self.crop_method = crop_method
        self.resize = resize
        self.crop_size = crop_size
        self.sequence_mode = sequence_mode
        self.threshold = threshold
        self.text_seq_len = int(text_seq_len)
        self.vocab = vocab
        self.pad_mode = pad_mode

        if (
            self.sequence_mode in self._TOKENIZER_MODES
            and self.vocab in self._ESM_VOCABS
        ):
            # Imported lazily so ``esm`` is only required for these modes.
            from esm import Alphabet

            self.tokenizer = Alphabet.from_architecture(
                "ESM-1b"
            ).get_batch_converter()
            # NOTE(review): presumably makes room for the two special tokens
            # the ESM batch converter adds around the sequence -- confirm.
            self.text_seq_len += 2

        if data_csv:
            data = pd.read_csv(data_csv)
            # Paths stored in the CSV are relative to the CSV's directory.
            self.parent_path = os.path.dirname(data_csv)
            if split_key in ("train", "val"):
                data = data[data["split"] == split_key]
            self.data = data.reset_index(drop=True)

    def __len__(self):
        """Return the number of rows in the loaded index."""
        return len(self.data)

    def __getitem__(
        self,
        idx,
        get_sequence=True,
        get_images=True,
    ):
        """Return one sample as a dict of tensors.

        Args:
            idx: Row position in the index.
            get_sequence: When false (or when ``text_seq_len`` is 0) the
                sequence is replaced by a ``(1, 1)`` zero tensor.
            get_images: When false each image is replaced by a 1-element zero
                tensor.

        Returns:
            Dict with float tensors ``nucleus``, ``target``, ``threshold`` and
            a long tensor ``sequence``.
        """
        if get_sequence and self.text_seq_len > 0:
            protein_vector = self.get_protein_vector(idx)
        else:
            protein_vector = torch.zeros((1, 1))

        if get_images:
            nucleus, target, threshold = self.get_images(idx, self.dataset)
        else:
            # Unpacking the first axis yields three placeholders of shape (1,).
            nucleus, target, threshold = torch.zeros((3, 1))

        return {
            "nucleus": nucleus.float(),
            "target": target.float(),
            "threshold": threshold.float(),
            "sequence": protein_vector.long(),
        }

    def get_protein_vector(self, idx):
        """Look up the protein sequence for row ``idx`` and tokenize it.

        The sequence comes from the ``protein_sequence`` column when present,
        otherwise from the ``"sequence"`` entry of the row's metadata JSON.
        """
        if "protein_sequence" in self.data.columns:
            protein_sequence = self.data.iloc[idx]["protein_sequence"]
        else:
            protein_sequence = self.retrieve_metadata(idx)["sequence"]
        return self.tokenize_sequence(protein_sequence)

    def get_images(self, idx, dataset):
        """Load, augment and normalize the image pair for row ``idx``.

        Args:
            idx: Row position in the index.
            dataset: "HPA" (separate nucleus/target files) or "OpenCell" (one
                two-page image file).

        Returns:
            Tuple ``(nucleus, target, threshold)``, each with a leading
            singleton dimension added; see ``__init__`` for threshold modes.

        Raises:
            ValueError: If ``dataset`` is not "HPA" or "OpenCell".
        """
        if dataset not in ("HPA", "OpenCell"):
            raise ValueError(
                f"Unsupported dataset {dataset!r}; expected 'HPA' or 'OpenCell'"
            )

        row = self.data.iloc[idx]

        if dataset == "HPA":
            # Context managers close the file handles once pixels are read.
            with Image.open(
                os.path.join(self.parent_path, row["nucleus_image_path"])
            ) as nucleus_img:
                nucleus = TF.to_tensor(nucleus_img)[0]
            with Image.open(
                os.path.join(self.parent_path, row["target_image_path"])
            ) as target_img:
                target = TF.to_tensor(target_img)[0]
            # (per-channel means, per-channel stds) for transforms.Normalize.
            normalize = (0.0655, 0.0650), (0.1732, 0.1208)
        else:
            with Image.open(
                os.path.join(self.parent_path, row["image_path"])
            ) as stack:
                # Copy the pages so they outlive the file; exactly two pages
                # are expected, otherwise the unpacking raises ValueError.
                nucleus_page, target_page = [
                    page.copy() for page in ImageSequence.Iterator(stack)
                ]
            nucleus = replace_outliers(
                torch.divide(TF.to_tensor(nucleus_page), 65536)
            )[0]
            target = replace_outliers(
                torch.divide(TF.to_tensor(target_page), 65536)
            )[0]
            normalize = (0.0272, 0.0244), (0.0486, 0.0671)

        # Stack both channels so the random crop/flips hit them identically.
        # from https://discuss.pytorch.org/t/how-to-apply-same-transform-on-a-pair-of-picture/14914
        image = torch.stack([nucleus, target], dim=0)

        t_forms = [transforms.Resize(self.resize, antialias=None)]
        if self.crop_method == "random":
            t_forms.append(transforms.RandomCrop(self.crop_size))
            t_forms.append(transforms.RandomHorizontalFlip(p=0.5))
            t_forms.append(transforms.RandomVerticalFlip(p=0.5))
        elif self.crop_method == "center":
            t_forms.append(transforms.CenterCrop(self.crop_size))
        t_forms.append(transforms.Normalize(normalize[0], normalize[1]))

        image = transforms.Compose(t_forms)(image)
        nucleus, target = image

        # Nucleus is scaled by its largest magnitude; target is min-max scaled.
        nucleus = nucleus / torch.abs(nucleus).max()
        target = target - target.min()
        target = target / target.max()

        nucleus = nucleus.unsqueeze(0)
        target = target.unsqueeze(0)

        threshold = target
        if self.threshold == "mean":
            threshold = 1.0 * (threshold > torch.mean(threshold))
        elif self.threshold == "median":
            threshold = 1.0 * (threshold > torch.median(threshold))
        elif self.threshold == "1090_IQR":
            p10 = torch.quantile(threshold, 0.1, None)
            p90 = torch.quantile(threshold, 0.9, None)
            threshold = torch.clip(threshold, p10, p90)

        # Constant images divide by zero above; map NaN/-inf to 0, +inf to 1.
        nucleus = torch.nan_to_num(nucleus, nan=0.0, posinf=1.0, neginf=0.0)
        target = torch.nan_to_num(target, nan=0.0, posinf=1.0, neginf=0.0)
        threshold = torch.nan_to_num(threshold, nan=0.0, posinf=1.0, neginf=0.0)

        return nucleus, target, threshold

    def retrieve_metadata(self, idx):
        """Load and return the JSON metadata referenced by row ``idx``."""
        metadata_path = os.path.join(
            self.parent_path, self.data.iloc[idx]["metadata_path"]
        )
        with open(metadata_path) as f:
            return json.load(f)

    def tokenize_sequence(self, protein_sequence):
        """Convert a protein sequence string to a fixed-length id tensor.

        The sequence is encoded according to ``self.sequence_mode``. If the
        result is shorter than ``self.text_seq_len`` it is padded according to
        ``self.pad_mode``; if longer, a random contiguous window of
        ``self.text_seq_len`` ids is kept.

        Args:
            protein_sequence: Sequence as a string.

        Returns:
            A ``torch.long`` tensor whose last dimension is ``text_seq_len``
            (unless padding is disabled by an unrecognized ``pad_mode``).

        Raises:
            ValueError: For an unsupported ``sequence_mode``, or a tokenizer
                mode combined with a vocabulary other than "esm1b"/"esm2".
        """
        pad_token = 0
        mode = self.sequence_mode

        if mode == "simple":
            protein_vector = simple_conversion(protein_sequence)
        elif mode == "center":
            protein_vector = simple_conversion(
                protein_sequence.center(self.text_seq_len, "-")
            )
        elif mode == "alternating":
            # Center-pad, keep every 18th symbol, then center-pad the
            # shortened string to its expected length.
            strided = protein_sequence.center(self.text_seq_len, "-")[::18]
            strided = strided.center(int(self.text_seq_len / 18) + 1, "-")
            protein_vector = simple_conversion(strided)
        elif mode in self._TOKENIZER_MODES:
            # "onehot" shares this path because ``__init__`` builds the same
            # tokenizer for both modes.
            if self.vocab not in self._ESM_VOCABS:
                raise ValueError(
                    f"sequence_mode {mode!r} requires vocab 'esm1b' or "
                    f"'esm2', got {self.vocab!r}"
                )
            pad_token = 1
            # The batch converter returns a tuple whose last item is the ids.
            protein_vector = self.tokenizer([("", protein_sequence)])[-1]
        else:
            raise ValueError(f"Unsupported sequence_mode {mode!r}")

        current_len = protein_vector.shape[-1]

        if current_len < self.text_seq_len:
            diff = self.text_seq_len - current_len
            if self.pad_mode == "end":
                protein_vector = torch.nn.functional.pad(
                    protein_vector, (0, diff), "constant", pad_token
                )
            elif self.pad_mode == "random":
                # Split the padding randomly between the two ends.
                left = diff - np.random.randint(0, diff + 1)
                # NOTE(review): the left side is filled with 0, not pad_token
                # (they differ for ESM vocabularies). Kept as in the original
                # because trained models may depend on it -- confirm intent.
                protein_vector = torch.nn.functional.pad(
                    protein_vector, (left, 0), "constant", 0
                )
                protein_vector = torch.nn.functional.pad(
                    protein_vector, (0, diff - left), "constant", pad_token
                )
        elif current_len > self.text_seq_len:
            # randint's upper bound is exclusive; +1 lets the final window
            # (ending on the last token) be sampled too.
            start = np.random.randint(0, current_len - self.text_seq_len + 1)
            protein_vector = protein_vector[:, start : start + self.text_seq_len]

        return protein_vector.long()