"""PyTorch ``Dataset`` wrappers for pretraining / candidate pair data.

Every dataset in this module yields 3-tuples ``(item_a, item_b, score)``:
gene-disease (GDA), protein-protein (PPI) or disease-disease (DDA) pairs.
The ``*_Pretrain_Dataset`` classes load their pairs from disk (or from TDC)
and assign the constant score ``1`` to every pair they expose.
"""
import logging
import os
import sys

import numpy as np

sys.path.append("../")

# from tdc.multi_pred import GDA
# NOTE: the import above is disabled at module level; ``GDA`` is imported
# lazily inside ``TDC_Pretrain_Dataset.__init__`` so that the rest of this
# module stays importable without the optional ``tdc`` package.
import pandas as pd
from torch.utils.data import Dataset

LOGGER = logging.getLogger(__name__)

# Upper bounds on the number of rows kept when a loader runs with ``test=True``.
_GDA_TEST_SUBSET_SIZE = 128
_TEST_SAMPLE_SIZE = 100


class GDA_Dataset(Dataset):
    """Candidate dataset for gene-to-disease interactions held in memory.

    Args:
        data_examples: Indexable whose items 0, 1 and 2 are, respectively, the
            protein sequences, the disease descriptions and the scores.
            The dataset length is the length of item 0.
    """

    def __init__(self, data_examples):
        self.protein_seqs = data_examples[0]
        self.disease_dess = data_examples[1]
        self.scores = data_examples[2]

    def __getitem__(self, query_idx):
        """Return ``(protein_seq, disease_des, score)`` at ``query_idx``."""
        return (
            self.protein_seqs[query_idx],
            self.disease_dess[query_idx],
            self.scores[query_idx],
        )

    def __len__(self) -> int:
        return len(self.protein_seqs)


class TDC_Pretrain_Dataset(Dataset):
    """Gene-disease associations from the TDC ``DisGeNET`` data set.

    Only the ``train`` part of the TDC split is exposed, and every exposed
    pair gets the constant score ``1`` (the ``Y`` column is used only to drop
    incomplete rows and to size the score list).

    Args:
        data_dir: Only used as a prefix of the printed status message; the
            data itself is obtained through ``GDA(name="DisGeNET")``.
        test: Accepted for signature parity with the other loaders; unused.

    Raises:
        ImportError: If ``tdc.multi_pred.GDA`` cannot be imported.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False):
        LOGGER.info("Initializing TDC Pretraining Dataset ! ...")
        # The module-level import of GDA is commented out, so without this
        # local import the constructor would fail with a NameError.
        try:
            from tdc.multi_pred import GDA
        except ImportError as err:
            raise ImportError(
                "TDC_Pretrain_Dataset requires `tdc.multi_pred.GDA` "
                "(the `tdc` / PyTDC package)."
            ) from err

        data = GDA(name="DisGeNET")  # , path=data_dir
        data.neg_sample(frac=1)
        data.binarize(threshold=0, order="ascending")
        self.datasets = data.get_split()
        self.name = "DisGeNET"
        self.dataset_df = self.datasets["train"]
        # self.dataset_df = pd.read_csv(f"{data_dir}/disgenet_gda.csv")
        # Drop rows with missing values in any of the three used columns.
        self.dataset_df = self.dataset_df[["Gene", "Disease", "Y"]].dropna()
        # print(self.dataset_df.head())
        print(
            f"{data_dir}TDC training dataset loaded, found associations: {len(self.dataset_df.index)}"
        )
        self.protein_seqs = self.dataset_df["Gene"].values
        self.disease_dess = self.dataset_df["Disease"].values
        self.scores = len(self.dataset_df["Y"].values) * [1]

    def __getitem__(self, query_idx):
        """Return ``(protein_seq, disease_des, score)`` at ``query_idx``."""
        return (
            self.protein_seqs[query_idx],
            self.disease_dess[query_idx],
            self.scores[query_idx],
        )

    def __len__(self) -> int:
        return len(self.protein_seqs)


class GDA_Pretrain_Dataset(Dataset):
    """Gene-disease associations loaded from ``<data_dir>/disgenet_gda.csv``.

    The CSV must provide the columns ``proteinSeq``, ``diseaseDes`` and
    ``score``. Rows with missing values are dropped, the remaining rows are
    shuffled with a fixed seed (``random_state=42``) and then divided into a
    leading train part and a trailing validation part. Every exposed pair
    gets the constant score ``1``.

    Args:
        data_dir: Directory containing ``disgenet_gda.csv`` (with or without
            a trailing path separator).
        test: If true, keep at most the first 128 rows of the selected split.
        split: ``"train"`` or ``"val"``. Any other value keeps the complete
            shuffled data set (no split is applied and nothing is printed).
        val_ratio: Fraction of the rows assigned to the validation split; the
            resulting row count is clamped to ``[0, number of rows]``.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False, split="train", val_ratio=0.2):
        LOGGER.info("Initializing GDA Pretraining Dataset ! ...")
        frame = pd.read_csv(os.path.join(data_dir, "disgenet_gda.csv"))
        frame = frame[["proteinSeq", "diseaseDes", "score"]].dropna()
        frame = frame.sample(frac=1, random_state=42).reset_index(drop=True)

        # Split on an explicit boundary index. Negative-offset slicing
        # (``[:-k]`` / ``[-k:]``) is wrong for k == 0: it would make the train
        # split empty and the validation split the entire data set.
        total = len(frame)
        num_val_samples = min(max(int(total * val_ratio), 0), total)
        num_train_samples = total - num_val_samples

        if split == "train":
            frame = frame.iloc[:num_train_samples]
            print(f"{data_dir}disgenet_gda.csv loaded, found train associations: {len(frame.index)}")
        elif split == "val":
            frame = frame.iloc[num_train_samples:]
            print(f"{data_dir}disgenet_gda.csv loaded, found valid associations: {len(frame.index)}")
        self.dataset_df = frame

        protein_seqs = frame["proteinSeq"].values
        disease_dess = frame["diseaseDes"].values
        if test:
            protein_seqs = protein_seqs[:_GDA_TEST_SUBSET_SIZE]
            disease_dess = disease_dess[:_GDA_TEST_SUBSET_SIZE]
        self.protein_seqs = protein_seqs
        self.disease_dess = disease_dess
        # One score per exposed pair, also when fewer than 128 rows exist.
        self.scores = len(self.protein_seqs) * [1]

    def __getitem__(self, query_idx):
        """Return ``(protein_seq, disease_des, score)`` at ``query_idx``."""
        return (
            self.protein_seqs[query_idx],
            self.disease_dess[query_idx],
            self.scores[query_idx],
        )

    def __len__(self) -> int:
        return len(self.protein_seqs)


# --- Disabled code kept for reference -------------------------------------
# Alternative to the split in GDA_Pretrain_Dataset.__init__ that splits the
# positive and the negative samples separately:
#
#         # Separate positive and negative samples
#         positive_samples = self.dataset_df[self.dataset_df["score"] == 1]
#         negative_samples = self.dataset_df[self.dataset_df["score"] == 0]
#         # Shuffle and split the positive samples
#         positive_samples = positive_samples.sample(frac=1, random_state=42).reset_index(drop=True)
#         num_pos_val_samples = int(len(positive_samples) * val_ratio)
#         # Shuffle and split the negative samples
#         negative_samples = negative_samples.sample(frac=1, random_state=42).reset_index(drop=True)
#         num_neg_val_samples = int(len(negative_samples) * val_ratio)
#         if split == "train":
#             self.dataset_df = pd.concat([positive_samples[:-num_pos_val_samples], negative_samples[:-num_neg_val_samples]])
#             print(f"{data_dir}disgenet_gda.csv loaded, found associations: {len(self.dataset_df.index)}")
#         elif split == "val":
#             self.dataset_df = pd.concat([positive_samples[-num_pos_val_samples:], negative_samples[-num_neg_val_samples:]])
#             print(f"{data_dir}disgenet_gda.csv loaded, found associations: {len(self.dataset_df.index)}")
#         # Shuffle and split data
#
# Earlier variant of GDA_Pretrain_Dataset (includes one-off data checks):
#
# class GDA_Pretrain_Dataset(Dataset):
#     """
#     Candidate Dataset for:
#         ALL gene-disease associations
#     """
#     def __init__(self, data_dir="../../data/pretrain/", test=False):
#         LOGGER.info("Initializing GDA Pretraining Dataset ! ...")
#         updated = pd.read_csv(f"{data_dir}/disgenet_updated.csv")
#         data = GDA(name="DisGeNET")
#         data = data.get_data()
#         data = data[['Gene_ID','Disease_ID']].dropna()
#         self.dataset_df = pd.read_csv(f"{data_dir}/disgenet_gda.csv")
#         num_unique_diseaseId = self.dataset_df['diseaseId'].nunique()
#         num_unique_geneId = self.dataset_df['geneId'].nunique()
#         print(f"Number of unique 'diseaseId': {num_unique_diseaseId}")
#         print(f"Number of unique 'geneId': {num_unique_geneId}")
#         num_of_c0002395 = self.dataset_df[self.dataset_df['diseaseId'] == 'C0002395'].shape[0]
#         print(f"Alzheimer Number in 2020:{num_of_c0002395}")
#         # Convert 'Gene_ID' and 'Disease_ID' to str before merge
#         data['Gene_ID'] = data['Gene_ID'].astype(str)
#         data['Disease_ID'] = data['Disease_ID'].astype(str)
#         # Similarly for 'geneId' and 'diseaseId', if they're not already of type 'str'
#         self.dataset_df['geneId'] = self.dataset_df['geneId'].astype(str)
#         self.dataset_df['diseaseId'] = self.dataset_df['diseaseId'].astype(str)
#         # Merge the two DataFrames and find the rows that differ
#         merged = df.merge(self.dataset_df, how='outer', indicator=True)
#         differences = merged[merged['_merge'] != 'both']
#         differences.to_csv('/nfs/dpa_pretrain/data/pretrain/differences.csv', index=False)
#         # Check for overlap between TDC dataset and DisGeNET dataset
#         merged_df = pd.merge(data, self.dataset_df, how='inner', left_on=['Gene_ID','Disease_ID'], right_on=['geneId','diseaseId'])
#         num_matched_pairs = merged_df.shape[0]
#         print(f"Number of matched pairs TDC: {num_matched_pairs}")
#         merged_dis = pd.merge(data, updated, how='inner', left_on=['Gene','Disease'], right_on=['proteinSeq','diseaseDes'])
#         num_matched = merged_dis.shape[0]
#         print(f"Number of matched pairs DisGeNET_test: {num_matched}")
#         self.dataset_df = self.dataset_df[
#             ["proteinSeq", "diseaseDes", "score"]
#         ].dropna()  # Drop missing values.
#         # print(self.dataset_df.head()) "proteinSeq", "diseaseDes", "score"
#         print(
#             f"{data_dir}disgenet_gda.csv loaded, found associations: {len(self.dataset_df.index)}"
#         )
#         df1 = pd.read_csv(f"{data_dir}/disgenet_gda.csv")
#         df1 = df1[
#             ["proteinSeq", "diseaseDes", "score"]
#         ].dropna()
#         # Merge the two DataFrames and find the rows that differ
#         merged = df1.merge(self.dataset_df, how='outer', indicator=True)
#         differences = merged[merged['_merge'] != 'both']
#         # Save the result to a new file
#         differences.to_csv('/nfs/dpa_pretrain/data/pretrain/differences.csv', index=False)
#         if test:
#             self.protein_seqs = self.dataset_df["proteinSeq"].values[:128]
#             self.disease_dess = self.dataset_df["diseaseDes"].values[:128]
#             self.scores = 128 * [1]
#         else:
#             self.protein_seqs = self.dataset_df["proteinSeq"].values
#             self.disease_dess = self.dataset_df["diseaseDes"].values
#             self.scores = len(self.dataset_df["score"].values) * [1]
#
#     def __getitem__(self, query_idx):
#         protein_seq = self.protein_seqs[query_idx]
#         disease_des = self.disease_dess[query_idx]
#         score = self.scores[query_idx]
#         return protein_seq, disease_des, score
#
#     def __len__(self):
#         return len(self.protein_seqs)
# ---------------------------------------------------------------------------


class PPI_Pretrain_Dataset(Dataset):
    """Protein-protein interactions from ``<data_dir>/string_ppi_900_2m.csv``.

    The CSV must provide the columns ``item_seq_a``, ``item_seq_b`` and
    ``score``. Rows with missing values are dropped and every exposed pair
    gets the constant score ``1``.

    Args:
        data_dir: Directory containing ``string_ppi_900_2m.csv``.
        test: If true, keep a random sample of at most 100 rows.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False):
        LOGGER.info("Initializing metric learning data set! ...")
        frame = pd.read_csv(os.path.join(data_dir, "string_ppi_900_2m.csv"))
        frame = frame[["item_seq_a", "item_seq_b", "score"]]
        frame = frame.dropna()
        if test:
            # Cap the sample size: sampling more rows than exist raises.
            frame = frame.sample(min(_TEST_SAMPLE_SIZE, len(frame)))
        self.dataset_df = frame
        print(
            f"{data_dir}/string_ppi_900_2m.csv loaded, found interactions: {len(self.dataset_df.index)}"
        )
        self.protein_seq1 = frame["item_seq_a"].values
        self.protein_seq2 = frame["item_seq_b"].values
        self.scores = len(frame["score"].values) * [1]

    def __getitem__(self, query_idx):
        """Return ``(protein_seq1, protein_seq2, score)`` at ``query_idx``."""
        return (
            self.protein_seq1[query_idx],
            self.protein_seq2[query_idx],
            self.scores[query_idx],
        )

    def __len__(self) -> int:
        return len(self.protein_seq1)


class PPI_Dataset(Dataset):
    """Candidate dataset for protein-to-protein interactions held in memory.

    Args:
        protein_seq1: Indexable of first protein sequences (defines the length).
        protein_seq2: Indexable of second protein sequences.
        score: Indexable of per-pair scores.
    """

    def __init__(self, protein_seq1, protein_seq2, score):
        self.protein_seq1 = protein_seq1
        self.protein_seq2 = protein_seq2
        self.scores = score

    def __getitem__(self, query_idx):
        """Return ``(protein_seq1, protein_seq2, score)`` at ``query_idx``."""
        return (
            self.protein_seq1[query_idx],
            self.protein_seq2[query_idx],
            self.scores[query_idx],
        )

    def __len__(self) -> int:
        return len(self.protein_seq1)


class DDA_Dataset(Dataset):
    """Candidate dataset for disease-to-disease associations held in memory.

    Args:
        diseaseDes1: Indexable of first disease descriptions (defines the length).
        diseaseDes2: Indexable of second disease descriptions.
        label: Indexable of per-pair labels.
    """

    def __init__(self, diseaseDes1, diseaseDes2, label):
        self.diseaseDes1 = diseaseDes1
        self.diseaseDes2 = diseaseDes2
        self.label = label

    def __getitem__(self, query_idx):
        """Return ``(diseaseDes1, diseaseDes2, label)`` at ``query_idx``."""
        return (
            self.diseaseDes1[query_idx],
            self.diseaseDes2[query_idx],
            self.label[query_idx],
        )

    def __len__(self) -> int:
        return len(self.diseaseDes1)


class DDA_Pretrain_Dataset(Dataset):
    """Disease-disease associations loaded from ``<data_dir>/disgenet_dda.csv``.

    The CSV must provide the columns ``diseaseDes1``, ``diseaseDes2`` and
    ``jaccard_variant``. Rows with a missing value in any column of the file
    are dropped and every exposed pair gets the constant score ``1``.

    Args:
        data_dir: Directory containing ``disgenet_dda.csv`` (with or without
            a trailing path separator).
        test: If true, keep a random sample of at most 100 rows.
    """

    def __init__(self, data_dir="../../data/pretrain/", test=False):
        LOGGER.info("Initializing metric learning data set! ...")
        frame = pd.read_csv(os.path.join(data_dir, "disgenet_dda.csv"))
        frame = frame.dropna()  # Drop missing values.
        if test:
            # Cap the sample size: sampling more rows than exist raises.
            frame = frame.sample(min(_TEST_SAMPLE_SIZE, len(frame)))
        self.dataset_df = frame
        print(
            f"{data_dir}disgenet_dda.csv loaded, found associations: {len(self.dataset_df.index)}"
        )
        self.disease_des1 = frame["diseaseDes1"].values
        self.disease_des2 = frame["diseaseDes2"].values
        self.scores = len(frame["jaccard_variant"].values) * [1]

    def __getitem__(self, query_idx):
        """Return ``(disease_des1, disease_des2, score)`` at ``query_idx``."""
        return (
            self.disease_des1[query_idx],
            self.disease_des2[query_idx],
            self.scores[query_idx],
        )

    def __len__(self) -> int:
        return len(self.disease_des1)