import json
import sys
import os
import torch
from utils.data_loader import GDA_Dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

# NOTE(review): this runs after the imports above, so it cannot influence how
# `utils.data_loader` is resolved; kept in place to preserve existing behavior.
sys.path.append("../")

# Default locations of the training / validation splits (relative to the
# current working directory).
_DEFAULT_TRAIN_CSV = 'data/downstream/GDA_Data/train.csv'
_DEFAULT_VALID_CSV = 'data/downstream/GDA_Data/valid.csv'


class DisGeNETProcessor:
    """Load the train / validation / test CSV splits and serve them as examples.

    Each split is reduced to three columns (``proteinSeq``, ``diseaseDes`` and
    a label column) and rows with a missing value in any of those columns are
    dropped.
    """

    def __init__(
        self,
        input_csv_path,
        train_csv_path=_DEFAULT_TRAIN_CSV,
        valid_csv_path=_DEFAULT_VALID_CSV,
        test_label_column="score",
    ):
        """Read the three CSV files and keep only the columns that are used.

        Args:
            input_csv_path: Path of the CSV file used as the test split.
            train_csv_path: Path of the training CSV. Defaults to the path
                that was previously hard-coded.
            valid_csv_path: Path of the validation CSV. Defaults to the path
                that was previously hard-coded.
            test_label_column: Name of the label column in the test CSV.
                Defaults to ``"score"``; a previous in-code note said the
                alzheimer and stomach datasets use ``"Y"`` instead.

        Raises:
            KeyError: If a required column is missing from one of the files
                (raised by pandas during column selection).
        """
        train_data = pd.read_csv(train_csv_path)
        valid_data = pd.read_csv(valid_csv_path)
        test_data = pd.read_csv(input_csv_path)

        self.name = "DisGeNET"
        self.test_label_column = test_label_column
        self.train_dataset_df = train_data[["proteinSeq", "diseaseDes", "score"]].dropna()
        self.val_dataset_df = valid_data[["proteinSeq", "diseaseDes", "score"]].dropna()
        self.test_dataset_df = test_data[
            ["proteinSeq", "diseaseDes", test_label_column]
        ].dropna()

    @staticmethod
    def _select_examples(frame, label_column, test, small_size):
        """Return the examples of ``frame``, optionally truncated.

        Shared implementation behind the three public ``get_*_examples``
        methods.

        Args:
            frame: DataFrame holding ``proteinSeq``, ``diseaseDes`` and
                ``label_column``.
            label_column: Name of the column holding the label values.
            test: ``False``/``0`` for the full split, ``True``/``1`` for the
                first ``small_size`` rows, or an int greater than 1 for the
                first ``test`` rows.
            small_size: Number of rows returned when ``test == 1``.

        Returns:
            A ``GDA_Dataset`` built from the three full column arrays when
            ``test`` is neither 1 nor greater than 1; otherwise a plain tuple
            ``(proteinSeq, diseaseDes, labels)`` of truncated arrays.
        """
        columns = (
            frame["proteinSeq"].values,
            frame["diseaseDes"].values,
            frame[label_column].values,
        )
        if test == 1:
            # Small subset, to reduce the running time (True == 1 as well).
            return tuple(column[:small_size] for column in columns)
        if test > 1:
            return tuple(column[:test] for column in columns)
        return GDA_Dataset(columns)

    def get_train_examples(self, test=False):
        """Get training examples.

        Args:
            test (bool | int, optional): If ``True``/``1``, return only the
                first 4096 rows. If greater than 1, return the first ``test``
                rows. Defaults to False (full split).

        Returns:
            A ``GDA_Dataset`` over the full split, or a tuple of three arrays
            ``(proteinSeq, diseaseDes, score)`` when ``test`` is set.
        """
        return self._select_examples(self.train_dataset_df, "score", test, 4096)

    def get_val_examples(self, test=False):
        """Get validation examples.

        Args:
            test (bool | int, optional): If ``True``/``1``, return only the
                first 1024 rows. If greater than 1, return the first ``test``
                rows. Defaults to False (full split).

        Returns:
            A ``GDA_Dataset`` over the full split, or a tuple of three arrays
            ``(proteinSeq, diseaseDes, score)`` when ``test`` is set.
        """
        return self._select_examples(self.val_dataset_df, "score", test, 1024)

    def get_test_examples(self, test=False):
        """Get test examples.

        Args:
            test (bool | int, optional): If ``True``/``1``, return only the
                first 1024 rows. If greater than 1, return the first ``test``
                rows. Defaults to False (full split).

        Returns:
            A ``GDA_Dataset`` over the full split, or a tuple of three arrays
            ``(proteinSeq, diseaseDes, labels)`` when ``test`` is set. The
            labels come from the column chosen at construction time
            (``"score"`` by default).
        """
        return self._select_examples(
            self.test_dataset_df, self.test_label_column, test, 1024
        )