# Cell classifier

def finetune_cells(token_set = Path('geneformer/token_dictionary.pkl'), median_set = Path('geneformer/gene_median_dictionary.pkl'), pretrained_model = ".", dataset = 'Genecorpus-30M/example_input_files/cell_classification/cell_type_annotation/cell_type_train_data.dataset/', dataset_split = None, filter_cells = .005, epochs = 1, cpu_cores = os.cpu_count(), geneformer_batch_size = 12, optimizer = 'adamw', max_lr = 5e-5, num_gpus = torch.cuda.device_count(), max_input_size = 2 ** 11, lr_schedule_fn = "linear", warmup_steps = 500, freeze_layers = 0, emb_extract = False,
max_cells = 1000, emb_layer = 0, emb_filter = None, emb_dir = 'embeddings', overwrite = True, label = "cell_type", data_filter = None, forward_batch = 200, model_location = None, skip_training = False, sample_data = 1, inference = False, optimize_hyperparameters = False, output_dir = None):

'''
Primary Parameters
-------------------
dataset: path
    Path to fine-tuning/testing dataset for training

model_location: path
    Path to location of existing model to use for inference and embedding extraction
    
pretrained_model: path
    Path to pretrained GeneFormer 30M model before fine-tuning
    
inference: bool 
    Chooses whether to perform inference (which causes the function to return the list of similarities). Defaults to False

skip_training: bool 
    Chooses whether to skip training the model. Defaults to False
    
emb_extract: bool
    Choose whether to extract embeddings and calculate similarities. Defaults to False
    
optimize_hyperparameters: bool
    Choose whether to optimize model hyperparameters. Defaults to False

label: string
    The label string in the formatted dataset that contains true class labels. Defaults to "cell_type"

Customization Parameters
-------------------

dataset_split: str
    How the dataset should be partitioned (if at all), and what ID should be used for partitioning
    
data_filter: list
    (For embeddings and inference) Runs analysis on subsets of the dataset, selected by the ID defined by dataset_split
    
label: str
    What feature should be read as a classification label

emb_layer: int
    What layer embeddings should be extracted and compared from.
    
emb_filter: ['cell1', 'cell2'...]
    Allows user to narrow down range of cells that embeddings will be extracted from.
    
max_cells: int
    How many embeddings from cells should be extracted. 
    
freeze_layers: int
    Number of layers that should be permanently frozen during fine-tuning (starting from the first layer, 4 brings it up to the pretrained model).
    
sample_data: float
    What proportion of the HF dataset should be used
    
'''

# Gene Classifier

def classify_genes(gene_info = "Genecorpus-30M/example_input_files/gene_info_table.csv", genes = "Genecorpus-30M/example_input_files/gene_classification/dosage_sensitive_tfs/dosage_sens_tf_labels.csv", corpus_30M = "Genecorpus-30M/genecorpus_30M_2048.dataset/", model = '.', max_input_size = 2 ** 11, max_lr = 5e-5, freeze_layers = 4, num_gpus = 1, num_proc = os.cpu_count(), geneformer_batch_size = 9, epochs = 1, filter_dataset = 50_000, emb_extract = True, emb_layer = 0, forward_batch = 200, filter_data = None, inference = False, k_validate = True, model_location = "230917_geneformer_GeneClassifier_dosageTF_L2048_B12_LR5e-05_LSlinear_WU500_E1_Oadamw_n10000_F4/", skip_training = False, emb_dir = 'gene_emb', output_dir = None, max_cells = 1000, num_cpus = os.cpu_count()):

"""
Primary Parameters
-----------

gene_info: path
    Path to gene mappings

corpus_30M: path
    Path to 30M Gene Corpus
    
model: path
    Path to pretrained GeneFormer model
    
genes: path
    Path to csv file containing different columns of genes and the column labels
    
inference: bool
    Whether the model should be used to run inference. If False, model will train with labeled data instead. Defaults to False
    
k_validate: bool
    Whether the model should run k-fold validation or simply perform regular training/evaluate. Defaults to True
    
skip_training: bool
    Whether the model should skip the training portion. Defaults to False
    
emb_extract: bool
    Whether the model should extract embeddings for a given gene (WIP)
    

Customization Parameters
-----------

freeze_layers: int
    Freezes x number of layers from the model. Default is 4 (2 non-frozen layers)
    
filter_dataset: int
    Number of cells to filter from 30M dataset. Default is 50_000
    
emb_layer: int
    What layer embeddings are extracted from. Default is 0
    
filter_data: str, list
    Filters down embeddings to a single category. Default is None
    

"""