geneformer/__init__.py CHANGED
@@ -1,10 +1,4 @@
1
  # ruff: noqa: F401
2
- from pathlib import Path
3
-
4
- GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
5
- TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary.pkl"
6
- ENSEMBL_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
7
-
8
  from . import (
9
  collator_for_classification,
10
  emb_extractor,
@@ -17,11 +11,11 @@ from .collator_for_classification import (
17
  DataCollatorForCellClassification,
18
  DataCollatorForGeneClassification,
19
  )
20
- from .emb_extractor import EmbExtractor, get_embs
21
  from .in_silico_perturber import InSilicoPerturber
22
  from .in_silico_perturber_stats import InSilicoPerturberStats
23
  from .pretrainer import GeneformerPretrainer
24
  from .tokenizer import TranscriptomeTokenizer
25
 
26
  from . import classifier # noqa # isort:skip
27
- from .classifier import Classifier # noqa # isort:skip
 
1
  # ruff: noqa: F401
2
  from . import (
3
  collator_for_classification,
4
  emb_extractor,
 
11
  DataCollatorForCellClassification,
12
  DataCollatorForGeneClassification,
13
  )
14
+ from .emb_extractor import EmbExtractor
15
  from .in_silico_perturber import InSilicoPerturber
16
  from .in_silico_perturber_stats import InSilicoPerturberStats
17
  from .pretrainer import GeneformerPretrainer
18
  from .tokenizer import TranscriptomeTokenizer
19
 
20
  from . import classifier # noqa # isort:skip
21
+ from .classifier import Classifier # noqa # isort:skip
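For reference, a minimal sketch of how downstream code loads the token dictionary once the path constants are gone from the package root (the later hunks in this change follow the same pattern); this assumes the pickled dictionary still ships next to tokenizer.py:

    import pickle

    from geneformer.tokenizer import TOKEN_DICTIONARY_FILE

    # Ensembl gene ID -> integer token ID
    with open(TOKEN_DICTIONARY_FILE, "rb") as f:
        gene_token_dict = pickle.load(f)
    # reverse lookup: token ID -> Ensembl gene ID
    token_gene_dict = {v: k for k, v in gene_token_dict.items()}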
geneformer/classifier.py CHANGED
@@ -53,6 +53,7 @@ from pathlib import Path
53
  import numpy as np
54
  import pandas as pd
55
  import seaborn as sns
 
56
  from tqdm.auto import tqdm, trange
57
  from transformers import Trainer
58
  from transformers.training_args import TrainingArguments
@@ -61,7 +62,7 @@ from . import DataCollatorForCellClassification, DataCollatorForGeneClassificati
61
  from . import classifier_utils as cu
62
  from . import evaluation_utils as eu
63
  from . import perturber_utils as pu
64
- from . import TOKEN_DICTIONARY_FILE
65
 
66
  sns.set()
67
 
@@ -85,7 +86,6 @@ class Classifier:
85
  "no_eval": {bool},
86
  "stratify_splits_col": {None, str},
87
  "forward_batch_size": {int},
88
- "token_dictionary_file": {None, str},
89
  "nproc": {int},
90
  "ngpu": {int},
91
  }
@@ -107,7 +107,6 @@ class Classifier:
107
  stratify_splits_col=None,
108
  no_eval=False,
109
  forward_batch_size=100,
110
- token_dictionary_file=None,
111
  nproc=4,
112
  ngpu=1,
113
  ):
@@ -176,9 +175,6 @@ class Classifier:
176
  | Otherwise, will perform eval during training.
177
  forward_batch_size : int
178
  | Batch size for forward pass (for evaluation, not training).
179
- token_dictionary_file : None, str
180
- | Default is to use token dictionary file from Geneformer
181
- | Otherwise, will load custom gene token dictionary.
182
  nproc : int
183
  | Number of CPU processes to use.
184
  ngpu : int
@@ -187,10 +183,6 @@ class Classifier:
187
  """
188
 
189
  self.classifier = classifier
190
- if self.classifier == "cell":
191
- self.model_type = "CellClassifier"
192
- elif self.classifier == "gene":
193
- self.model_type = "GeneClassifier"
194
  self.cell_state_dict = cell_state_dict
195
  self.gene_class_dict = gene_class_dict
196
  self.filter_data = filter_data
@@ -209,7 +201,6 @@ class Classifier:
209
  self.stratify_splits_col = stratify_splits_col
210
  self.no_eval = no_eval
211
  self.forward_batch_size = forward_batch_size
212
- self.token_dictionary_file = token_dictionary_file
213
  self.nproc = nproc
214
  self.ngpu = ngpu
215
 
@@ -231,9 +222,7 @@ class Classifier:
231
  ] = self.cell_state_dict["states"]
232
 
233
  # load token dictionary (Ensembl IDs:token)
234
- if self.token_dictionary_file is None:
235
- self.token_dictionary_file = TOKEN_DICTIONARY_FILE
236
- with open(self.token_dictionary_file, "rb") as f:
237
  self.gene_token_dict = pickle.load(f)
238
 
239
  self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
@@ -278,7 +267,7 @@ class Classifier:
278
  continue
279
  valid_type = False
280
  for option in valid_options:
281
- if (option in [int, float, list, dict, bool, str]) and isinstance(
282
  attr_value, option
283
  ):
284
  valid_type = True
@@ -445,8 +434,8 @@ class Classifier:
445
  test_data_output_path = (
446
  Path(output_directory) / f"{output_prefix}_labeled_test"
447
  ).with_suffix(".dataset")
448
- data_dict["train"].save_to_disk(str(train_data_output_path))
449
- data_dict["test"].save_to_disk(str(test_data_output_path))
450
  elif (test_size is not None) and (self.classifier == "cell"):
451
  if 1 > test_size > 0:
452
  if attr_to_split is None:
@@ -461,8 +450,8 @@ class Classifier:
461
  test_data_output_path = (
462
  Path(output_directory) / f"{output_prefix}_labeled_test"
463
  ).with_suffix(".dataset")
464
- data_dict["train"].save_to_disk(str(train_data_output_path))
465
- data_dict["test"].save_to_disk(str(test_data_output_path))
466
  else:
467
  data_dict, balance_df = cu.balance_attr_splits(
468
  data,
@@ -483,19 +472,19 @@ class Classifier:
483
  test_data_output_path = (
484
  Path(output_directory) / f"{output_prefix}_labeled_test"
485
  ).with_suffix(".dataset")
486
- data_dict["train"].save_to_disk(str(train_data_output_path))
487
- data_dict["test"].save_to_disk(str(test_data_output_path))
488
  else:
489
  data_output_path = (
490
  Path(output_directory) / f"{output_prefix}_labeled"
491
  ).with_suffix(".dataset")
492
- data.save_to_disk(str(data_output_path))
493
  print(data_output_path)
494
  else:
495
  data_output_path = (
496
  Path(output_directory) / f"{output_prefix}_labeled"
497
  ).with_suffix(".dataset")
498
- data.save_to_disk(str(data_output_path))
499
 
500
  def train_all_data(
501
  self,
@@ -641,6 +630,7 @@ class Classifier:
641
  | Number of trials to run for hyperparameter optimization
642
  | If 0, will not optimize hyperparameters
643
  """
 
644
  if self.num_crossval_splits == 0:
645
  logger.error("num_crossval_splits must be 1 or 5 to validate.")
646
  raise
@@ -782,20 +772,17 @@ class Classifier:
782
  ]
783
  )
784
  assert len(targets) == len(labels)
785
- n_splits = int(1 / (1 - self.train_size))
786
- skf = cu.StratifiedKFold3(n_splits=n_splits, random_state=0, shuffle=True)
787
  # (Cross-)validate
788
- test_ratio = self.oos_test_size / (self.eval_size + self.oos_test_size)
789
- for train_index, eval_index, test_index in tqdm(
790
- skf.split(targets, labels, test_ratio)
791
- ):
792
  print(
793
  f"****** Validation split: {iteration_num}/{self.num_crossval_splits} ******\n"
794
  )
795
  ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
796
  # filter data for examples containing classes for this split
797
  # subsample to max_ncells and relabel data in column "labels"
798
- train_data, eval_data = cu.prep_gene_classifier_train_eval_split(
799
  data,
800
  targets,
801
  labels,
@@ -806,18 +793,6 @@ class Classifier:
806
  self.nproc,
807
  )
808
 
809
- if self.oos_test_size > 0:
810
- test_data = cu.prep_gene_classifier_split(
811
- data,
812
- targets,
813
- labels,
814
- test_index,
815
- "test",
816
- self.max_ncells,
817
- iteration_num,
818
- self.nproc,
819
- )
820
-
821
  if n_hyperopt_trials == 0:
822
  trainer = self.train_classifier(
823
  model_directory,
@@ -827,15 +802,6 @@ class Classifier:
827
  ksplit_output_dir,
828
  predict_trainer,
829
  )
830
- result = self.evaluate_model(
831
- trainer.model,
832
- num_classes,
833
- id_class_dict,
834
- eval_data,
835
- predict_eval,
836
- ksplit_output_dir,
837
- output_prefix,
838
- )
839
  else:
840
  trainer = self.hyperopt_classifier(
841
  model_directory,
@@ -845,27 +811,20 @@ class Classifier:
845
  ksplit_output_dir,
846
  n_trials=n_hyperopt_trials,
847
  )
848
-
849
- model = cu.load_best_model(
850
- ksplit_output_dir, self.model_type, num_classes
851
- )
852
-
853
- if self.oos_test_size > 0:
854
- result = self.evaluate_model(
855
- model,
856
- num_classes,
857
- id_class_dict,
858
- test_data,
859
- predict_eval,
860
- ksplit_output_dir,
861
- output_prefix,
862
- )
863
  else:
864
- if iteration_num == self.num_crossval_splits:
865
- return
866
- else:
867
- iteration_num = iteration_num + 1
868
- continue
869
  results += [result]
870
  all_conf_mat = all_conf_mat + result["conf_mat"]
871
  # break after 1 or 5 splits, each with train/eval proportions dictated by eval_size
@@ -966,7 +925,12 @@ class Classifier:
966
  subprocess.call(f"mkdir {output_directory}", shell=True)
967
 
968
  ##### Load model and training args #####
969
- model = pu.load_model(self.model_type, num_classes, model_directory, "train")
970
  def_training_args, def_freeze_layers = cu.get_default_train_args(
971
  model, self.classifier, train_data, output_directory
972
  )
@@ -982,9 +946,6 @@ class Classifier:
982
  if eval_data is None:
983
  def_training_args["evaluation_strategy"] = "no"
984
  def_training_args["load_best_model_at_end"] = False
985
- def_training_args.update(
986
- {"save_strategy": "epoch", "save_total_limit": 1}
987
- ) # only save last model for each run
988
  training_args_init = TrainingArguments(**def_training_args)
989
 
990
  ##### Fine-tune the model #####
@@ -996,9 +957,7 @@ class Classifier:
996
 
997
  # define function to initiate model
998
  def model_init():
999
- model = pu.load_model(
1000
- self.model_type, num_classes, model_directory, "train"
1001
- )
1002
 
1003
  if self.freeze_layers is not None:
1004
  def_freeze_layers = self.freeze_layers
@@ -1059,7 +1018,6 @@ class Classifier:
1059
  metric="eval_macro_f1",
1060
  metric_columns=["loss", "eval_loss", "eval_accuracy", "eval_macro_f1"],
1061
  ),
1062
- local_dir=output_directory,
1063
  )
1064
 
1065
  return trainer
@@ -1122,7 +1080,11 @@ class Classifier:
1122
  subprocess.call(f"mkdir {output_directory}", shell=True)
1123
 
1124
  ##### Load model and training args #####
1125
- model = pu.load_model(self.model_type, num_classes, model_directory, "train")
1126
 
1127
  def_training_args, def_freeze_layers = cu.get_default_train_args(
1128
  model, self.classifier, train_data, output_directory
@@ -1276,7 +1238,11 @@ class Classifier:
1276
  test_data = pu.load_and_filter(None, self.nproc, test_data_file)
1277
 
1278
  # load previously fine-tuned model
1279
- model = pu.load_model(self.model_type, num_classes, model_directory, "eval")
1280
 
1281
  # evaluate the model
1282
  result = self.evaluate_model(
 
53
  import numpy as np
54
  import pandas as pd
55
  import seaborn as sns
56
+ from sklearn.model_selection import StratifiedKFold
57
  from tqdm.auto import tqdm, trange
58
  from transformers import Trainer
59
  from transformers.training_args import TrainingArguments
 
62
  from . import classifier_utils as cu
63
  from . import evaluation_utils as eu
64
  from . import perturber_utils as pu
65
+ from .tokenizer import TOKEN_DICTIONARY_FILE
66
 
67
  sns.set()
68
 
 
86
  "no_eval": {bool},
87
  "stratify_splits_col": {None, str},
88
  "forward_batch_size": {int},
 
89
  "nproc": {int},
90
  "ngpu": {int},
91
  }
 
107
  stratify_splits_col=None,
108
  no_eval=False,
109
  forward_batch_size=100,
 
110
  nproc=4,
111
  ngpu=1,
112
  ):
 
175
  | Otherwise, will perform eval during training.
176
  forward_batch_size : int
177
  | Batch size for forward pass (for evaluation, not training).
 
 
 
178
  nproc : int
179
  | Number of CPU processes to use.
180
  ngpu : int
 
183
  """
184
 
185
  self.classifier = classifier
186
  self.cell_state_dict = cell_state_dict
187
  self.gene_class_dict = gene_class_dict
188
  self.filter_data = filter_data
 
201
  self.stratify_splits_col = stratify_splits_col
202
  self.no_eval = no_eval
203
  self.forward_batch_size = forward_batch_size
 
204
  self.nproc = nproc
205
  self.ngpu = ngpu
206
 
 
222
  ] = self.cell_state_dict["states"]
223
 
224
  # load token dictionary (Ensembl IDs:token)
225
+ with open(TOKEN_DICTIONARY_FILE, "rb") as f:
 
 
226
  self.gene_token_dict = pickle.load(f)
227
 
228
  self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
 
267
  continue
268
  valid_type = False
269
  for option in valid_options:
270
+ if (option in [int, float, list, dict, bool]) and isinstance(
271
  attr_value, option
272
  ):
273
  valid_type = True
 
434
  test_data_output_path = (
435
  Path(output_directory) / f"{output_prefix}_labeled_test"
436
  ).with_suffix(".dataset")
437
+ data_dict["train"].save_to_disk(train_data_output_path)
438
+ data_dict["test"].save_to_disk(test_data_output_path)
439
  elif (test_size is not None) and (self.classifier == "cell"):
440
  if 1 > test_size > 0:
441
  if attr_to_split is None:
 
450
  test_data_output_path = (
451
  Path(output_directory) / f"{output_prefix}_labeled_test"
452
  ).with_suffix(".dataset")
453
+ data_dict["train"].save_to_disk(train_data_output_path)
454
+ data_dict["test"].save_to_disk(test_data_output_path)
455
  else:
456
  data_dict, balance_df = cu.balance_attr_splits(
457
  data,
 
472
  test_data_output_path = (
473
  Path(output_directory) / f"{output_prefix}_labeled_test"
474
  ).with_suffix(".dataset")
475
+ data_dict["train"].save_to_disk(train_data_output_path)
476
+ data_dict["test"].save_to_disk(test_data_output_path)
477
  else:
478
  data_output_path = (
479
  Path(output_directory) / f"{output_prefix}_labeled"
480
  ).with_suffix(".dataset")
481
+ data.save_to_disk(data_output_path)
482
  print(data_output_path)
483
  else:
484
  data_output_path = (
485
  Path(output_directory) / f"{output_prefix}_labeled"
486
  ).with_suffix(".dataset")
487
+ data.save_to_disk(data_output_path)
488
 
489
  def train_all_data(
490
  self,
 
630
  | Number of trials to run for hyperparameter optimization
631
  | If 0, will not optimize hyperparameters
632
  """
633
+
634
  if self.num_crossval_splits == 0:
635
  logger.error("num_crossval_splits must be 1 or 5 to validate.")
636
  raise
 
772
  ]
773
  )
774
  assert len(targets) == len(labels)
775
+ n_splits = int(1 / self.eval_size)
776
+ skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
777
  # (Cross-)validate
778
+ for train_index, eval_index in tqdm(skf.split(targets, labels)):
 
 
 
779
  print(
780
  f"****** Validation split: {iteration_num}/{self.num_crossval_splits} ******\n"
781
  )
782
  ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
783
  # filter data for examples containing classes for this split
784
  # subsample to max_ncells and relabel data in column "labels"
785
+ train_data, eval_data = cu.prep_gene_classifier_split(
786
  data,
787
  targets,
788
  labels,
 
793
  self.nproc,
794
  )
795
796
  if n_hyperopt_trials == 0:
797
  trainer = self.train_classifier(
798
  model_directory,
 
802
  ksplit_output_dir,
803
  predict_trainer,
804
  )
805
  else:
806
  trainer = self.hyperopt_classifier(
807
  model_directory,
 
811
  ksplit_output_dir,
812
  n_trials=n_hyperopt_trials,
813
  )
814
+ if iteration_num == self.num_crossval_splits:
815
+ return
816
  else:
817
+ iteration_num = iteration_num + 1
818
+ continue
819
+ result = self.evaluate_model(
820
+ trainer.model,
821
+ num_classes,
822
+ id_class_dict,
823
+ eval_data,
824
+ predict_eval,
825
+ ksplit_output_dir,
826
+ output_prefix,
827
+ )
828
  results += [result]
829
  all_conf_mat = all_conf_mat + result["conf_mat"]
830
  # break after 1 or 5 splits, each with train/eval proportions dictated by eval_size
 
925
  subprocess.call(f"mkdir {output_directory}", shell=True)
926
 
927
  ##### Load model and training args #####
928
+ if self.classifier == "cell":
929
+ model_type = "CellClassifier"
930
+ elif self.classifier == "gene":
931
+ model_type = "GeneClassifier"
932
+
933
+ model = pu.load_model(model_type, num_classes, model_directory, "train")
934
  def_training_args, def_freeze_layers = cu.get_default_train_args(
935
  model, self.classifier, train_data, output_directory
936
  )
 
946
  if eval_data is None:
947
  def_training_args["evaluation_strategy"] = "no"
948
  def_training_args["load_best_model_at_end"] = False
 
 
 
949
  training_args_init = TrainingArguments(**def_training_args)
950
 
951
  ##### Fine-tune the model #####
 
957
 
958
  # define function to initiate model
959
  def model_init():
960
+ model = pu.load_model(model_type, num_classes, model_directory, "train")
 
 
961
 
962
  if self.freeze_layers is not None:
963
  def_freeze_layers = self.freeze_layers
 
1018
  metric="eval_macro_f1",
1019
  metric_columns=["loss", "eval_loss", "eval_accuracy", "eval_macro_f1"],
1020
  ),
 
1021
  )
1022
 
1023
  return trainer
 
1080
  subprocess.call(f"mkdir {output_directory}", shell=True)
1081
 
1082
  ##### Load model and training args #####
1083
+ if self.classifier == "cell":
1084
+ model_type = "CellClassifier"
1085
+ elif self.classifier == "gene":
1086
+ model_type = "GeneClassifier"
1087
+ model = pu.load_model(model_type, num_classes, model_directory, "train")
1088
 
1089
  def_training_args, def_freeze_layers = cu.get_default_train_args(
1090
  model, self.classifier, train_data, output_directory
 
1238
  test_data = pu.load_and_filter(None, self.nproc, test_data_file)
1239
 
1240
  # load previously fine-tuned model
1241
+ if self.classifier == "cell":
1242
+ model_type = "CellClassifier"
1243
+ elif self.classifier == "gene":
1244
+ model_type = "GeneClassifier"
1245
+ model = pu.load_model(model_type, num_classes, model_directory, "eval")
1246
 
1247
  # evaluate the model
1248
  result = self.evaluate_model(
geneformer/classifier_utils.py CHANGED
@@ -1,6 +1,4 @@
1
- import json
2
  import logging
3
- import os
4
  import random
5
  from collections import Counter, defaultdict
6
 
@@ -8,7 +6,6 @@ import numpy as np
8
  import pandas as pd
9
  from scipy.stats import chisquare, ranksums
10
  from sklearn.metrics import accuracy_score, f1_score
11
- from sklearn.model_selection import StratifiedKFold, train_test_split
12
 
13
  from . import perturber_utils as pu
14
 
@@ -136,55 +133,61 @@ def label_gene_classes(example, class_id_dict, gene_class_dict):
136
  ]
137
 
138
 
139
- def prep_gene_classifier_train_eval_split(
140
- data, targets, labels, train_index, eval_index, max_ncells, iteration_num, num_proc
141
- ):
142
- # generate cross-validation splits
143
- train_data = prep_gene_classifier_split(
144
- data, targets, labels, train_index, "train", max_ncells, iteration_num, num_proc
145
- )
146
- eval_data = prep_gene_classifier_split(
147
- data, targets, labels, eval_index, "eval", max_ncells, iteration_num, num_proc
148
- )
149
- return train_data, eval_data
150
-
151
-
152
  def prep_gene_classifier_split(
153
- data, targets, labels, index, subset_name, max_ncells, iteration_num, num_proc
154
  ):
155
  # generate cross-validation splits
156
  targets = np.array(targets)
157
  labels = np.array(labels)
158
- targets_subset = targets[index]
159
- labels_subset = labels[index]
160
- label_dict_subset = dict(zip(targets_subset, labels_subset))
 
161
 
162
  # function to filter by whether contains train or eval labels
163
- def if_contains_subset_label(example):
164
- a = targets_subset
165
  b = example["input_ids"]
166
  return not set(a).isdisjoint(b)
167
 
168
  # filter dataset for examples containing classes for this split
169
- logger.info(f"Filtering data for {subset_name} genes in split {iteration_num}")
170
- subset_data = data.filter(if_contains_subset_label, num_proc=num_proc)
171
  logger.info(
172
- f"Filtered {round((1-len(subset_data)/len(data))*100)}%; {len(subset_data)} remain\n"
173
  )
174
 
175
  # subsample to max_ncells
176
- subset_data = downsample_and_shuffle(subset_data, max_ncells, None, None)
 
177
 
178
  # relabel genes for this split
179
- def subset_classes_to_ids(example):
180
  example["labels"] = [
181
- label_dict_subset.get(token_id, -100) for token_id in example["input_ids"]
182
  ]
183
  return example
184
 
185
- subset_data = subset_data.map(subset_classes_to_ids, num_proc=num_proc)
186
 
187
- return subset_data
 
 
 
188
 
189
 
190
  def prep_gene_classifier_all_data(data, targets, labels, max_ncells, num_proc):
@@ -306,7 +309,7 @@ def balance_attr_splits(
306
  exp_counts[cat] * sum(obs) / sum(exp_counts.values())
307
  for cat in all_categ
308
  ]
309
- pval = chisquare(f_obs=obs, f_exp=exp).pvalue
310
  train_attr_counts = str(obs_counts).strip("Counter(").strip(")")
311
  eval_attr_counts = str(exp_counts).strip("Counter(").strip(")")
312
  df_vals += [train_attr_counts, eval_attr_counts, pval]
@@ -420,45 +423,3 @@ def get_default_train_args(model, classifier, data, output_dir):
420
  training_args.update(default_training_args)
421
 
422
  return training_args, freeze_layers
423
-
424
-
425
- def load_best_model(directory, model_type, num_classes, mode="eval"):
426
- file_dict = dict()
427
- for subdir, dirs, files in os.walk(directory):
428
- for file in files:
429
- if file.endswith("result.json"):
430
- with open(f"{subdir}/{file}", "rb") as fp:
431
- result_json = json.load(fp)
432
- file_dict[f"{subdir}"] = result_json["eval_macro_f1"]
433
- file_df = pd.DataFrame(
434
- {"dir": file_dict.keys(), "eval_macro_f1": file_dict.values()}
435
- )
436
- model_superdir = (
437
- "run-"
438
- + file_df.iloc[file_df["eval_macro_f1"].idxmax()]["dir"]
439
- .split("_objective_")[2]
440
- .split("_")[0]
441
- )
442
-
443
- for subdir, dirs, files in os.walk(f"{directory}/{model_superdir}"):
444
- for file in files:
445
- if file.endswith("model.safetensors"):
446
- model = pu.load_model(model_type, num_classes, f"{subdir}", mode)
447
- return model
448
-
449
-
450
- class StratifiedKFold3(StratifiedKFold):
451
- def split(self, targets, labels, test_ratio=0.5, groups=None):
452
- s = super().split(targets, labels, groups)
453
- for train_indxs, test_indxs in s:
454
- if test_ratio == 0:
455
- yield train_indxs, test_indxs, None
456
- else:
457
- labels_test = np.array(labels)[test_indxs]
458
- valid_indxs, test_indxs = train_test_split(
459
- test_indxs,
460
- stratify=labels_test,
461
- test_size=test_ratio,
462
- random_state=0,
463
- )
464
- yield train_indxs, valid_indxs, test_indxs
 
 
1
  import logging
 
2
  import random
3
  from collections import Counter, defaultdict
4
 
 
6
  import pandas as pd
7
  from scipy.stats import chisquare, ranksums
8
  from sklearn.metrics import accuracy_score, f1_score
 
9
 
10
  from . import perturber_utils as pu
11
 
 
133
  ]
134
 
135
 
136
  def prep_gene_classifier_split(
137
+ data, targets, labels, train_index, eval_index, max_ncells, iteration_num, num_proc
138
  ):
139
  # generate cross-validation splits
140
  targets = np.array(targets)
141
  labels = np.array(labels)
142
+ targets_train, targets_eval = targets[train_index], targets[eval_index]
143
+ labels_train, labels_eval = labels[train_index], labels[eval_index]
144
+ label_dict_train = dict(zip(targets_train, labels_train))
145
+ label_dict_eval = dict(zip(targets_eval, labels_eval))
146
 
147
  # function to filter by whether contains train or eval labels
148
+ def if_contains_train_label(example):
149
+ a = targets_train
150
+ b = example["input_ids"]
151
+ return not set(a).isdisjoint(b)
152
+
153
+ def if_contains_eval_label(example):
154
+ a = targets_eval
155
  b = example["input_ids"]
156
  return not set(a).isdisjoint(b)
157
 
158
  # filter dataset for examples containing classes for this split
159
+ logger.info(f"Filtering training data for genes in split {iteration_num}")
160
+ train_data = data.filter(if_contains_train_label, num_proc=num_proc)
161
  logger.info(
162
+ f"Filtered {round((1-len(train_data)/len(data))*100)}%; {len(train_data)} remain\n"
163
+ )
164
+ logger.info(f"Filtering evaluation data for genes in split {iteration_num}")
165
+ eval_data = data.filter(if_contains_eval_label, num_proc=num_proc)
166
+ logger.info(
167
+ f"Filtered {round((1-len(eval_data)/len(data))*100)}%; {len(eval_data)} remain\n"
168
  )
169
 
170
  # subsample to max_ncells
171
+ train_data = downsample_and_shuffle(train_data, max_ncells, None, None)
172
+ eval_data = downsample_and_shuffle(eval_data, max_ncells, None, None)
173
 
174
  # relabel genes for this split
175
+ def train_classes_to_ids(example):
176
  example["labels"] = [
177
+ label_dict_train.get(token_id, -100) for token_id in example["input_ids"]
178
  ]
179
  return example
180
 
181
+ def eval_classes_to_ids(example):
182
+ example["labels"] = [
183
+ label_dict_eval.get(token_id, -100) for token_id in example["input_ids"]
184
+ ]
185
+ return example
186
 
187
+ train_data = train_data.map(train_classes_to_ids, num_proc=num_proc)
188
+ eval_data = eval_data.map(eval_classes_to_ids, num_proc=num_proc)
189
+
190
+ return train_data, eval_data
191
 
192
 
193
  def prep_gene_classifier_all_data(data, targets, labels, max_ncells, num_proc):
 
309
  exp_counts[cat] * sum(obs) / sum(exp_counts.values())
310
  for cat in all_categ
311
  ]
312
+ pval = chisquare(f_obs=obs, f_exp=exp).pvalue
313
  train_attr_counts = str(obs_counts).strip("Counter(").strip(")")
314
  eval_attr_counts = str(exp_counts).strip("Counter(").strip(")")
315
  df_vals += [train_attr_counts, eval_attr_counts, pval]
 
423
  training_args.update(default_training_args)
424
 
425
  return training_args, freeze_layers
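For reference, a sketch of the fold arithmetic behind the restored validation flow: n_splits is derived from eval_size, and StratifiedKFold yields the train/eval index pairs that Classifier.validate hands to cu.prep_gene_classifier_split (the targets/labels below are toy placeholders, not real gene tokens):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    # toy stand-ins for gene token IDs and their class labels
    targets = np.arange(100)
    labels = np.array([0, 1] * 50)

    eval_size = 0.2                   # placeholder value
    n_splits = int(1 / eval_size)     # 0.2 -> 5 folds, as in the classifier.py hunk
    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)

    for train_index, eval_index in skf.split(targets, labels):
        print(len(train_index), len(eval_index))  # 80 / 20 per fold
        # Classifier.validate passes each index pair on to
        # cu.prep_gene_classifier_split(data, targets, labels,
        #     train_index, eval_index, max_ncells, iteration_num, nproc)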
geneformer/collator_for_classification.py CHANGED
@@ -4,7 +4,6 @@ Geneformer collator for gene and cell classification.
4
  Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
5
  """
6
  import numpy as np
7
- import pickle
8
  import torch
9
  import warnings
10
  from enum import Enum
@@ -18,11 +17,7 @@ from transformers import (
18
  from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
19
  from transformers.utils.generic import _is_tensorflow, _is_torch
20
 
21
- from . import TOKEN_DICTIONARY_FILE
22
-
23
- # load token dictionary (Ensembl IDs:token)
24
- with open(TOKEN_DICTIONARY_FILE, "rb") as f:
25
- token_dictionary = pickle.load(f)
26
 
27
  EncodedInput = List[int]
28
  logger = logging.get_logger(__name__)
 
4
  Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
5
  """
6
  import numpy as np
 
7
  import torch
8
  import warnings
9
  from enum import Enum
 
17
  from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
18
  from transformers.utils.generic import _is_tensorflow, _is_torch
19
 
20
+ from .pretrainer import token_dictionary
21
 
22
  EncodedInput = List[int]
23
  logger = logging.get_logger(__name__)
geneformer/emb_extractor.py CHANGED
@@ -25,7 +25,7 @@ from tdigest import TDigest
25
  from tqdm.auto import trange
26
 
27
  from . import perturber_utils as pu
28
- from . import TOKEN_DICTIONARY_FILE
29
 
30
  logger = logging.getLogger(__name__)
31
 
@@ -38,19 +38,19 @@ def get_embs(
38
  layer_to_quant,
39
  pad_token_id,
40
  forward_batch_size,
41
- token_gene_dict,
42
- special_token=False,
43
  summary_stat=None,
44
  silent=False,
45
  ):
46
  model_input_size = pu.get_model_input_size(model)
47
  total_batch_length = len(filtered_input_data)
48
-
49
  if summary_stat is None:
50
  embs_list = []
51
  elif summary_stat is not None:
52
- # get # of emb dims
53
- emb_dims = pu.get_model_emb_dims(model)
 
 
54
  if emb_mode == "cell":
55
  # initiate tdigests for # of emb dims
56
  embs_tdigests = [TDigest() for _ in range(emb_dims)]
@@ -67,23 +67,8 @@ def get_embs(
67
  k: [TDigest() for _ in range(emb_dims)] for k in gene_set
68
  }
69
 
70
- # Check if CLS and EOS token is present in the token dictionary
71
- cls_present = any("<cls>" in value for value in token_gene_dict.values())
72
- eos_present = any("<eos>" in value for value in token_gene_dict.values())
73
- if emb_mode == "cls":
74
- assert cls_present, "<cls> token missing in token dictionary"
75
- # Check to make sure that the first token of the filtered input data is cls token
76
- gene_token_dict = {v:k for k,v in token_gene_dict.items()}
77
- cls_token_id = gene_token_dict["<cls>"]
78
- assert filtered_input_data["input_ids"][0][0] == cls_token_id, "First token is not <cls> token value"
79
- elif emb_mode == "cell":
80
- if cls_present:
81
- logger.warning("CLS token present in token dictionary, excluding from average.")
82
- if eos_present:
83
- logger.warning("EOS token present in token dictionary, excluding from average.")
84
-
85
  overall_max_len = 0
86
-
87
  for i in trange(0, total_batch_length, forward_batch_size, leave=(not silent)):
88
  max_range = min(i + forward_batch_size, total_batch_length)
89
 
@@ -107,14 +92,7 @@ def get_embs(
107
  embs_i = outputs.hidden_states[layer_to_quant]
108
 
109
  if emb_mode == "cell":
110
- if cls_present:
111
- non_cls_embs = embs_i[:, 1:, :] # Get all layers except the embs
112
- if eos_present:
113
- mean_embs = pu.mean_nonpadding_embs(non_cls_embs, original_lens - 2)
114
- else:
115
- mean_embs = pu.mean_nonpadding_embs(non_cls_embs, original_lens - 1)
116
- else:
117
- mean_embs = pu.mean_nonpadding_embs(embs_i, original_lens)
118
  if summary_stat is None:
119
  embs_list.append(mean_embs)
120
  elif summary_stat is not None:
@@ -143,13 +121,7 @@ def get_embs(
143
  accumulate_tdigests(
144
  embs_tdigests_dict[int(k)], dict_h[k], emb_dims
145
  )
146
- del embs_h
147
- del dict_h
148
- elif emb_mode == "cls":
149
- cls_embs = embs_i[:,0,:].clone().detach() # CLS token layer
150
- embs_list.append(cls_embs)
151
- del cls_embs
152
-
153
  overall_max_len = max(overall_max_len, max_len)
154
  del outputs
155
  del minibatch
@@ -157,10 +129,9 @@ def get_embs(
157
  del embs_i
158
 
159
  torch.cuda.empty_cache()
160
-
161
-
162
  if summary_stat is None:
163
- if (emb_mode == "cell") or (emb_mode == "cls"):
164
  embs_stack = torch.cat(embs_list, dim=0)
165
  elif emb_mode == "gene":
166
  embs_stack = pu.pad_tensor_list(
@@ -204,6 +175,7 @@ def accumulate_tdigests(embs_tdigests, mean_embs, emb_dims):
204
  for j in range(emb_dims)
205
  ]
206
 
 
207
  def update_tdigest_dict(embs_tdigests_dict, gene, gene_embs, emb_dims):
208
  embs_tdigests_dict[gene] = accumulate_tdigests(
209
  embs_tdigests_dict[gene], gene_embs, emb_dims
@@ -237,6 +209,14 @@ def tdigest_median(embs_tdigests, emb_dims):
237
  return [embs_tdigests[i].percentile(50) for i in range(emb_dims)]
238
 
239
 
 
 
 
 
 
 
 
 
240
  def label_cell_embs(embs, downsampled_data, emb_labels):
241
  embs_df = pd.DataFrame(embs.cpu().numpy())
242
  if emb_labels is not None:
@@ -272,7 +252,7 @@ def label_gene_embs(embs, downsampled_data, token_gene_dict):
272
  return embs_df
273
 
274
 
275
- def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict, seed=0):
276
  only_embs_df = embs_df.iloc[:, :emb_dims]
277
  only_embs_df.index = pd.RangeIndex(0, only_embs_df.shape[0], name=None).astype(str)
278
  only_embs_df.columns = pd.RangeIndex(0, only_embs_df.shape[1], name=None).astype(
@@ -282,17 +262,15 @@ def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict, seed=0):
282
  obs_dict = {"cell_id": list(only_embs_df.index), f"{label}": list(embs_df[label])}
283
  adata = anndata.AnnData(X=only_embs_df, obs=obs_dict, var=vars_dict)
284
  sc.tl.pca(adata, svd_solver="arpack")
285
- sc.pp.neighbors(adata, random_state=seed)
286
- sc.tl.umap(adata, random_state=seed)
287
  sns.set(rc={"figure.figsize": (10, 10)}, font_scale=2.3)
288
  sns.set_style("white")
289
  default_kwargs_dict = {"palette": "Set2", "size": 200}
290
  if kwargs_dict is not None:
291
  default_kwargs_dict.update(kwargs_dict)
292
 
293
- with plt.rc_context():
294
- sc.pl.umap(adata, color=label, **default_kwargs_dict)
295
- plt.savefig(output_file, bbox_inches="tight")
296
 
297
 
298
  def gen_heatmap_class_colors(labels, df):
@@ -368,8 +346,7 @@ def plot_heatmap(embs_df, emb_dims, label, output_file, kwargs_dict):
368
  bbox_to_anchor=(0.5, 1),
369
  facecolor="white",
370
  )
371
- plt.show()
372
- logger.info(f"Output file: {output_file}")
373
  plt.savefig(output_file, bbox_inches="tight")
374
 
375
 
@@ -377,7 +354,7 @@ class EmbExtractor:
377
  valid_option_dict = {
378
  "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
379
  "num_classes": {int},
380
- "emb_mode": {"cls", "cell", "gene"},
381
  "cell_emb_style": {"mean_pool"},
382
  "gene_emb_style": {"mean_pool"},
383
  "filter_data": {None, dict},
@@ -386,7 +363,6 @@ class EmbExtractor:
386
  "emb_label": {None, list},
387
  "labels_to_plot": {None, list},
388
  "forward_batch_size": {int},
389
- "token_dictionary_file" : {None, str},
390
  "nproc": {int},
391
  "summary_stat": {None, "mean", "median", "exact_mean", "exact_median"},
392
  }
@@ -406,7 +382,7 @@ class EmbExtractor:
406
  forward_batch_size=100,
407
  nproc=4,
408
  summary_stat=None,
409
- token_dictionary_file=None,
410
  ):
411
  """
412
  Initialize embedding extractor.
@@ -418,11 +394,10 @@ class EmbExtractor:
418
  num_classes : int
419
  | If model is a gene or cell classifier, specify number of classes it was trained to classify.
420
  | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
421
- emb_mode : {"cls", "cell", "gene"}
422
- | Whether to output CLS, cell, or gene embeddings.
423
- | CLS embeddings are cell embeddings derived from the CLS token in the front of the rank value encoding.
424
- cell_emb_style : {"mean_pool"}
425
- | Method for summarizing cell embeddings if not using CLS token.
426
  | Currently only option is mean pooling of gene embeddings for given cell.
427
  gene_emb_style : "mean_pool"
428
  | Method for summarizing gene embeddings.
@@ -457,7 +432,6 @@ class EmbExtractor:
457
  | Non-exact recommended if encountering memory constraints while generating goal embedding positions.
458
  | Non-exact is slower but more memory-efficient.
459
  token_dictionary_file : Path
460
- | Default is the Geneformer token dictionary
461
  | Path to pickle file containing token dictionary (Ensembl ID:token).
462
 
463
  **Examples:**
@@ -487,7 +461,6 @@ class EmbExtractor:
487
  self.emb_layer = emb_layer
488
  self.emb_label = emb_label
489
  self.labels_to_plot = labels_to_plot
490
- self.token_dictionary_file = token_dictionary_file
491
  self.forward_batch_size = forward_batch_size
492
  self.nproc = nproc
493
  if (summary_stat is not None) and ("exact" in summary_stat):
@@ -500,8 +473,6 @@ class EmbExtractor:
500
  self.validate_options()
501
 
502
  # load token dictionary (Ensembl IDs:token)
503
- if self.token_dictionary_file is None:
504
- token_dictionary_file = TOKEN_DICTIONARY_FILE
505
  with open(token_dictionary_file, "rb") as f:
506
  self.gene_token_dict = pickle.load(f)
507
 
@@ -517,7 +488,7 @@ class EmbExtractor:
517
  continue
518
  valid_type = False
519
  for option in valid_options:
520
- if (option in [int, list, dict, bool, str]) and isinstance(
521
  attr_value, option
522
  ):
523
  valid_type = True
@@ -591,14 +562,13 @@ class EmbExtractor:
591
  )
592
  layer_to_quant = pu.quant_layers(model) + self.emb_layer
593
  embs = get_embs(
594
- model=model,
595
- filtered_input_data=downsampled_data,
596
- emb_mode=self.emb_mode,
597
- layer_to_quant=layer_to_quant,
598
- pad_token_id=self.pad_token_id,
599
- forward_batch_size=self.forward_batch_size,
600
- token_gene_dict=self.token_gene_dict,
601
- summary_stat=self.summary_stat,
602
  )
603
 
604
  if self.emb_mode == "cell":
@@ -612,8 +582,6 @@ class EmbExtractor:
612
  elif self.summary_stat is not None:
613
  embs_df = pd.DataFrame(embs).T
614
  embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]
615
- elif self.emb_mode == "cls":
616
- embs_df = label_cell_embs(embs, downsampled_data, self.emb_label)
617
 
618
  # save embeddings to output_path
619
  if cell_state is None:
@@ -622,15 +590,13 @@ class EmbExtractor:
622
 
623
  if self.exact_summary_stat == "exact_mean":
624
  embs = embs.mean(dim=0)
625
- emb_dims = pu.get_model_emb_dims(model)
626
  embs_df = pd.DataFrame(
627
- embs_df[0:emb_dims-1].mean(axis="rows"), columns=[self.exact_summary_stat]
628
  ).T
629
  elif self.exact_summary_stat == "exact_median":
630
  embs = torch.median(embs, dim=0)[0]
631
- emb_dims = pu.get_model_emb_dims(model)
632
  embs_df = pd.DataFrame(
633
- embs_df[0:emb_dims-1].median(axis="rows"), columns=[self.exact_summary_stat]
634
  ).T
635
 
636
  if cell_state is not None:
@@ -813,11 +779,11 @@ class EmbExtractor:
813
  f"not present in provided embeddings dataframe."
814
  )
815
  continue
816
- output_prefix_label = output_prefix + f"_umap_{label}"
817
  output_file = (
818
  Path(output_directory) / output_prefix_label
819
  ).with_suffix(".pdf")
820
- plot_umap(embs, emb_dims, label, output_file, kwargs_dict)
821
 
822
  if plot_style == "heatmap":
823
  for label in self.labels_to_plot:
@@ -831,4 +797,4 @@ class EmbExtractor:
831
  output_file = (
832
  Path(output_directory) / output_prefix_label
833
  ).with_suffix(".pdf")
834
- plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
 
25
  from tqdm.auto import trange
26
 
27
  from . import perturber_utils as pu
28
+ from .tokenizer import TOKEN_DICTIONARY_FILE
29
 
30
  logger = logging.getLogger(__name__)
31
 
 
38
  layer_to_quant,
39
  pad_token_id,
40
  forward_batch_size,
 
 
41
  summary_stat=None,
42
  silent=False,
43
  ):
44
  model_input_size = pu.get_model_input_size(model)
45
  total_batch_length = len(filtered_input_data)
46
+
47
  if summary_stat is None:
48
  embs_list = []
49
  elif summary_stat is not None:
50
+ # test embedding extraction for example cell and extract # emb dims
51
+ example = filtered_input_data.select([i for i in range(1)])
52
+ example.set_format(type="torch")
53
+ emb_dims = test_emb(model, example["input_ids"], layer_to_quant)
54
  if emb_mode == "cell":
55
  # initiate tdigests for # of emb dims
56
  embs_tdigests = [TDigest() for _ in range(emb_dims)]
 
67
  k: [TDigest() for _ in range(emb_dims)] for k in gene_set
68
  }
69
 
 
70
  overall_max_len = 0
71
+
72
  for i in trange(0, total_batch_length, forward_batch_size, leave=(not silent)):
73
  max_range = min(i + forward_batch_size, total_batch_length)
74
 
 
92
  embs_i = outputs.hidden_states[layer_to_quant]
93
 
94
  if emb_mode == "cell":
95
+ mean_embs = pu.mean_nonpadding_embs(embs_i, original_lens)
 
96
  if summary_stat is None:
97
  embs_list.append(mean_embs)
98
  elif summary_stat is not None:
 
121
  accumulate_tdigests(
122
  embs_tdigests_dict[int(k)], dict_h[k], emb_dims
123
  )
124
+
125
  overall_max_len = max(overall_max_len, max_len)
126
  del outputs
127
  del minibatch
 
129
  del embs_i
130
 
131
  torch.cuda.empty_cache()
132
+
 
133
  if summary_stat is None:
134
+ if emb_mode == "cell":
135
  embs_stack = torch.cat(embs_list, dim=0)
136
  elif emb_mode == "gene":
137
  embs_stack = pu.pad_tensor_list(
 
175
  for j in range(emb_dims)
176
  ]
177
 
178
+
179
  def update_tdigest_dict(embs_tdigests_dict, gene, gene_embs, emb_dims):
180
  embs_tdigests_dict[gene] = accumulate_tdigests(
181
  embs_tdigests_dict[gene], gene_embs, emb_dims
 
209
  return [embs_tdigests[i].percentile(50) for i in range(emb_dims)]
210
 
211
 
212
+ def test_emb(model, example, layer_to_quant):
213
+ with torch.no_grad():
214
+ outputs = model(input_ids=example.to("cuda"))
215
+
216
+ embs_test = outputs.hidden_states[layer_to_quant]
217
+ return embs_test.size()[2]
218
+
219
+
220
  def label_cell_embs(embs, downsampled_data, emb_labels):
221
  embs_df = pd.DataFrame(embs.cpu().numpy())
222
  if emb_labels is not None:
 
252
  return embs_df
253
 
254
 
255
+ def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict):
256
  only_embs_df = embs_df.iloc[:, :emb_dims]
257
  only_embs_df.index = pd.RangeIndex(0, only_embs_df.shape[0], name=None).astype(str)
258
  only_embs_df.columns = pd.RangeIndex(0, only_embs_df.shape[1], name=None).astype(
 
262
  obs_dict = {"cell_id": list(only_embs_df.index), f"{label}": list(embs_df[label])}
263
  adata = anndata.AnnData(X=only_embs_df, obs=obs_dict, var=vars_dict)
264
  sc.tl.pca(adata, svd_solver="arpack")
265
+ sc.pp.neighbors(adata)
266
+ sc.tl.umap(adata)
267
  sns.set(rc={"figure.figsize": (10, 10)}, font_scale=2.3)
268
  sns.set_style("white")
269
  default_kwargs_dict = {"palette": "Set2", "size": 200}
270
  if kwargs_dict is not None:
271
  default_kwargs_dict.update(kwargs_dict)
272
 
273
+ sc.pl.umap(adata, color=label, save=output_file, **default_kwargs_dict)
 
 
274
 
275
 
276
  def gen_heatmap_class_colors(labels, df):
 
346
  bbox_to_anchor=(0.5, 1),
347
  facecolor="white",
348
  )
349
+
 
350
  plt.savefig(output_file, bbox_inches="tight")
351
 
352
 
 
354
  valid_option_dict = {
355
  "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
356
  "num_classes": {int},
357
+ "emb_mode": {"cell", "gene"},
358
  "cell_emb_style": {"mean_pool"},
359
  "gene_emb_style": {"mean_pool"},
360
  "filter_data": {None, dict},
 
363
  "emb_label": {None, list},
364
  "labels_to_plot": {None, list},
365
  "forward_batch_size": {int},
 
366
  "nproc": {int},
367
  "summary_stat": {None, "mean", "median", "exact_mean", "exact_median"},
368
  }
 
382
  forward_batch_size=100,
383
  nproc=4,
384
  summary_stat=None,
385
+ token_dictionary_file=TOKEN_DICTIONARY_FILE,
386
  ):
387
  """
388
  Initialize embedding extractor.
 
394
  num_classes : int
395
  | If model is a gene or cell classifier, specify number of classes it was trained to classify.
396
  | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
397
+ emb_mode : {"cell", "gene"}
398
+ | Whether to output cell or gene embeddings.
399
+ cell_emb_style : "mean_pool"
400
+ | Method for summarizing cell embeddings.
 
401
  | Currently only option is mean pooling of gene embeddings for given cell.
402
  gene_emb_style : "mean_pool"
403
  | Method for summarizing gene embeddings.
 
432
  | Non-exact recommended if encountering memory constraints while generating goal embedding positions.
433
  | Non-exact is slower but more memory-efficient.
434
  token_dictionary_file : Path
 
435
  | Path to pickle file containing token dictionary (Ensembl ID:token).
436
 
437
  **Examples:**
 
461
  self.emb_layer = emb_layer
462
  self.emb_label = emb_label
463
  self.labels_to_plot = labels_to_plot
 
464
  self.forward_batch_size = forward_batch_size
465
  self.nproc = nproc
466
  if (summary_stat is not None) and ("exact" in summary_stat):
 
473
  self.validate_options()
474
 
475
  # load token dictionary (Ensembl IDs:token)
 
 
476
  with open(token_dictionary_file, "rb") as f:
477
  self.gene_token_dict = pickle.load(f)
478
 
 
488
  continue
489
  valid_type = False
490
  for option in valid_options:
491
+ if (option in [int, list, dict, bool]) and isinstance(
492
  attr_value, option
493
  ):
494
  valid_type = True
 
562
  )
563
  layer_to_quant = pu.quant_layers(model) + self.emb_layer
564
  embs = get_embs(
565
+ model,
566
+ downsampled_data,
567
+ self.emb_mode,
568
+ layer_to_quant,
569
+ self.pad_token_id,
570
+ self.forward_batch_size,
571
+ self.summary_stat,
 
572
  )
573
 
574
  if self.emb_mode == "cell":
 
582
  elif self.summary_stat is not None:
583
  embs_df = pd.DataFrame(embs).T
584
  embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]
 
 
585
 
586
  # save embeddings to output_path
587
  if cell_state is None:
 
590
 
591
  if self.exact_summary_stat == "exact_mean":
592
  embs = embs.mean(dim=0)
 
593
  embs_df = pd.DataFrame(
594
+ embs_df[0:255].mean(axis="rows"), columns=[self.exact_summary_stat]
595
  ).T
596
  elif self.exact_summary_stat == "exact_median":
597
  embs = torch.median(embs, dim=0)[0]
 
598
  embs_df = pd.DataFrame(
599
+ embs_df[0:255].median(axis="rows"), columns=[self.exact_summary_stat]
600
  ).T
601
 
602
  if cell_state is not None:
 
779
  f"not present in provided embeddings dataframe."
780
  )
781
  continue
782
+ output_prefix_label = "_" + output_prefix + f"_umap_{label}"
783
  output_file = (
784
  Path(output_directory) / output_prefix_label
785
  ).with_suffix(".pdf")
786
+ plot_umap(embs, emb_dims, label, output_prefix_label, kwargs_dict)
787
 
788
  if plot_style == "heatmap":
789
  for label in self.labels_to_plot:
 
797
  output_file = (
798
  Path(output_directory) / output_prefix_label
799
  ).with_suffix(".pdf")
800
+ plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
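For reference, an illustrative EmbExtractor call after this change, with placeholder paths; keyword arguments beyond those visible in the hunks (max_ncells, emb_layer) are assumptions based on the surrounding class definition, and emb_mode is again limited to "cell" or "gene" with the token dictionary defaulting to the packaged TOKEN_DICTIONARY_FILE:

    from geneformer import EmbExtractor

    embex = EmbExtractor(
        model_type="Pretrained",   # or "CellClassifier" / "GeneClassifier"
        num_classes=0,             # 0 for the pretrained, non-classifier model
        emb_mode="cell",
        max_ncells=1000,
        emb_layer=-1,
        forward_batch_size=100,
        nproc=4,
    )
    embs = embex.extract_embs(
        "path/to/model",           # placeholder paths
        "path/to/data.dataset",
        "path/to/output_dir",
        "output_prefix",
    )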
geneformer/evaluation_utils.py CHANGED
@@ -21,7 +21,7 @@ from sklearn.metrics import (
21
  from tqdm.auto import trange
22
 
23
  from .emb_extractor import make_colorbar
24
- from . import TOKEN_DICTIONARY_FILE
25
 
26
  logger = logging.getLogger(__name__)
27
 
 
21
  from tqdm.auto import trange
22
 
23
  from .emb_extractor import make_colorbar
24
+ from .tokenizer import TOKEN_DICTIONARY_FILE
25
 
26
  logger = logging.getLogger(__name__)
27
 
geneformer/in_silico_perturber.py CHANGED
@@ -38,18 +38,19 @@ import logging
38
  import os
39
  import pickle
40
  from collections import defaultdict
41
- from multiprocess import set_start_method
42
  from typing import List
43
 
 
44
  import torch
45
- from datasets import Dataset, disable_progress_bars
46
  from tqdm.auto import trange
47
 
48
  from . import perturber_utils as pu
49
  from .emb_extractor import get_embs
50
- from . import TOKEN_DICTIONARY_FILE
 
 
51
 
52
- disable_progress_bars()
53
 
54
  logger = logging.getLogger(__name__)
55
 
@@ -184,10 +185,6 @@ class InSilicoPerturber:
184
  token_dictionary_file : Path
185
  | Path to pickle file containing token dictionary (Ensembl ID:token).
186
  """
187
- try:
188
- set_start_method("spawn")
189
- except RuntimeError:
190
- pass
191
 
192
  self.perturb_type = perturb_type
193
  self.perturb_rank_shift = perturb_rank_shift
@@ -225,7 +222,6 @@ class InSilicoPerturber:
225
  # load token dictionary (Ensembl IDs:token)
226
  with open(token_dictionary_file, "rb") as f:
227
  self.gene_token_dict = pickle.load(f)
228
- self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
229
 
230
  self.pad_token_id = self.gene_token_dict.get("<pad>")
231
 
@@ -426,7 +422,6 @@ class InSilicoPerturber:
426
  self.max_len = pu.get_model_input_size(model)
427
  layer_to_quant = pu.quant_layers(model) + self.emb_layer
428
 
429
-
430
  ### filter input data ###
431
  # general filtering of input data based on filter_data argument
432
  filtered_input_data = pu.load_and_filter(
@@ -525,7 +520,6 @@ class InSilicoPerturber:
525
  perturbed_data = filtered_input_data.map(
526
  make_group_perturbation_batch, num_proc=self.nproc
527
  )
528
-
529
  if self.perturb_type == "overexpress":
530
  filtered_input_data = filtered_input_data.add_column(
531
  "n_overflow", perturbed_data["n_overflow"]
@@ -558,7 +552,6 @@ class InSilicoPerturber:
558
  layer_to_quant,
559
  self.pad_token_id,
560
  self.forward_batch_size,
561
- token_gene_dict=self.token_gene_dict,
562
  summary_stat=None,
563
  silent=True,
564
  )
@@ -578,7 +571,6 @@ class InSilicoPerturber:
578
  layer_to_quant,
579
  self.pad_token_id,
580
  self.forward_batch_size,
581
- token_gene_dict=self.token_gene_dict,
582
  summary_stat=None,
583
  silent=True,
584
  )
@@ -738,7 +730,6 @@ class InSilicoPerturber:
738
  layer_to_quant,
739
  self.pad_token_id,
740
  self.forward_batch_size,
741
- token_gene_dict=self.token_gene_dict,
742
  summary_stat=None,
743
  silent=True,
744
  )
@@ -766,7 +757,6 @@ class InSilicoPerturber:
766
  layer_to_quant,
767
  self.pad_token_id,
768
  self.forward_batch_size,
769
- token_gene_dict=self.token_gene_dict,
770
  summary_stat=None,
771
  silent=True,
772
  )
 
38
  import os
39
  import pickle
40
  from collections import defaultdict
 
41
  from typing import List
42
 
43
+ import seaborn as sns
44
  import torch
45
+ from datasets import Dataset
46
  from tqdm.auto import trange
47
 
48
  from . import perturber_utils as pu
49
  from .emb_extractor import get_embs
50
+ from .tokenizer import TOKEN_DICTIONARY_FILE
51
+
52
+ sns.set()
53
 
 
54
 
55
  logger = logging.getLogger(__name__)
56
 
 
185
  token_dictionary_file : Path
186
  | Path to pickle file containing token dictionary (Ensembl ID:token).
187
  """
 
 
 
 
188
 
189
  self.perturb_type = perturb_type
190
  self.perturb_rank_shift = perturb_rank_shift
 
222
  # load token dictionary (Ensembl IDs:token)
223
  with open(token_dictionary_file, "rb") as f:
224
  self.gene_token_dict = pickle.load(f)
 
225
 
226
  self.pad_token_id = self.gene_token_dict.get("<pad>")
227
 
 
422
  self.max_len = pu.get_model_input_size(model)
423
  layer_to_quant = pu.quant_layers(model) + self.emb_layer
424
 
 
425
  ### filter input data ###
426
  # general filtering of input data based on filter_data argument
427
  filtered_input_data = pu.load_and_filter(
 
520
  perturbed_data = filtered_input_data.map(
521
  make_group_perturbation_batch, num_proc=self.nproc
522
  )
 
523
  if self.perturb_type == "overexpress":
524
  filtered_input_data = filtered_input_data.add_column(
525
  "n_overflow", perturbed_data["n_overflow"]
 
552
  layer_to_quant,
553
  self.pad_token_id,
554
  self.forward_batch_size,
 
555
  summary_stat=None,
556
  silent=True,
557
  )
 
571
  layer_to_quant,
572
  self.pad_token_id,
573
  self.forward_batch_size,
 
574
  summary_stat=None,
575
  silent=True,
576
  )
 
730
  layer_to_quant,
731
  self.pad_token_id,
732
  self.forward_batch_size,
 
733
  summary_stat=None,
734
  silent=True,
735
  )
 
757
  layer_to_quant,
758
  self.pad_token_id,
759
  self.forward_batch_size,
 
760
  summary_stat=None,
761
  silent=True,
762
  )
geneformer/in_silico_perturber_stats.py CHANGED
@@ -38,7 +38,9 @@ from sklearn.mixture import GaussianMixture
38
  from tqdm.auto import tqdm, trange
39
 
40
  from .perturber_utils import flatten_list, validate_cell_states_to_model
41
- from . import TOKEN_DICTIONARY_FILE, ENSEMBL_DICTIONARY_FILE
 
 
42
 
43
  logger = logging.getLogger(__name__)
44
 
@@ -190,48 +192,22 @@ def get_impact_component(test_value, gaussian_mixture_model):
190
 
191
 
192
  # aggregate data for single perturbation in multiple cells
193
- def isp_aggregate_grouped_perturb(cos_sims_df, dict_list, genes_perturbed):
194
- names = ["Cosine_sim", "Gene"]
195
- cos_sims_full_dfs = []
196
- if isinstance(genes_perturbed,list):
197
- if len(genes_perturbed)>1:
198
- gene_ids_df = cos_sims_df.loc[np.isin([set(idx) for idx in cos_sims_df["Ensembl_ID"]], set(genes_perturbed)), :]
199
- else:
200
- gene_ids_df = cos_sims_df.loc[np.isin(cos_sims_df["Ensembl_ID"], genes_perturbed), :]
201
- else:
202
- logger.error(
203
- "aggregate_data is for perturbation of single gene or single group of genes. genes_to_perturb should be formatted as list."
204
- )
205
- raise
206
-
207
- if gene_ids_df.empty:
208
- logger.error(
209
- "genes_to_perturb not found in data."
210
- )
211
- raise
212
-
213
- tokens = gene_ids_df["Gene"]
214
- symbols = gene_ids_df["Gene_name"]
215
-
216
- for token, symbol in zip(tokens, symbols):
217
- cos_shift_data = []
218
- for dict_i in dict_list:
219
- cos_shift_data += dict_i.get((token, "cell_emb"), [])
220
 
221
- df = pd.DataFrame(columns=names)
222
- df["Cosine_sim"] = cos_shift_data
223
- df["Gene"] = symbol
224
- cos_sims_full_dfs.append(df)
225
-
226
- return pd.concat(cos_sims_full_dfs)
227
 
228
 
229
  def find(variable, x):
230
  try:
231
  if x in variable: # Test if variable is iterable and contains x
232
  return True
233
- elif x == variable:
234
- return True
235
  except (ValueError, TypeError):
236
  return x == variable # Test if variable is x if non-iterable
237
 
@@ -272,15 +248,15 @@ def isp_aggregate_gene_shifts(
272
  cos_sims_full_df["Affected_Ensembl_ID"] = [
273
  gene_token_id_dict.get(token, np.nan) for token in cos_sims_full_df["Affected"]
274
  ]
275
- cos_sims_full_df["Cosine_sim_mean"] = [v[0] for k, v in cos_data_mean.items()]
276
- cos_sims_full_df["Cosine_sim_stdev"] = [v[1] for k, v in cos_data_mean.items()]
277
  cos_sims_full_df["N_Detections"] = [v[2] for k, v in cos_data_mean.items()]
278
 
279
  specific_val = "cell_emb"
280
  cos_sims_full_df["temp"] = list(cos_sims_full_df["Affected"] == specific_val)
281
- # reorder so cell embs are at the top and all are subordered by magnitude of cosine sim
282
  cos_sims_full_df = cos_sims_full_df.sort_values(
283
- by=(["temp", "Cosine_sim_mean"]), ascending=[False, True]
284
  ).drop("temp", axis=1)
285
 
286
  return cos_sims_full_df
@@ -671,7 +647,7 @@ class InSilicoPerturberStats:
671
  cell_states_to_model=None,
672
  pickle_suffix="_raw.pickle",
673
  token_dictionary_file=TOKEN_DICTIONARY_FILE,
674
- gene_name_id_dictionary_file=ENSEMBL_DICTIONARY_FILE,
675
  ):
676
  """
677
  Initialize in silico perturber stats generator.
@@ -938,11 +914,11 @@ class InSilicoPerturberStats:
938
  | 1: within impact component; 0: not within impact component
939
  | "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
940
 
941
- | In case of aggregating data / gene shifts:
942
  | "Perturbed": ID(s) of gene(s) being perturbed
943
  | "Affected": ID of affected gene or "cell_emb" indicating the impact on the cell embedding as a whole
944
- | "Cosine_sim_mean": mean of cosine similarity of cell or affected gene in original vs. perturbed
945
- | "Cosine_sim_stdev": standard deviation of cosine similarity of cell or affected gene in original vs. perturbed
946
  """
947
 
948
  if self.mode not in [
@@ -1041,8 +1017,8 @@ class InSilicoPerturberStats:
1041
  cos_sims_df_initial, dict_list, self.combos, self.anchor_token
1042
  )
1043
 
1044
- elif self.mode == "aggregate_data":
1045
- cos_sims_df = isp_aggregate_grouped_perturb(cos_sims_df_initial, dict_list, self.genes_perturbed)
1046
 
1047
  elif self.mode == "aggregate_gene_shifts":
1048
  cos_sims_df = isp_aggregate_gene_shifts(
 
38
  from tqdm.auto import tqdm, trange
39
 
40
  from .perturber_utils import flatten_list, validate_cell_states_to_model
41
+ from .tokenizer import TOKEN_DICTIONARY_FILE
42
+
43
+ GENE_NAME_ID_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
44
 
45
  logger = logging.getLogger(__name__)
46
 
 
192
 
193
 
194
  # aggregate data for single perturbation in multiple cells
195
+ def isp_aggregate_grouped_perturb(cos_sims_df, dict_list):
196
+ names = ["Cosine_shift"]
197
+ cos_sims_full_df = pd.DataFrame(columns=names)
198
 
199
+ cos_shift_data = []
200
+ token = cos_sims_df["Gene"][0]
201
+ for dict_i in dict_list:
202
+ cos_shift_data += dict_i.get((token, "cell_emb"), [])
203
+ cos_sims_full_df["Cosine_shift"] = cos_shift_data
204
+ return cos_sims_full_df
205
 
206
 
207
  def find(variable, x):
208
  try:
209
  if x in variable: # Test if variable is iterable and contains x
210
  return True
 
 
211
  except (ValueError, TypeError):
212
  return x == variable # Test if variable is x if non-iterable
213
 
 
248
  cos_sims_full_df["Affected_Ensembl_ID"] = [
249
  gene_token_id_dict.get(token, np.nan) for token in cos_sims_full_df["Affected"]
250
  ]
251
+ cos_sims_full_df["Cosine_shift_mean"] = [v[0] for k, v in cos_data_mean.items()]
252
+ cos_sims_full_df["Cosine_shift_stdev"] = [v[1] for k, v in cos_data_mean.items()]
253
  cos_sims_full_df["N_Detections"] = [v[2] for k, v in cos_data_mean.items()]
254
 
255
  specific_val = "cell_emb"
256
  cos_sims_full_df["temp"] = list(cos_sims_full_df["Affected"] == specific_val)
257
+ # reorder so cell embs are at the top and all are subordered by magnitude of cosine shift
258
  cos_sims_full_df = cos_sims_full_df.sort_values(
259
+ by=(["temp", "Cosine_shift_mean"]), ascending=[False, False]
260
  ).drop("temp", axis=1)
261
 
262
  return cos_sims_full_df
 
647
  cell_states_to_model=None,
648
  pickle_suffix="_raw.pickle",
649
  token_dictionary_file=TOKEN_DICTIONARY_FILE,
650
+ gene_name_id_dictionary_file=GENE_NAME_ID_DICTIONARY_FILE,
651
  ):
652
  """
653
  Initialize in silico perturber stats generator.
 
914
  | 1: within impact component; 0: not within impact component
915
  | "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
916
 
917
+ | In case of aggregating gene shifts:
918
  | "Perturbed": ID(s) of gene(s) being perturbed
919
  | "Affected": ID of affected gene or "cell_emb" indicating the impact on the cell embedding as a whole
920
+ | "Cosine_shift_mean": mean of cosine shift of modeled perturbation on affected gene or cell
921
+ | "Cosine_shift_stdev": standard deviation of cosine shift of modeled perturbation on affected gene or cell
922
  """
923
 
924
  if self.mode not in [
 
1017
  cos_sims_df_initial, dict_list, self.combos, self.anchor_token
1018
  )
1019
 
1020
+ elif self.mode == "aggregate_data":
1021
+ cos_sims_df = isp_aggregate_grouped_perturb(cos_sims_df_initial, dict_list)
1022
 
1023
  elif self.mode == "aggregate_gene_shifts":
1024
  cos_sims_df = isp_aggregate_gene_shifts(
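A hedged usage sketch for the restored "aggregate_data" mode. The mode string and the genes_perturbed attribute come from this diff; whether they are constructor keywords, and the get_stats entry point with its argument order, are assumptions about the surrounding API, and all paths are illustrative.

```python
from geneformer import InSilicoPerturberStats

ispstats = InSilicoPerturberStats(
    mode="aggregate_data",                # routed to isp_aggregate_grouped_perturb above
    genes_perturbed=["ENSG00000136574"],  # hypothetical single perturbed gene
)

# Assumed entry point (illustrative paths and prefix):
ispstats.get_stats(
    "path/to/in_silico_perturber_output",
    None,
    "path/to/stats_output",
    "perturb_stats",
)
```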
geneformer/perturber_utils.py CHANGED
@@ -4,8 +4,6 @@ import pickle
4
  import re
5
  from collections import defaultdict
6
  from typing import List
7
- from pathlib import Path
8
-
9
 
10
  import numpy as np
11
  import pandas as pd
@@ -18,8 +16,7 @@ from transformers import (
18
  BertForTokenClassification,
19
  )
20
 
21
- from . import GENE_MEDIAN_FILE, TOKEN_DICTIONARY_FILE, ENSEMBL_DICTIONARY_FILE
22
-
23
 
24
  logger = logging.getLogger(__name__)
25
 
@@ -152,12 +149,8 @@ def quant_layers(model):
152
  return int(max(layer_nums)) + 1
153
 
154
 
155
- def get_model_emb_dims(model):
156
- return model.config.hidden_size
157
-
158
-
159
  def get_model_input_size(model):
160
- return model.config.max_position_embeddings
161
 
162
 
163
  def flatten_list(megalist):
@@ -588,11 +581,9 @@ def quant_cos_sims(
588
  elif emb_mode == "cell":
589
  cos = torch.nn.CosineSimilarity(dim=1)
590
 
591
- # if emb_mode == "gene", can only calculate gene cos sims
592
- # against original cell anyways
593
- if cell_states_to_model is None or emb_mode == "gene":
594
  cos_sims = cos(perturbation_emb, original_emb).to("cuda")
595
- elif cell_states_to_model is not None and emb_mode == "cell":
596
  possible_states = get_possible_states(cell_states_to_model)
597
  cos_sims = dict(zip(possible_states, [[] for _ in range(len(possible_states))]))
598
  for state in possible_states:
@@ -714,48 +705,3 @@ def validate_cell_states_to_model(cell_states_to_model):
714
  "'alt_states': ['hcm', 'other1', 'other2']}"
715
  )
716
  raise
717
-
718
- class GeneIdHandler:
719
- def __init__(self, raise_errors=False):
720
- def invert_dict(dict_obj):
721
- return {v:k for k,v in dict_obj.items()}
722
-
723
- self.raise_errors = raise_errors
724
-
725
- with open(TOKEN_DICTIONARY_FILE, 'rb') as f:
726
- self.gene_token_dict = pickle.load(f)
727
- self.token_gene_dict = invert_dict(self.gene_token_dict)
728
-
729
- with open(ENSEMBL_DICTIONARY_FILE, 'rb') as f:
730
- self.id_gene_dict = pickle.load(f)
731
- self.gene_id_dict = invert_dict(self.id_gene_dict)
732
-
733
- def ens_to_token(self, ens_id):
734
- if not self.raise_errors:
735
- return self.gene_token_dict.get(ens_id, ens_id)
736
- else:
737
- return self.gene_token_dict[ens_id]
738
-
739
- def token_to_ens(self, token):
740
- if not self.raise_errors:
741
- return self.token_gene_dict.get(token, token)
742
- else:
743
- return self.token_gene_dict[token]
744
-
745
- def ens_to_symbol(self, ens_id):
746
- if not self.raise_errors:
747
- return self.gene_id_dict.get(ens_id, ens_id)
748
- else:
749
- return self.gene_id_dict[ens_id]
750
-
751
- def symbol_to_ens(self, symbol):
752
- if not self.raise_errors:
753
- return self.id_gene_dict.get(symbol, symbol)
754
- else:
755
- return self.id_gene_dict[symbol]
756
-
757
- def token_to_symbol(self, token):
758
- return self.ens_to_symbol(self.token_to_ens(token))
759
-
760
- def symbol_to_token(self, symbol):
761
- return self.ens_to_token(self.symbol_to_ens(symbol))
 
4
  import re
5
  from collections import defaultdict
6
  from typing import List
 
 
7
 
8
  import numpy as np
9
  import pandas as pd
 
16
  BertForTokenClassification,
17
  )
18
 
19
+ sns.set()
 
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
149
  return int(max(layer_nums)) + 1
150
 
151
 
 
 
 
 
152
  def get_model_input_size(model):
153
+ return int(re.split(r"\(|,", str(model.bert.embeddings.position_embeddings))[1])
154
 
155
 
156
  def flatten_list(megalist):
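get_model_input_size above now recovers the maximum input length by parsing the repr of the position-embedding layer rather than reading model.config.max_position_embeddings. A minimal demonstration of the same parsing on a bare torch embedding (sizes are illustrative):

```python
import re
import torch

position_embeddings = torch.nn.Embedding(2048, 256)   # (max positions, hidden size)
print(str(position_embeddings))                        # Embedding(2048, 256)

max_input_size = int(re.split(r"\(|,", str(position_embeddings))[1])
print(max_input_size)                                  # 2048
```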
 
581
  elif emb_mode == "cell":
582
  cos = torch.nn.CosineSimilarity(dim=1)
583
 
584
+ if cell_states_to_model is None:
 
 
585
  cos_sims = cos(perturbation_emb, original_emb).to("cuda")
586
+ else:
587
  possible_states = get_possible_states(cell_states_to_model)
588
  cos_sims = dict(zip(possible_states, [[] for _ in range(len(possible_states))]))
589
  for state in possible_states:
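In quant_cos_sims, the perturbed and original embeddings are compared row-wise with torch's CosineSimilarity along dim=1 (the GPU transfer in the diff is omitted here for portability). Toy shapes and values:

```python
import torch

cos = torch.nn.CosineSimilarity(dim=1)

original_emb = torch.randn(4, 256)      # 4 rows (genes or cells) x 256-dim embeddings
perturbation_emb = torch.randn(4, 256)

cos_sims = cos(perturbation_emb, original_emb)
print(cos_sims.shape)                   # torch.Size([4]) -> one similarity per row
```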
 
705
  "'alt_states': ['hcm', 'other1', 'other2']}"
706
  )
707
  raise
 
geneformer/pretrainer.py CHANGED
@@ -32,7 +32,7 @@ from transformers.training_args import ParallelMode
32
  from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
33
  from transformers.utils.generic import _is_tensorflow, _is_torch
34
 
35
- from . import TOKEN_DICTIONARY_FILE
36
 
37
  logger = logging.get_logger(__name__)
38
  EncodedInput = List[int]
@@ -106,8 +106,9 @@ class TensorType(ExplicitEnum):
106
 
107
  class GeneformerPreCollator(SpecialTokensMixin):
108
  def __init__(self, *args, **kwargs) -> None:
109
- super().__init__(mask_token="<mask>", pad_token="<pad>")
110
-
 
111
  self.token_dictionary = kwargs.get("token_dictionary")
112
  # self.mask_token = "<mask>"
113
  # self.mask_token_id = self.token_dictionary.get("<mask>")
@@ -119,8 +120,8 @@ class GeneformerPreCollator(SpecialTokensMixin):
119
  # self.token_dictionary.get("<pad>"),
120
  # ]
121
  self.model_input_names = ["input_ids"]
122
-
123
- def convert_ids_to_tokens(self, value):
124
  return self.token_dictionary.get(value)
125
 
126
  def _get_padding_truncation_strategies(
@@ -390,6 +391,7 @@ class GeneformerPreCollator(SpecialTokensMixin):
390
 
391
  for key, value in encoded_inputs.items():
392
  encoded_inputs[key] = to_py_obj(value)
 
393
 
394
  # Convert padding_strategy in PaddingStrategy
395
  padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
@@ -594,17 +596,15 @@ class GeneformerPreCollator(SpecialTokensMixin):
594
 
595
  class GeneformerPretrainer(Trainer):
596
  def __init__(self, *args, **kwargs):
597
- data_collator = kwargs.get("data_collator", None)
598
  token_dictionary = kwargs.pop("token_dictionary")
599
- mlm = kwargs.pop("mlm", True)
600
- mlm_probability = kwargs.pop("mlm_probability", 0.15)
601
 
602
  if data_collator is None:
603
  precollator = GeneformerPreCollator(token_dictionary=token_dictionary)
604
 
605
  # # Data Collator Functions
606
  data_collator = DataCollatorForLanguageModeling(
607
- tokenizer=precollator, mlm=mlm, mlm_probability=mlm_probability
608
  )
609
  kwargs["data_collator"] = data_collator
610
 
@@ -694,7 +694,6 @@ class CustomDistributedLengthGroupedSampler(DistributedLengthGroupedSampler):
694
  Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
695
  length while keeping a bit of randomness.
696
  """
697
-
698
  # Copied and adapted from PyTorch DistributedSampler.
699
  def __init__(
700
  self,
@@ -758,7 +757,7 @@ class CustomDistributedLengthGroupedSampler(DistributedLengthGroupedSampler):
758
  # Deterministically shuffle based on epoch and seed
759
  g = torch.Generator()
760
  g.manual_seed(self.seed + self.epoch)
761
-
762
  indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)
763
 
764
  if not self.drop_last:
 
32
  from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
33
  from transformers.utils.generic import _is_tensorflow, _is_torch
34
 
35
+ from .tokenizer import TOKEN_DICTIONARY_FILE
36
 
37
  logger = logging.get_logger(__name__)
38
  EncodedInput = List[int]
 
106
 
107
  class GeneformerPreCollator(SpecialTokensMixin):
108
  def __init__(self, *args, **kwargs) -> None:
109
+
110
+ super().__init__(mask_token="<mask>", pad_token="<pad>")
111
+
112
  self.token_dictionary = kwargs.get("token_dictionary")
113
  # self.mask_token = "<mask>"
114
  # self.mask_token_id = self.token_dictionary.get("<mask>")
 
120
  # self.token_dictionary.get("<pad>"),
121
  # ]
122
  self.model_input_names = ["input_ids"]
123
+
124
+ def convert_ids_to_tokens(self, value):
125
  return self.token_dictionary.get(value)
126
 
127
  def _get_padding_truncation_strategies(
 
391
 
392
  for key, value in encoded_inputs.items():
393
  encoded_inputs[key] = to_py_obj(value)
394
+
395
 
396
  # Convert padding_strategy in PaddingStrategy
397
  padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
 
596
 
597
  class GeneformerPretrainer(Trainer):
598
  def __init__(self, *args, **kwargs):
599
+ data_collator = kwargs.get("data_collator", None)
600
  token_dictionary = kwargs.pop("token_dictionary")
 
 
601
 
602
  if data_collator is None:
603
  precollator = GeneformerPreCollator(token_dictionary=token_dictionary)
604
 
605
  # # Data Collator Functions
606
  data_collator = DataCollatorForLanguageModeling(
607
+ tokenizer=precollator, mlm=True, mlm_probability=0.15
608
  )
609
  kwargs["data_collator"] = data_collator
610
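After this change the masking settings are fixed inside GeneformerPretrainer rather than taken from mlm / mlm_probability kwargs. A sketch of the equivalent default collator construction (the token dictionary path is illustrative):

```python
import pickle
from transformers import DataCollatorForLanguageModeling
from geneformer.pretrainer import GeneformerPreCollator

with open("token_dictionary.pkl", "rb") as f:   # illustrative path
    token_dictionary = pickle.load(f)

# Mirrors the internal default built by GeneformerPretrainer when no data_collator is passed:
precollator = GeneformerPreCollator(token_dictionary=token_dictionary)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=precollator, mlm=True, mlm_probability=0.15
)
```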
 
 
694
  Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
695
  length while keeping a bit of randomness.
696
  """
 
697
  # Copied and adapted from PyTorch DistributedSampler.
698
  def __init__(
699
  self,
 
757
  # Deterministically shuffle based on epoch and seed
758
  g = torch.Generator()
759
  g.manual_seed(self.seed + self.epoch)
760
+
761
  indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)
762
 
763
  if not self.drop_last:
geneformer/tokenizer.py CHANGED
@@ -52,7 +52,8 @@ import loompy as lp # noqa
52
 
53
  logger = logging.getLogger(__name__)
54
 
55
- from . import GENE_MEDIAN_FILE, TOKEN_DICTIONARY_FILE
 
56
 
57
 
58
  def rank_genes(gene_vector, gene_tokens):
@@ -102,7 +103,7 @@ class TranscriptomeTokenizer:
102
  model_input_size : int = 2048
103
  | Max input size of model to truncate input to.
104
  special_token : bool = False
105
- | Adds CLS token before and EOS token after rank value encoding.
106
  gene_median_file : Path
107
  | Path to pickle file containing dictionary of non-zero median
108
  | gene expression values across Genecorpus-30M.
@@ -122,7 +123,7 @@ class TranscriptomeTokenizer:
122
  # input size for tokenization
123
  self.model_input_size = model_input_size
124
 
125
- # add CLS and EOS tokens
126
  self.special_token = special_token
127
 
128
  # load dictionary of gene normalization factors
@@ -175,7 +176,7 @@ class TranscriptomeTokenizer:
175
  )
176
 
177
  output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
178
- tokenized_dataset.save_to_disk(str(output_path))
179
 
180
  def tokenize_files(
181
  self, data_directory, file_format: Literal["loom", "h5ad"] = "loom"
@@ -377,14 +378,14 @@ class TranscriptomeTokenizer:
377
  if self.special_token:
378
  example["input_ids"] = example["input_ids"][
379
  0 : self.model_input_size - 2
380
- ] # truncate to leave space for CLS and EOS token
381
  example["input_ids"] = np.insert(
382
  example["input_ids"], 0, self.gene_token_dict.get("<cls>")
383
  )
384
  example["input_ids"] = np.insert(
385
  example["input_ids"],
386
  len(example["input_ids"]),
387
- self.gene_token_dict.get("<eos>"),
388
  )
389
  else:
390
  # Truncate/Crop input_ids to input size
 
52
 
53
  logger = logging.getLogger(__name__)
54
 
55
+ GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary.pkl"
56
+ TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary.pkl"
57
 
58
 
59
  def rank_genes(gene_vector, gene_tokens):
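With this diff the dictionary path constants live in geneformer/tokenizer.py again, and the other modules import them from there (as pretrainer.py and in_silico_perturber_stats.py do above). A quick check of the package-relative paths:

```python
from geneformer.tokenizer import GENE_MEDIAN_FILE, TOKEN_DICTIONARY_FILE

print(GENE_MEDIAN_FILE.name)       # gene_median_dictionary.pkl
print(TOKEN_DICTIONARY_FILE.name)  # token_dictionary.pkl
```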
 
103
  model_input_size : int = 2048
104
  | Max input size of model to truncate input to.
105
  special_token : bool = False
106
+ | Adds CLS token before and SEP token after rank value encoding.
107
  gene_median_file : Path
108
  | Path to pickle file containing dictionary of non-zero median
109
  | gene expression values across Genecorpus-30M.
 
123
  # input size for tokenization
124
  self.model_input_size = model_input_size
125
 
126
+ # add CLS and SEP tokens
127
  self.special_token = special_token
128
 
129
  # load dictionary of gene normalization factors
 
176
  )
177
 
178
  output_path = (Path(output_directory) / output_prefix).with_suffix(".dataset")
179
+ tokenized_dataset.save_to_disk(output_path)
180
 
181
  def tokenize_files(
182
  self, data_directory, file_format: Literal["loom", "h5ad"] = "loom"
 
378
  if self.special_token:
379
  example["input_ids"] = example["input_ids"][
380
  0 : self.model_input_size - 2
381
+ ] # truncate to leave space for CLS and SEP token
382
  example["input_ids"] = np.insert(
383
  example["input_ids"], 0, self.gene_token_dict.get("<cls>")
384
  )
385
  example["input_ids"] = np.insert(
386
  example["input_ids"],
387
  len(example["input_ids"]),
388
+ self.gene_token_dict.get("<sep>"),
389
  )
390
  else:
391
  # Truncate/Crop input_ids to input size
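The special_token branch above truncates to model_input_size - 2 and then adds the `<cls>` and `<sep>` ids with np.insert. A toy illustration with made-up token ids and a tiny input size:

```python
import numpy as np

model_input_size = 8                        # illustrative; the real default is 2048
gene_token_dict = {"<cls>": 0, "<sep>": 1}  # hypothetical special-token ids
input_ids = np.arange(10, 22)               # 12 toy rank-value-encoded gene tokens

input_ids = input_ids[0 : model_input_size - 2]                                 # room for <cls>/<sep>
input_ids = np.insert(input_ids, 0, gene_token_dict.get("<cls>"))               # prepend <cls>
input_ids = np.insert(input_ids, len(input_ids), gene_token_dict.get("<sep>"))  # append <sep>

print(input_ids)        # [ 0 10 11 12 13 14 15  1]  -> length == model_input_size
```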