GenePooler #429
opened by jamieb-nvs

Files changed:
- README.md +0 -3
- examples/cell_classification.ipynb +3 -7
- examples/extract_and_plot_cell_embeddings.ipynb +4 -7
- examples/gene_classification.ipynb +3 -6
- examples/in_silico_perturbation.ipynb +8 -18
- examples/tokenizing_scRNAseq_data.ipynb +5 -15
- geneformer/emb_extractor.py +1 -13
- geneformer/gene_name_id_dict_gc95M.pkl +2 -2
- geneformer/in_silico_perturber.py +6 -28
- geneformer/in_silico_perturber_stats.py +4 -10
- geneformer/mtl/data.py +105 -117
- geneformer/pretrainer.py +176 -6
- geneformer/tokenizer.py +36 -59
- requirements.txt +1 -1
README.md
@@ -1,9 +1,6 @@
 ---
 datasets: ctheodoris/Genecorpus-30M
 license: apache-2.0
-tags:
-- single-cell
-- genomics
 ---
 # Geneformer
 Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
examples/cell_classification.ipynb
@@ -68,10 +68,6 @@
 " \"per_device_train_batch_size\": 12,\n",
 " \"seed\": 73,\n",
 "}\n",
-"\n",
-"# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
-"# (otherwise the Classifier will use the current default model dictionary)\n",
-"# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
 "cc = Classifier(classifier=\"cell\",\n",
 " cell_state_dict = {\"state_key\": \"disease\", \"states\": \"all\"},\n",
 " filter_data=filter_data_dict,\n",
@@ -129,7 +125,7 @@
 " \"train\": train_ids+eval_ids,\n",
 " \"test\": test_ids}\n",
 "\n",
-"# Example input_data_file …
+"# Example input_data_file: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
 "cc.prepare_data(input_data_file=\"/path/to/human_dcm_hcm_nf_2048_w_length.dataset\",\n",
 " output_directory=output_dir,\n",
 " output_prefix=output_prefix,\n",
@@ -264,7 +260,7 @@
 " \"train\": train_ids,\n",
 " \"eval\": eval_ids}\n",
 "\n",
-"# …
+"# 6 layer Geneformer: https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors\n",
 "all_metrics = cc.validate(model_directory=\"/path/to/Geneformer\",\n",
 " prepared_input_data_file=f\"{output_dir}/{output_prefix}_labeled_train.dataset\",\n",
 " id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
@@ -450,7 +446,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.…
+"version": "3.11.5"
 }
 },
 "nbformat": 4,
examples/extract_and_plot_cell_embeddings.ipynb
@@ -18,8 +18,6 @@
 "outputs": [],
 "source": [
 "# initiate EmbExtractor\n",
-"# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
-"# (otherwise the EmbExtractor will use the current default model dictionary)\n",
 "embex = EmbExtractor(model_type=\"CellClassifier\",\n",
 " num_classes=3,\n",
 " filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
@@ -28,13 +26,12 @@
 " emb_label=[\"disease\",\"cell_type\"],\n",
 " labels_to_plot=[\"disease\"],\n",
 " forward_batch_size=200,\n",
-" nproc=16 …
-" token_dictionary_file=\"./gene_dictionaries_30m/token_dictionary_gc30M.pkl\") # change from current default dictionary for 30M model series\n",
+" nproc=16)\n",
 "\n",
 "# extracts embedding from input data\n",
 "# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
-"# example dataset …
-"embs = embex.extract_embs(\"../fine_tuned_models/ …
+"# example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
+"embs = embex.extract_embs(\"../fine_tuned_models/geneformer-6L-30M_CellClassifier_cardiomyopathies_220224\",\n",
 " \"path/to/input_data/\",\n",
 " \"path/to/output_directory/\",\n",
 " \"output_prefix\")\n"
@@ -132,7 +129,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.…
+"version": "3.11.5"
 }
 },
 "nbformat": 4,
examples/gene_classification.ipynb
@@ -71,9 +71,6 @@
 }
 ],
 "source": [
-"# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
-"# (otherwise the Classifier will use the current default model dictionary)\n",
-"# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
 "cc = Classifier(classifier=\"gene\",\n",
 " gene_class_dict = gene_class_dict,\n",
 " max_ncells = 10_000,\n",
@@ -105,7 +102,7 @@
 }
 ],
 "source": [
-"# Example input_data_file …
+"# Example input_data_file: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/gene_classification/dosage_sensitive_tfs/gc-30M_sample50k.dataset\n",
 "cc.prepare_data(input_data_file=\"/path/to/gc-30M_sample50k.dataset\",\n",
 " output_directory=output_dir,\n",
 " output_prefix=output_prefix)"
@@ -843,7 +840,7 @@
 }
 ],
 "source": [
-"# 6 layer …
+"# 6 layer Geneformer: https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors\n",
 "all_metrics = cc.validate(model_directory=\"/path/to/Geneformer\",\n",
 " prepared_input_data_file=f\"{output_dir}/{output_prefix}_labeled.dataset\",\n",
 " id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
@@ -1243,7 +1240,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.…
+"version": "3.11.5"
 }
 },
 "nbformat": 4,
examples/in_silico_perturbation.ipynb
@@ -39,10 +39,7 @@
 "\n",
 "filter_data_dict={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]}\n",
 "\n",
-"# …
-"# (otherwise the EmbExtractor will use the current default model dictionary)\n",
-"# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
-"embex = EmbExtractor(model_type=\"CellClassifier\", # if using previously fine-tuned cell classifier model\n",
+"embex = EmbExtractor(model_type=\"CellClassifier\",\n",
 " num_classes=3,\n",
 " filter_data=filter_data_dict,\n",
 " max_ncells=1000,\n",
@@ -52,7 +49,7 @@
 " nproc=16)\n",
 "\n",
 "state_embs_dict = embex.get_state_embs(cell_states_to_model,\n",
-" \" …
+" \"path/to/model\",\n",
 " \"path/to/input_data\",\n",
 " \"path/to/output_directory\",\n",
 " \"output_prefix\")"
@@ -67,15 +64,12 @@
 },
 "outputs": [],
 "source": [
-"# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
-"# (otherwise the InSilicoPerturber will use the current default model dictionary)\n",
-"# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
 "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
 " perturb_rank_shift=None,\n",
 " genes_to_perturb=\"all\",\n",
 " combos=0,\n",
 " anchor_gene=None,\n",
-" model_type=\"CellClassifier\" …
+" model_type=\"CellClassifier\",\n",
 " num_classes=3,\n",
 " emb_mode=\"cell\",\n",
 " cell_emb_style=\"mean_pool\",\n",
@@ -96,10 +90,9 @@
 "outputs": [],
 "source": [
 "# outputs intermediate files from in silico perturbation\n",
-"\n",
-"isp.perturb_data(\"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
+"isp.perturb_data(\"path/to/model\",\n",
 " \"path/to/input_data\",\n",
-" \"path/to/ …
+" \"path/to/output_directory\",\n",
 " \"output_prefix\")"
 ]
 },
@@ -110,9 +103,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
-"# (otherwise the InSilicoPerturberStats will use the current default model dictionary)\n",
-"# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
 "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
 " genes_perturbed=\"all\",\n",
 " combos=0,\n",
@@ -128,9 +118,9 @@
 "outputs": [],
 "source": [
 "# extracts data from intermediate files and processes stats to output in final .csv\n",
-"ispstats.get_stats(\"path/to/ …
+"ispstats.get_stats(\"path/to/input_data\",\n",
 " None,\n",
-" \"path/to/ …
+" \"path/to/output_directory\",\n",
 " \"output_prefix\")"
 ]
 }
@@ -151,7 +141,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.…
+"version": "3.10.11"
 }
 },
 "nbformat": 4,
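For orientation, the edited cells above chain into one pipeline once the placeholders are filled in. A minimal sketch under the notebook's assumptions (a fine-tuned 3-class cell classifier; cell_states_to_model shown with illustrative disease states; all paths are placeholders):

from geneformer import EmbExtractor, InSilicoPerturber, InSilicoPerturberStats

filter_data_dict = {"cell_type": ["Cardiomyocyte1", "Cardiomyocyte2", "Cardiomyocyte3"]}
# illustrative goal-state definition; adjust keys/values to your dataset
cell_states_to_model = {"state_key": "disease", "start_state": "dcm",
                        "goal_state": "nf", "alt_states": ["hcm"]}

# 1) embed each modeled cell state (exact mean summary, as in the notebook)
embex = EmbExtractor(model_type="CellClassifier", num_classes=3,
                     filter_data=filter_data_dict, max_ncells=1000,
                     summary_stat="exact_mean", forward_batch_size=200, nproc=16)
state_embs_dict = embex.get_state_embs(cell_states_to_model, "path/to/model",
                                       "path/to/input_data",
                                       "path/to/output_directory", "output_prefix")

# 2) delete each gene in silico; intermediate cosine-shift files are written out
isp = InSilicoPerturber(perturb_type="delete", genes_to_perturb="all",
                        model_type="CellClassifier", num_classes=3, emb_mode="cell",
                        cell_states_to_model=cell_states_to_model,
                        state_embs_dict=state_embs_dict, nproc=16)
isp.perturb_data("path/to/model", "path/to/input_data",
                 "path/to/output_directory", "output_prefix")

# 3) aggregate the intermediate files into the final stats .csv
ispstats = InSilicoPerturberStats(mode="goal_state_shift", genes_perturbed="all",
                                  combos=0, anchor_gene=None,
                                  cell_states_to_model=cell_states_to_model)
ispstats.get_stats("path/to/input_data", None,
                   "path/to/output_directory", "output_prefix")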
examples/tokenizing_scRNAseq_data.ipynb
@@ -12,7 +12,7 @@
 },
 {
 "cell_type": "markdown",
-"id": " …
+"id": "350e6252-b783-494b-9767-f087eb868a15",
 "metadata": {},
 "source": [
 "#### Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.\n",
@@ -25,21 +25,11 @@
 "\n",
 "#### Additionally, if the original .loom file contains a cell column attribute called \"filter_pass\", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with \"1\" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.\n",
 "\n",
-"#### If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer …
-]
-},
-{
-"cell_type": "markdown",
-"id": "32c69493-4e5a-4b07-8dc1-958ff2ee7d0b",
-"metadata": {},
-"source": [
-"**********************************************************************************************************\n",
+"#### If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.\n",
+"\n",
 "#### OF NOTE: PLEASE ENSURE THE CORRECT TOKEN DICTIONARY AND GENE MEDIAN FILE IS USED FOR THE CORRECT MODEL.\n",
-"#### 95M: current defaults; 30M: https://huggingface.co/ctheodoris/Geneformer/tree/main/geneformer/gene_dictionaries_30m\n",
 "\n",
-"#### …
-"#### The 95M model series require the special_token argument to be set to True and model_input_size to be 4096. (current defaults)\n",
-"#### The 30M model series require the special_token argument to be set to False and the model_input_size to be 2048."
+"#### The 95M model series also require the special_token argument to be set to True and model_input_size to be 4096."
 ]
 },
 {
@@ -83,7 +73,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.…
+"version": "3.10.11"
 }
 },
 "nbformat": 4,
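Since the consolidated markdown cell now leans on the 95M defaults, a short sketch with those defaults spelled out may help (paths and custom_attr_name_dict entries are placeholders; for a 30M-series model one would instead pass special_token=False, model_input_size=2048, plus the gc-30M token/median dictionaries):

from geneformer import TranscriptomeTokenizer

# 95M-series defaults made explicit: special_token=True, model_input_size=4096
tk = TranscriptomeTokenizer(
    custom_attr_name_dict={"cell_type": "cell_type", "organ_major": "organ"},
    nproc=16,
    special_token=True,
    model_input_size=4096,
)
tk.tokenize_data(
    "path/to/loom_or_h5ad_directory",  # directory of .loom or .h5ad files with raw counts
    "path/to/output_directory",
    "output_prefix",
    file_format="loom",                # or "h5ad", per the markdown cell above
)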
geneformer/emb_extractor.py
@@ -411,7 +411,7 @@ class EmbExtractor:
         self,
         model_type="Pretrained",
         num_classes=0,
-        emb_mode=" …
+        emb_mode="cell",
         cell_emb_style="mean_pool",
         gene_emb_style="mean_pool",
         filter_data=None,
@@ -596,12 +596,6 @@ class EmbExtractor:
         filtered_input_data = pu.load_and_filter(
             self.filter_data, self.nproc, input_data_file
         )
-
-        # Check to make sure that all the labels exist in the tokenized data:
-        if self.emb_label is not None:
-            for label in self.emb_label:
-                assert label in filtered_input_data.features.keys(), f"Attribute `{label}` not present in dataset features"
-
         if cell_state is not None:
             filtered_input_data = pu.filter_by_dict(
                 filtered_input_data, cell_state, self.nproc
@@ -725,12 +719,6 @@ class EmbExtractor:
             )
             raise
 
-        if self.emb_label is not None:
-            logger.error(
-                "For extracting state embs, emb_label should be None since labels are based on state embs dict keys."
-            )
-            raise
-
         state_embs_dict = dict()
         state_key = cell_states_to_model["state_key"]
         for k, v in cell_states_to_model.items():
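Callers that relied on the dropped emb_label check may want to validate labels themselves before extraction; a minimal standalone equivalent of the removed assertion (function name is illustrative):

from datasets import load_from_disk

def check_emb_labels(input_data_file, emb_label):
    # replicates the removed guard: every requested label column
    # must exist in the tokenized dataset's features
    data = load_from_disk(input_data_file)
    missing = [label for label in (emb_label or []) if label not in data.features]
    if missing:
        raise KeyError(f"Attributes {missing} not present in dataset features")

check_emb_labels("/path/to/input_data.dataset", ["disease", "cell_type"])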
geneformer/gene_name_id_dict_gc95M.pkl
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256: …
-size …
+oid sha256:8b0fd0521406ed18b2e341ef0acb5f53aa1a62457a07ca5840e1c142f46dd326
+size 2038812
geneformer/in_silico_perturber.py
@@ -40,7 +40,7 @@ import pickle
 from collections import defaultdict
 
 import torch
-from datasets import Dataset
+from datasets import Dataset, disable_progress_bars
 from multiprocess import set_start_method
 from tqdm.auto import trange
 
@@ -48,9 +48,7 @@ from . import TOKEN_DICTIONARY_FILE
 from . import perturber_utils as pu
 from .emb_extractor import get_embs
 
-
-datasets.logging.disable_progress_bar()
-
+disable_progress_bars()
 
 logger = logging.getLogger(__name__)
 
@@ -86,7 +84,7 @@ class InSilicoPerturber:
         anchor_gene=None,
         model_type="Pretrained",
         num_classes=0,
-        emb_mode=" …
+        emb_mode="cell",
         cell_emb_style="mean_pool",
         filter_data=None,
         cell_states_to_model=None,
@@ -796,8 +794,6 @@ class InSilicoPerturber:
             return example
 
         total_batch_length = len(filtered_input_data)
-
-
         if self.cell_states_to_model is None:
             cos_sims_dict = defaultdict(list)
         else:
@@ -882,7 +878,7 @@ class InSilicoPerturber:
             )
 
         ##### CLS and Gene Embedding Mode #####
-        elif self.emb_mode == "cls_and_gene":
+        elif self.emb_mode == "cls_and_gene":
             full_original_emb = get_embs(
                 model,
                 minibatch,
@@ -895,7 +891,6 @@ class InSilicoPerturber:
                 silent=True,
             )
             indices_to_perturb = perturbation_batch["perturb_index"]
-
             # remove indices that were perturbed
             original_emb = pu.remove_perturbed_indices_set(
                 full_original_emb,
@@ -904,7 +899,6 @@ class InSilicoPerturber:
                 self.tokens_to_perturb,
                 minibatch["length"],
             )
-
             full_perturbation_emb = get_embs(
                 model,
                 perturbation_batch,
@@ -916,7 +910,7 @@ class InSilicoPerturber:
                 summary_stat=None,
                 silent=True,
             )
-
+
             # remove special tokens and padding
             original_emb = original_emb[:, 1:-1, :]
             if self.perturb_type == "overexpress":
@@ -927,25 +921,9 @@ class InSilicoPerturber:
             perturbation_emb = full_perturbation_emb[
                 :, 1 : max(perturbation_batch["length"]) - 1, :
             ]
-
-            n_perturbation_genes = perturbation_emb.size()[1]
 
-
-            if self.perturb_type == "overexpress":
-                def calc_perturbation_length(ids):
-                    if ids == [-100]:
-                        return 0
-                    else:
-                        return len(ids)
-
-                max_tensor_size = max([length - calc_perturbation_length(ids) - 2 for length, ids in zip(minibatch["length"], indices_to_perturb)])
+            n_perturbation_genes = perturbation_emb.size()[1]
 
-                max_n_overflow = max(minibatch["n_overflow"])
-                if max_n_overflow > 0 and perturbation_emb.size()[1] < original_emb.size()[1]:
-                    original_emb = original_emb[:, 0 : perturbation_emb.size()[1], :]
-                elif perturbation_emb.size()[1] < original_emb.size()[1]:
-                    original_emb = original_emb[:, 0:max_tensor_size, :]
-
             gene_cos_sims = pu.quant_cos_sims(
                 perturbation_emb,
                 original_emb,
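The import swap above follows an API move in HuggingFace datasets, where the progress-bar toggle migrated to top-level enable/disable helpers. If this module must also run against older datasets releases, a compatibility shim along these lines could work (a sketch; the exact versions where each helper exists are not pinned here):

try:
    # newer datasets releases expose a top-level plural helper
    from datasets import disable_progress_bars
    disable_progress_bars()
except ImportError:
    # older releases kept a singular helper under datasets.utils.logging
    from datasets.utils.logging import disable_progress_bar
    disable_progress_bar()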
geneformer/in_silico_perturber_stats.py
@@ -640,16 +640,10 @@ def isp_stats_mixture_model(cos_sims_df, dict_list, combos, anchor_token):
     cos_sims_full_df = pd.concat([cos_sims_full_df, cos_sims_df_i])
 
     # quantify number of detections of each gene
-    …
-        ]
-    else:
-        cos_sims_full_df["N_Detections"] = [
-            n_detections(i, dict_list, "gene", anchor_token)
-            for i in cos_sims_full_df["Gene"]
-        ]
+    cos_sims_full_df["N_Detections"] = [
+        n_detections(i, dict_list, "gene", anchor_token)
+        for i in cos_sims_full_df["Gene"]
+    ]
 
     if combos == 0:
         cos_sims_full_df = cos_sims_full_df.sort_values(
geneformer/mtl/data.py
@@ -1,162 +1,150 @@
 import os
+
 from .collators import DataCollatorForMultitaskCellClassification
 from .imports import *
 
-def validate_columns(dataset, required_columns, dataset_type):
-    """Ensures required columns are present in the dataset."""
-    missing_columns = [col for col in required_columns if col not in dataset.column_names]
-    if missing_columns:
-        raise KeyError(
-            f"Missing columns in {dataset_type} dataset: {missing_columns}. "
-            f"Available columns: {dataset.column_names}"
-        )
-
-
-def create_label_mappings(dataset, task_to_column):
-    """Creates label mappings for the dataset."""
-    task_label_mappings = {}
-    num_labels_list = []
-    for task, column in task_to_column.items():
-        unique_values = sorted(set(dataset[column]))
-        mapping = {label: idx for idx, label in enumerate(unique_values)}
-        task_label_mappings[task] = mapping
-        num_labels_list.append(len(unique_values))
-    return task_label_mappings, num_labels_list
-
-
-def save_label_mappings(mappings, path):
-    """Saves label mappings to a pickle file."""
-    with open(path, "wb") as f:
-        pickle.dump(mappings, f)
-
-
-def load_label_mappings(path):
-    """Loads label mappings from a pickle file."""
-    with open(path, "rb") as f:
-        return pickle.load(f)
-
-
-def transform_dataset(dataset, task_to_column, task_label_mappings, config, is_test):
-    """Transforms the dataset to the required format."""
-    transformed_dataset = []
-    cell_id_mapping = {}
-
-    for idx, record in enumerate(dataset):
-        transformed_record = {
-            "input_ids": torch.tensor(record["input_ids"], dtype=torch.long),
-            "cell_id": idx,  # Index-based cell ID
-        }
-
-        if not is_test:
-            label_dict = {
-                task: task_label_mappings[task][record[column]]
-                for task, column in task_to_column.items()
-            }
-        else:
-            label_dict = {task: -1 for task in config["task_names"]}
-
-        transformed_record["label"] = label_dict
-        transformed_dataset.append(transformed_record)
-        cell_id_mapping[idx] = record.get("unique_cell_id", idx)
-
-    return transformed_dataset, cell_id_mapping
-
 
 def load_and_preprocess_data(dataset_path, config, is_test=False, dataset_type=""):
-    """Main function to load and preprocess data."""
     try:
         dataset = load_from_disk(dataset_path)
 
-        # Setup task and column mappings
         task_names = [f"task{i+1}" for i in range(len(config["task_columns"]))]
         task_to_column = dict(zip(task_names, config["task_columns"]))
         config["task_names"] = task_names
 
-        label_mappings_path = os.path.join(
-            config["results_dir"],
-            f"task_label_mappings{'_val' if dataset_type == 'validation' else ''}.pkl"
-        )
-
         if not is_test:
-            …
+            available_columns = set(dataset.column_names)
+            for column in task_to_column.values():
+                if column not in available_columns:
+                    raise KeyError(
+                        f"Column {column} not found in the dataset. Available columns: {list(available_columns)}"
+                    )
+
+        label_mappings = {}
+        task_label_mappings = {}
+        cell_id_mapping = {}
+        num_labels_list = []
+
+        # Load or create task label mappings
+        if not is_test:
+            for task, column in task_to_column.items():
+                unique_values = sorted(set(dataset[column]))  # Ensure consistency
+                label_mappings[column] = {
+                    label: idx for idx, label in enumerate(unique_values)
+                }
+                task_label_mappings[task] = label_mappings[column]
+                num_labels_list.append(len(unique_values))
+
+            # Print the mappings for each task with dataset type prefix
+            for task, mapping in task_label_mappings.items():
+                print(
+                    f"{dataset_type.capitalize()} mapping for {task}: {mapping}"
+                )  # sanity check, for train/validation splits
+
+            # Save the task label mappings as a pickle file
+            with open(f"{config['results_dir']}/task_label_mappings.pkl", "wb") as f:
+                pickle.dump(task_label_mappings, f)
         else:
-            # Load …
-            task_label_mappings …
-
-            …
-                dataset, task_to_column, task_label_mappings, config, is_test
-            )
+            # Load task label mappings from pickle file for test data
+            with open(f"{config['results_dir']}/task_label_mappings.pkl", "rb") as f:
+                task_label_mappings = pickle.load(f)
+
+            # Infer num_labels_list from task_label_mappings
+            for task, mapping in task_label_mappings.items():
+                num_labels_list.append(len(mapping))
+
+        # Store unique cell IDs in a separate dictionary
+        for idx, record in enumerate(dataset):
+            cell_id = record.get("unique_cell_id", idx)
+            cell_id_mapping[idx] = cell_id
+
+        # Transform records to the desired format
+        transformed_dataset = []
+        for idx, record in enumerate(dataset):
+            transformed_record = {}
+            transformed_record["input_ids"] = torch.tensor(
+                record["input_ids"], dtype=torch.long
+            )
 
-            …
+            # Use index-based cell ID for internal tracking
+            transformed_record["cell_id"] = idx
 
+            if not is_test:
+                # Prepare labels
+                label_dict = {}
+                for task, column in task_to_column.items():
+                    label_value = record[column]
+                    label_index = task_label_mappings[task][label_value]
+                    label_dict[task] = label_index
+                transformed_record["label"] = label_dict
+            else:
+                # Create dummy labels for test data
+                label_dict = {task: -1 for task in config["task_names"]}
+                transformed_record["label"] = label_dict
+
+            transformed_dataset.append(transformed_record)
 
+        return transformed_dataset, cell_id_mapping, num_labels_list
     except KeyError as e:
-        …
+        print(f"Missing configuration or dataset key: {e}")
     except Exception as e:
-        …
+        print(f"An error occurred while loading or preprocessing data: {e}")
+        return None, None, None
 
 
 def preload_and_process_data(config):
-    …
-
-def validate_label_mappings(config):
-    """Ensures train and validation label mappings are consistent."""
-    train_mappings_path = os.path.join(config["results_dir"], "task_label_mappings.pkl")
-    val_mappings_path = os.path.join(config["results_dir"], "task_label_mappings_val.pkl")
-    train_mappings = load_label_mappings(train_mappings_path)
-    val_mappings = load_label_mappings(val_mappings_path)
-
-    …
-            raise ValueError(
-                f"Mismatch in label mappings for task '{task_name}'.\n"
-                f"Train Mapping: {train_mappings[task_name]}\n"
-                f"Validation Mapping: {val_mappings[task_name]}"
-            )
+    # Load and preprocess data once
+    train_dataset, train_cell_id_mapping, num_labels_list = load_and_preprocess_data(
+        config["train_path"], config, dataset_type="train"
+    )
+    val_dataset, val_cell_id_mapping, _ = load_and_preprocess_data(
+        config["val_path"], config, dataset_type="validation"
+    )
+    return (
+        train_dataset,
+        train_cell_id_mapping,
+        val_dataset,
+        val_cell_id_mapping,
+        num_labels_list,
+    )
 
 
-…
-    """Creates a DataLoader with optimal settings."""
-    return DataLoader(
+def get_data_loader(preprocessed_dataset, batch_size):
+    nproc = os.cpu_count()  ### I/O operations
+
+    data_collator = DataCollatorForMultitaskCellClassification()
+
+    loader = DataLoader(
         preprocessed_dataset,
         batch_size=batch_size,
         shuffle=True,
-        collate_fn= …
-        num_workers= …
+        collate_fn=data_collator,
+        num_workers=nproc,
         pin_memory=True,
     )
+    return loader
 
 
 def preload_data(config):
-    …
-    train_loader = get_data_loader( …
-    val_loader = get_data_loader( …
+    # Preprocessing the data before the Optuna trials start
+    train_loader = get_data_loader("train", config)
+    val_loader = get_data_loader("val", config)
     return train_loader, val_loader
 
 
 def load_and_preprocess_test_data(config):
-    """ …
+    """
+    Load and preprocess test data, treating it as unlabeled.
+    """
     return load_and_preprocess_data(config["test_path"], config, is_test=True)
 
 
 def prepare_test_loader(config):
-    """ …
-    …
+    """
+    Prepare DataLoader for the test dataset.
+    """
+    test_dataset, cell_id_mapping, num_labels_list = load_and_preprocess_test_data(
+        config
+    )
    test_loader = get_data_loader(test_dataset, config["batch_size"])
    return test_loader, cell_id_mapping, num_labels_list
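For orientation, the restored load_and_preprocess_data / get_data_loader path expects a config dict roughly like the sketch below (keys inferred from the code in this diff; all values are placeholders). One caveat worth noting: preload_data as written still calls get_data_loader("train", config), which does not match get_data_loader(preprocessed_dataset, batch_size), so the preload_and_process_data + get_data_loader route is the internally consistent one.

# hypothetical config, inferred from the keys this module reads
config = {
    "train_path": "/path/to/train.dataset",   # HF datasets saved via save_to_disk
    "val_path": "/path/to/val.dataset",
    "test_path": "/path/to/test.dataset",
    "task_columns": ["disease", "cell_type"], # one classification task per column
    "results_dir": "/path/to/results",        # task_label_mappings.pkl lands here
    "batch_size": 12,
}

train_ds, train_ids, val_ds, val_ids, num_labels_list = preload_and_process_data(config)
train_loader = get_data_loader(train_ds, config["batch_size"])
val_loader = get_data_loader(val_ds, config["batch_size"])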
geneformer/pretrainer.py
@@ -8,12 +8,13 @@ import math
 import pickle
 import warnings
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, Iterator, List, Optional, Union
 
 import numpy as np
 import torch
 from datasets import Dataset
 from packaging import version
+from torch.utils.data.distributed import DistributedSampler
 from torch.utils.data.sampler import RandomSampler
 from transformers import (
     BatchEncoding,
@@ -23,8 +24,11 @@ from transformers import (
 )
 from transformers.file_utils import is_datasets_available, is_sagemaker_dp_enabled
 from transformers.trainer_pt_utils import (
+    DistributedLengthGroupedSampler,
+    DistributedSamplerWithLoop,
     LengthGroupedSampler,
 )
+from transformers.training_args import ParallelMode
 from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
 from transformers.utils.generic import _is_tensorflow, _is_torch
 
@@ -603,7 +607,7 @@ class GeneformerPretrainer(Trainer):
         )
         super().__init__(*args, **kwargs)
 
-    # …
+    # modify LengthGroupedSampler to avoid dataset[length_column_name] hanging
     def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
         if not isinstance(self.train_dataset, collections.abc.Sized):
             return None
@@ -626,15 +630,181 @@ class GeneformerPretrainer(Trainer):
                 if self.tokenizer is not None
                 else None
             )
-            return LengthGroupedSampler(
+            if self.args.world_size <= 1:
+                return LengthGroupedSampler(
                     dataset=self.train_dataset,
                     batch_size=self.args.train_batch_size,
                     lengths=lengths,
                     model_input_name=model_input_name,
                     generator=generator,
-            )
+                )
+            else:
+                return CustomDistributedLengthGroupedSampler(
+                    dataset=self.train_dataset,
+                    batch_size=self.args.train_batch_size,
+                    num_replicas=self.args.world_size,
+                    rank=self.args.process_index,
+                    lengths=lengths,
+                    model_input_name=model_input_name,
+                    seed=self.args.seed,
+                )
+
         else:
-            …
+            if self.args.world_size <= 1:
+                if _is_torch_generator_available:
+                    return RandomSampler(self.train_dataset, generator=generator)
+                return RandomSampler(self.train_dataset)
+            elif (
+                self.args.parallel_mode
+                in [ParallelMode.TPU, ParallelMode.SAGEMAKER_MODEL_PARALLEL]
+                and not self.args.dataloader_drop_last
+            ):
+                # Use a loop for TPUs when drop_last is False to have all batches have the same size.
+                return DistributedSamplerWithLoop(
+                    self.train_dataset,
+                    batch_size=self.args.per_device_train_batch_size,
+                    num_replicas=self.args.world_size,
+                    rank=self.args.process_index,
+                    seed=self.args.seed,
+                )
+            else:
+                return DistributedSampler(
+                    self.train_dataset,
+                    num_replicas=self.args.world_size,
+                    rank=self.args.process_index,
+                    seed=self.args.seed,
+                )
+
+
+class CustomDistributedLengthGroupedSampler(DistributedLengthGroupedSampler):
+    r"""
+    Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same
+    length while keeping a bit of randomness.
+    """
+
+    # Copied and adapted from PyTorch DistributedSampler.
+    def __init__(
+        self,
+        dataset: Dataset,
+        batch_size: int,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        seed: int = 0,
+        drop_last: bool = False,
+        lengths: Optional[List[int]] = None,
+        model_input_name: Optional[str] = None,
+    ):
+        if num_replicas is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            num_replicas = dist.get_world_size()
+        if rank is None:
+            if not dist.is_available():
+                raise RuntimeError("Requires distributed package to be available")
+            rank = dist.get_rank()
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.num_replicas = num_replicas
+        self.rank = rank
+        self.epoch = 0
+        self.drop_last = drop_last
+        # If the dataset length is evenly divisible by # of replicas, then there
+        # is no need to drop any data, since the dataset will be split equally.
+        if self.drop_last and len(self.dataset) % self.num_replicas != 0:
+            # Split to nearest available length that is evenly divisible.
+            # This is to ensure each rank receives the same amount of data when
+            # using this Sampler.
+            self.num_samples = math.ceil(
+                (len(self.dataset) - self.num_replicas) / self.num_replicas
+            )
+        else:
+            self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
+        self.total_size = self.num_samples * self.num_replicas
+        self.seed = seed
+        self.model_input_name = (
+            model_input_name if model_input_name is not None else "input_ids"
+        )
+
+        if lengths is None:
+            print("Lengths is none - calculating lengths.")
+            if (
+                not (
+                    isinstance(dataset[0], dict)
+                    or isinstance(dataset[0], BatchEncoding)
+                )
+                or self.model_input_name not in dataset[0]
+            ):
+                raise ValueError(
+                    "Can only automatically infer lengths for datasets whose items are dictionaries with an "
+                    f"'{self.model_input_name}' key."
+                )
+            lengths = [len(feature[self.model_input_name]) for feature in dataset]
+        self.lengths = lengths
+
+    def __iter__(self) -> Iterator:
+        # Deterministically shuffle based on epoch and seed
+        g = torch.Generator()
+        g.manual_seed(self.seed + self.epoch)
+
+        indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g)
+
+        if not self.drop_last:
+            # add extra samples to make it evenly divisible
+            indices += indices[: (self.total_size - len(indices))]
+        else:
+            # remove tail of data to make it evenly divisible.
+            indices = indices[: self.total_size]
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.rank : self.total_size : self.num_replicas]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+
+def get_length_grouped_indices(
+    lengths, batch_size, mega_batch_mult=None, generator=None
+):
+    """
+    Return a list of indices so that each slice of :obj:`batch_size` consecutive indices correspond to elements of
+    similar lengths. To do this, the indices are:
+
+    - randomly permuted
+    - grouped in mega-batches of size :obj:`mega_batch_mult * batch_size`
+    - sorted by length in each mega-batch
+
+    The result is the concatenation of all mega-batches, with the batch of :obj:`batch_size` containing the element of
+    maximum length placed first, so that an OOM happens sooner rather than later.
+    """
+    # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller.
+    if mega_batch_mult is None:
+        # mega_batch_mult = min(len(lengths) // (batch_size * 4), 50)
+        mega_batch_mult = min(len(lengths) // (batch_size * 4), 1000)
+        # Just in case, for tiny datasets
+        if mega_batch_mult == 0:
+            mega_batch_mult = 1
+
+    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
+    indices = torch.randperm(len(lengths), generator=generator)
+    megabatch_size = mega_batch_mult * batch_size
+    megabatches = [
+        indices[i : i + megabatch_size].tolist()
+        for i in range(0, len(lengths), megabatch_size)
+    ]
+    megabatches = [
+        list(sorted(megabatch, key=lambda i: lengths[i], reverse=True))
+        for megabatch in megabatches
+    ]
+
+    # The rest is to get the biggest batch first.
+    # Since each megabatch is sorted by descending length, the longest element is the first
+    megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches]
+    max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item()
+    # Switch to put the longest element in first position
+    megabatches[0][0], megabatches[max_idx][0] = (
+        megabatches[max_idx][0],
+        megabatches[0][0],
+    )
+
+    return [item for sublist in megabatches for item in sublist]
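The module-level helper reintroduced at the bottom of this file can be exercised on its own; a small demo of the grouping behavior (toy lengths; assumes the function is importable from geneformer.pretrainer):

import torch
from geneformer.pretrainer import get_length_grouped_indices

lengths = [5, 300, 12, 290, 7, 310, 9, 295]  # toy sequence lengths
g = torch.Generator()
g.manual_seed(0)  # deterministic, mirroring the per-epoch seeding in __iter__
indices = get_length_grouped_indices(lengths, batch_size=2, generator=g)

# each consecutive slice of batch_size indices has similar lengths, and the
# megabatch holding the globally longest sequence is moved to the front so an
# OOM surfaces on the first step rather than mid-training
print([lengths[i] for i in indices])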
geneformer/tokenizer.py
@@ -88,7 +88,6 @@ def sum_ensembl_ids(
     collapse_gene_ids,
     gene_mapping_dict,
     gene_token_dict,
-    custom_attr_name_dict,
     file_format="loom",
     chunk_size=512,
 ):
@@ -104,45 +103,33 @@ def sum_ensembl_ids(
         assert (
             "ensembl_id_collapsed" not in data.ra.keys()
         ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
-
-        assert (
-            "n_counts" in data.ca.keys()
-        ), "'n_counts' column missing from data.ca.keys()"
-
-        if custom_attr_name_dict is not None:
-            for label in custom_attr_name_dict:
-                assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
-
-        # Get the ensembl ids that exist in data
-        ensembl_ids = data.ra.ensembl_id
         # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
         # Comparing to gene_token_dict here, would not perform any mapping steps
-        …
+        gene_ids_in_dict = [
+            gene for gene in data.ra.ensembl_id if gene in gene_token_dict.keys()
+        ]
+        if collapse_gene_ids is False:
+            if len(gene_ids_in_dict) == len(set(gene_ids_in_dict)):
                 return data_directory
             else:
                 raise ValueError("Error: data Ensembl IDs non-unique.")
-        …
-            data.ra["ensembl_id_collapsed"] = …
+
+        gene_ids_collapsed = [
+            gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id
+        ]
+        gene_ids_collapsed_in_dict = [
+            gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
+        ]
+
+        if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
+            data.ra["ensembl_id_collapsed"] = gene_ids_collapsed
             return data_directory
-        # Genes need to be collapsed
         else:
             dedup_filename = data_directory.with_name(
                 data_directory.stem + "__dedup.loom"
             )
-            …
-            data.ra["ensembl_id_collapsed"] = mapped_vals
+            data.ra["ensembl_id_collapsed"] = gene_ids_collapsed
             dup_genes = [
                 idx
                 for idx, count in Counter(data.ra["ensembl_id_collapsed"]).items()
@@ -216,41 +203,33 @@ def sum_ensembl_ids(
         assert (
             "ensembl_id_collapsed" not in data.var.columns
         ), "'ensembl_id_collapsed' column already exists in data.var"
-        assert (
-            "n_counts" in data.obs.columns
-        ), "'n_counts' column missing from data.obs"
-
-        if custom_attr_name_dict is not None:
-            for label in custom_attr_name_dict:
-                assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
 
-        # Get the ensembl ids that exist in data
-        ensembl_ids = data.var.ensembl_id
         # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
         # Comparing to gene_token_dict here, would not perform any mapping steps
-        …
+        gene_ids_in_dict = [
+            gene for gene in data.var.ensembl_id if gene in gene_token_dict.keys()
+        ]
+        if collapse_gene_ids is False:
+            if len(gene_ids_in_dict) == len(set(gene_ids_in_dict)):
+                return data
             else:
                 raise ValueError("Error: data Ensembl IDs non-unique.")
 
-        # …
-        …
+        # Check for when if collapse_gene_ids is True
+        gene_ids_collapsed = [
+            gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id
+        ]
+        gene_ids_collapsed_in_dict = [
+            gene for gene in gene_ids_collapsed if gene in gene_token_dict.keys()
+        ]
+        if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
+            data.var["ensembl_id_collapsed"] = data.var.ensembl_id.map(gene_mapping_dict)
             return data
-
+
         else:
-            data.var["ensembl_id_collapsed"] = …
-            data.var_names = …
+            data.var["ensembl_id_collapsed"] = gene_ids_collapsed
+            data.var_names = gene_ids_collapsed
             data = data[:, ~data.var.index.isna()]
             dup_genes = [
                 idx for idx, count in Counter(data.var_names).items() if count > 1
@@ -476,7 +455,6 @@ class TranscriptomeTokenizer:
             self.collapse_gene_ids,
             self.gene_mapping_dict,
             self.gene_token_dict,
-            self.custom_attr_name_dict,
             file_format="h5ad",
             chunk_size=self.chunk_size,
         )
@@ -553,7 +531,6 @@ class TranscriptomeTokenizer:
             self.collapse_gene_ids,
             self.gene_mapping_dict,
            self.gene_token_dict,
-            self.custom_attr_name_dict,
            file_format="loom",
            chunk_size=self.chunk_size,
        )
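The rewritten check in sum_ensembl_ids boils down to: map every Ensembl ID through gene_mapping_dict and only write the collapsed column directly when collapsing merges no two distinct token-dictionary genes; otherwise duplicate rows must first be summed. The same logic in isolation, with toy dictionaries:

from collections import Counter

gene_token_dict = {"ENSG000A": 1, "ENSG000B": 2, "ENSG000C": 3}  # toy vocabulary
gene_mapping_dict = {"ENSG000A": "ENSG000A", "ENSG000B": "ENSG000B",
                     "ENSG000C": "ENSG000A"}  # C collapses into A
ensembl_ids = ["ENSG000A", "ENSG000B", "ENSG000C"]

gene_ids_in_dict = [g for g in ensembl_ids if g in gene_token_dict]
gene_ids_collapsed = [gene_mapping_dict.get(g.upper()) for g in ensembl_ids]
gene_ids_collapsed_in_dict = [g for g in gene_ids_collapsed if g in gene_token_dict]

if len(set(gene_ids_in_dict)) == len(set(gene_ids_collapsed_in_dict)):
    print("safe to collapse:", gene_ids_collapsed)
else:
    # two distinct known genes merged; their count rows must be summed first
    dup = [g for g, c in Counter(gene_ids_collapsed).items() if c > 1]
    print("needs dedup/summing for:", dup)  # -> ['ENSG000A']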
requirements.txt
@@ -22,4 +22,4 @@ tdigest>=0.5.2
 tensorboard>=2.15
 torch>=2.0.1
 tqdm>=4.65
-transformers>=4.…
+transformers>=4.28