ctheodoris committed
Commit da8cf3d
1 Parent(s): cb1b0d5

update tokenizer to defaults for 95M models for special token and input size

Files changed (1):
  geneformer/tokenizer.py +17 -4
geneformer/tokenizer.py CHANGED

@@ -18,6 +18,9 @@ Geneformer tokenizer.
 | No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. For example, if the original .loom dataset has column attributes "cell_type" and "organ_major" and one would like to retain these attributes as labels in the tokenized dataset with the new names "cell_type" and "organ", respectively, the following custom attribute dictionary should be provided: {"cell_type": "cell_type", "organ_major": "organ"}.
 | Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.
 | If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.
+| OF NOTE: Take care that the correct token dictionary and gene median file is used for the correct model.
+| OF NOTE: For 95M model series, special_token should be True and model_input_size should be 4096.
+| OF NOTE: For 30M model series, special_token should be False and model_input_size should be 2048.
 """
 
 from __future__ import annotations
@@ -255,8 +258,8 @@ class TranscriptomeTokenizer:
         custom_attr_name_dict=None,
         nproc=1,
         chunk_size=512,
-        model_input_size=2048,
-        special_token=False,
+        model_input_size=4096,
+        special_token=True,
         collapse_gene_ids=True,
         gene_median_file=GENE_MEDIAN_FILE,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
@@ -273,10 +276,12 @@ class TranscriptomeTokenizer:
             | Number of processes to use for dataset mapping.
         chunk_size : int = 512
             | Chunk size for anndata tokenizer.
-        model_input_size : int = 2048
+        model_input_size : int = 4096
             | Max input size of model to truncate input to.
-        special_token : bool = False
+            | For the 30M model series, should be 2048. For the 95M model series, should be 4096.
+        special_token : bool = True
             | Adds CLS token before and EOS token after rank value encoding.
+            | For the 30M model series, should be False. For the 95M model series, should be True.
         collapse_gene_ids : bool = True
             | Whether to collapse gene IDs based on gene mapping dictionary.
         gene_median_file : Path
@@ -321,6 +326,14 @@
             )
             raise
 
+        if not self.special_token:
+            if ("<cls>" in self.gene_token_dict.keys()) and (
+                "<eos>" in self.gene_token_dict.keys()
+            ):
+                logger.warning(
+                    "<cls> and <eos> are in gene_token_dict but special_token = False. Please note that for 95M model series, special_token should be True."
+                )
+
         # if collapsing duplicate gene IDs
         self.collapse_gene_ids = collapse_gene_ids
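Taken together, the new defaults mean a bare `TranscriptomeTokenizer()` is now configured for the 95M model series, and 30M-series users must opt back in to the old settings explicitly. A minimal usage sketch follows; the constructor parameters come from this commit, but the `tokenize_data` call, the file paths, and the 30M dictionary-file names are illustrative assumptions, not part of the diff:

```python
from geneformer import TranscriptomeTokenizer

# 95M model series: the new defaults (model_input_size=4096,
# special_token=True) apply, so no overrides are needed.
tk_95m = TranscriptomeTokenizer(
    custom_attr_name_dict={"cell_type": "cell_type", "organ_major": "organ"},
    nproc=4,
)

# 30M model series: restore the pre-commit settings explicitly and,
# per the new docstring note, point the tokenizer at the matching 30M
# gene median file and token dictionary (file names here are hypothetical).
tk_30m = TranscriptomeTokenizer(
    custom_attr_name_dict={"cell_type": "cell_type", "organ_major": "organ"},
    nproc=4,
    model_input_size=2048,
    special_token=False,
    gene_median_file="gene_median_dictionary_30M.pkl",
    token_dictionary_file="token_dictionary_30M.pkl",
)

# Illustrative tokenization call; directory names and prefix are placeholders.
tk_95m.tokenize_data(
    data_directory="data/loom_dir",
    output_directory="data/tokenized",
    output_prefix="my_dataset",
    file_format="loom",
)
```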
 
 
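The new guard at the end of `__init__` only logs a warning; it does not raise. A standalone sketch of the same consistency check, assuming the token dictionary is a pickled `dict` mapping tokens to IDs as elsewhere in the repository (the function name and path handling are mine, not the library's):

```python
import logging
import pickle

logger = logging.getLogger(__name__)


def warn_on_special_token_mismatch(token_dictionary_file: str, special_token: bool) -> bool:
    """Mirror the commit's check: warn when a dictionary containing
    <cls>/<eos> (95M-style) is paired with special_token=False."""
    with open(token_dictionary_file, "rb") as f:
        gene_token_dict = pickle.load(f)
    has_cls_eos = ("<cls>" in gene_token_dict) and ("<eos>" in gene_token_dict)
    if has_cls_eos and not special_token:
        logger.warning(
            "<cls> and <eos> are in gene_token_dict but special_token = False. "
            "Please note that for 95M model series, special_token should be True."
        )
    return has_cls_eos
```

Note that the check is one-directional: it catches a 95M-style dictionary used with `special_token=False`, but this hunk does not flag the converse case of `special_token=True` with a dictionary that lacks `<cls>`/`<eos>`.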