Christina Theodoris committed on
Commit
e3330a6
1 Parent(s): d1931b1

edit docstring format to highlight options

Browse files
Files changed (1) hide show
  1. geneformer/tokenizer.py +10 -10
geneformer/tokenizer.py CHANGED
@@ -94,14 +94,14 @@ class TranscriptomeTokenizer:
94
  | Keys are the names of the attributes in the loom file.
95
  | Values are the names of the attributes in the dataset.
96
  nproc : int
97
- Number of processes to use for dataset mapping.
98
  chunk_size: int = 512
99
- Chunk size for anndata tokenizer.
100
  gene_median_file : Path
101
- Path to pickle file containing dictionary of non-zero median
102
- gene expression values across Genecorpus-30M.
103
  token_dictionary_file : Path
104
- Path to pickle file containing token dictionary (Ensembl IDs:token).
105
  """
106
  # dictionary of custom attributes {output dataset column name: input .loom column name}
107
  self.custom_attr_name_dict = custom_attr_name_dict
@@ -141,15 +141,15 @@ class TranscriptomeTokenizer:
141
  **Parameters:**
142
 
143
  data_directory : Path
144
- Path to directory containing loom files or anndata files
145
  output_directory : Path
146
- Path to directory where tokenized data will be saved as .dataset
147
  output_prefix : str
148
- Prefix for output .dataset
149
  file_format : str
150
- Format of input files. Can be "loom" or "h5ad".
151
  use_generator : bool
152
- Whether to use generator or dict for tokenization.
153
  """
154
  tokenized_cells, cell_metadata = self.tokenize_files(
155
  Path(data_directory), file_format
 
94
  | Keys are the names of the attributes in the loom file.
95
  | Values are the names of the attributes in the dataset.
96
  nproc : int
97
+ | Number of processes to use for dataset mapping.
98
  chunk_size: int = 512
99
+ | Chunk size for anndata tokenizer.
100
  gene_median_file : Path
101
+ | Path to pickle file containing dictionary of non-zero median
102
+ | gene expression values across Genecorpus-30M.
103
  token_dictionary_file : Path
104
+ | Path to pickle file containing token dictionary (Ensembl IDs:token).
105
  """
106
  # dictionary of custom attributes {output dataset column name: input .loom column name}
107
  self.custom_attr_name_dict = custom_attr_name_dict
 
141
  **Parameters:**
142
 
143
  data_directory : Path
144
+ | Path to directory containing loom files or anndata files
145
  output_directory : Path
146
+ | Path to directory where tokenized data will be saved as .dataset
147
  output_prefix : str
148
+ | Prefix for output .dataset
149
  file_format : str
150
+ | Format of input files. Can be "loom" or "h5ad".
151
  use_generator : bool
152
+ | Whether to use generator or dict for tokenization.
153
  """
154
  tokenized_cells, cell_metadata = self.tokenize_files(
155
  Path(data_directory), file_format