issue with single gene perturbation

#358
by cstrlln - opened

I'm getting this error when trying to run in silico perturbation with a single gene:

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Here is my code:

# Reproduction script: extract state embeddings, overexpress a single gene
# in silico, then compute "goal_state_shift" statistics on the results.
# NOTE(review): `output_dir`, `output_prefix`, and `genes` are used below but
# are not defined in this snippet — presumably set earlier in the notebook;
# confirm.

# first obtain start, goal, and alt embedding positions
# this function was changed to be separate from perturb_data
# to avoid repeating calculations when parallelizing perturb_data
cell_states_to_model={"state_key": "cell_type", 
                      "start_state": "Mid", 
                      "goal_state": "Late_3", 
                      "alt_states": ["Late_1","Late_2","Late_4"]}

# NOTE(review): filter_data_dict is only referenced by the commented-out
# EmbExtractor call below; the active EmbExtractor call does not pass it.
filter_data_dict={"cell_type":["Mid","Late_3","Late_1","Late_2","Late_4"]}

# Alternative EmbExtractor configuration (disabled):
# embex = EmbExtractor(model_type="CellClassifier",
#                      num_classes=3,
#                      filter_data=filter_data_dict,
#                      max_ncells=1000,
#                      emb_layer=0,
#                      summary_stat="exact_mean",
#                      forward_batch_size=32,
#                      nproc=16)

# Active configuration: no filter_data argument, num_classes=10.
embex = EmbExtractor(model_type="CellClassifier",
                     num_classes=10,
                     max_ncells=1000,
                     emb_layer=0,
                     summary_stat="exact_mean",
                     forward_batch_size=10,
                     nproc=8)

# Result is passed to InSilicoPerturber below as state_embs_dict.
state_embs_dict = embex.get_state_embs(cell_states_to_model,
                                       "classifier/240625155408/240625_geneformer_cellClassifier_asc_classifier_test/ksplit1",
                                       "data_for_geneformer/asc_organs.dataset",
                                       output_directory = output_dir,
                                       output_prefix = output_prefix)

# Overexpress exactly one gene (a single Ensembl ID) with emb_mode="cell".
isp = InSilicoPerturber(perturb_type="overexpress",
                        perturb_rank_shift=None,
                        genes_to_perturb= ['ENSG00000171791'],
                        combos=0,
                        anchor_gene=None,
                        model_type="CellClassifier",
                        num_classes=10,
                        emb_mode="cell",
                        cell_emb_style="mean_pool",
                        cell_states_to_model=cell_states_to_model,
                        state_embs_dict=state_embs_dict,
                        max_ncells=2000,
                        emb_layer=0,
                        forward_batch_size=4,
                        nproc=1)

# Same model directory and dataset paths as the get_state_embs call above;
# output goes to output_dir with the literal prefix "pert".
isp.perturb_data("classifier/240625155408/240625_geneformer_cellClassifier_asc_classifier_test/ksplit1",
                 "data_for_geneformer/asc_organs.dataset",
                 output_directory = output_dir,
                 output_prefix = "pert")

# NOTE(review): `genes` is not defined in this snippet, while
# genes_to_perturb above was ['ENSG00000171791'] — confirm that
# genes_perturbed here matches the genes that were actually perturbed.
ispstats = InSilicoPerturberStats(mode="goal_state_shift",
                                  genes_perturbed=genes,
                                  combos=0,
                                  anchor_gene=None,
                                  cell_states_to_model=cell_states_to_model)

# This is the call that raises the ValueError in the traceback below
# (inside get_gene_list, at gene_list.sort()). It reads from the same
# output_dir that perturb_data wrote to.
ispstats.get_stats(input_data_directory = output_dir,
                   null_dist_data_directory = None,
                   output_directory = output_dir,
                   output_prefix = "stats_bcl2_over2")

Here is the full error; it fails only in the last step (`ispstats.get_stats`):



100%
 4/4 [00:00<00:00, 437.56it/s]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[30], line 2
      1 # extracts data from intermediate files and processes stats to output in final .csv
----> 2 ispstats.get_stats(input_data_directory = output_dir,
      3                    null_dist_data_directory = None,
      4                    output_directory = output_dir,
      5                    output_prefix = "stats_bcl2_over2")

File ~/miniforge3/envs/geneformer/lib/python3.10/site-packages/geneformer/in_silico_perturber_stats.py:975, in InSilicoPerturberStats.get_stats(self, input_data_directory, null_dist_data_directory, output_directory, output_prefix, null_dict_list)
    966 else:
    967     # cos sim data for effect of gene perturbation on the embedding of each cell
    968     dict_list = read_dictionaries(
    969         input_data_directory,
    970         "cell",
   (...)
    973         self.pickle_suffix,
    974     )
--> 975     gene_list = get_gene_list(dict_list, "cell")
    977 # initiate results dataframe
    978 cos_sims_df_initial = pd.DataFrame(
    979     {
    980         "Gene": gene_list,
   (...)
    991     index=[i for i in range(len(gene_list))],
    992 )

File ~/miniforge3/envs/geneformer/lib/python3.10/site-packages/geneformer/in_silico_perturber_stats.py:153, in get_gene_list(dict_list, mode)
    151 if mode == "gene":
    152     gene_list.remove("cell_emb")
--> 153 gene_list.sort()
    154 return gene_list

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Sign up or log in to comment