In [3]:
import os
import numpy as np
import pandas as pd
import deepsort
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
import pickle

In [4]:
def compute_metrics(labels, preds):
    """Score predictions against ground-truth labels.

    Args:
        labels: sequence of true class labels.
        preds: sequence of predicted class labels, aligned with ``labels``.

    Returns:
        dict with two entries: ``'accuracy'`` (from sklearn's
        ``accuracy_score``) and ``'macro_f1'`` (sklearn's ``f1_score`` with
        ``average='macro'``).
    """
    return dict(
        accuracy=accuracy_score(labels, preds),
        macro_f1=f1_score(labels, preds, average='macro'),
    )

In [5]:
rootdir = "/path/to/data/"

# Collect the names of the sub-directories of rootdir, skipping any entry
# whose name contains "results".
dir_list = [
    entry
    for entry in os.listdir(rootdir)
    if "results" not in entry and os.path.isdir(os.path.join(rootdir, entry))
]
dir_list

['pancreas',
 'liver',
 'blood',
 'lung',
 'spleen',
 'placenta',
 'colorectum',
 'kidney',
 'brain']

In [None]:
output_dir = "results_EDefault_filtered"
# n_epochs is only used to tag the output file names below; it is not passed
# to the model (scDeepsort default epochs = 300).
n_epochs = "Default"

# Per-tissue metrics, keyed by directory (tissue) name.
results_dict = dict()
for dir_name in tqdm(dir_list):
    print(f"TRAINING: {dir_name}")

    # Build every path with os.path.join so the cell also works when rootdir
    # has no trailing slash (consistent with the directory-discovery cell).
    subrootdir = os.path.join(rootdir, dir_name)
    model_dir = os.path.join(subrootdir, output_dir)
    train_files = [
        (
            os.path.join(subrootdir, f"{dir_name}_filtered_data_train.csv"),
            os.path.join(subrootdir, f"{dir_name}_filtered_celltype_train.csv"),
        )
    ]
    test_file = os.path.join(subrootdir, f"{dir_name}_filtered_data_test.csv")
    label_file = os.path.join(subrootdir, f"{dir_name}_filtered_celltype_test.csv")

    # define the model
    model = deepsort.DeepSortClassifier(
        species='human',
        tissue=dir_name,
        gpu_id=0,
        random_seed=1,
        validation_fraction=0,  # use all training data (already held out 20% in test data file)
    )

    # fit the model
    model.fit(train_files, save_path=model_dir)

    # use the saved model to predict cell types in test data
    model.predict(
        input_file=test_file,
        model_path=model_dir,
        save_path=model_dir,
        unsure_rate=0,
        file_type='csv',
    )

    labels_df = pd.read_csv(label_file)
    preds_df = pd.read_csv(
        os.path.join(model_dir, f"human_{dir_name}_{dir_name}_filtered_data_test.csv")
    )

    # Predictions must be row-aligned with the labels before scoring.
    assert list(labels_df["Cell"]) == list(preds_df["index"]), (
        f"{dir_name}: cell ids in predictions do not match the label file"
    )
    labels = list(labels_df["Cell_type"])

    # Always score the cell_type column so `results` is defined on every
    # iteration (the previous nested if/else left it unset - or stale from the
    # previous tissue - for some values of the first cell_subtype entry).
    results = compute_metrics(labels, list(preds_df["cell_type"]))

    # As before, the first row decides whether the subtype column is populated.
    # pd.isna handles both float NaN and string entries; .iloc is positional,
    # so it does not depend on the frame's index labels.
    first_subtype = preds_df["cell_subtype"].iloc[0]
    if not pd.isna(first_subtype):
        subtype_results = compute_metrics(labels, list(preds_df["cell_subtype"]))
        # Keep whichever column gives the strictly higher accuracy.
        # NOTE(review): this choice is made using the test labels - confirm
        # that is the intended benchmarking protocol.
        if subtype_results["accuracy"] > results["accuracy"]:
            results = subtype_results

    print(f"{dir_name}: {results}")
    results_dict[dir_name] = results

    # Persist per-tissue metrics right away so a later failure does not lose them.
    per_tissue_path = os.path.join(
        subrootdir, f"deepsort_E{n_epochs}_filtered_pred_{dir_name}.pickle"
    )
    with open(per_tissue_path, "wb") as output_file:
        pickle.dump(results, output_file)

# save results
with open(os.path.join(rootdir, f"deepsort_E{n_epochs}_filtered_pred_dict.pickle"), "wb") as output_file:
    pickle.dump(results_dict, output_file)
 