# browser-backend/prepare_pickle.py
import pickle
from pathlib import Path
import numpy as np
import h5py
import faiss
import click


def getFlatIP():
    # Exact inner-product index (cosine similarity when vectors are L2-normalized).
    test_index = faiss.IndexFlatIP(768)
    return test_index


def getFlatL2():
    # Exact Euclidean (L2) distance index.
    test_index = faiss.IndexFlatL2(768)
    return test_index


def getIVFFlat(all_keys, seen_test, unseen_test, seen_val, unseen_val):
    # Inverted-file index with 128 coarse clusters over 768-d vectors.
    quantizer = faiss.IndexFlatIP(768)
    test_index = faiss.IndexIVFFlat(quantizer, 768, 128)
    # Train the coarse quantizer on vectors from all splits at once; separate
    # per-split train() calls do not accumulate training data.
    test_index.train(np.concatenate([all_keys, seen_test, unseen_test, seen_val, unseen_val]))
    return test_index


def getHNSW():
    # HNSW graph index: 16 connections per vertex; efSearch sets the search depth
    # at query time, efConstruction the search depth while building the graph.
    test_index = faiss.IndexHNSWFlat(768, 16)
    test_index.hnsw.efSearch = 32
    test_index.hnsw.efConstruction = 64
    return test_index


def getLSH():
    # Locality-sensitive hashing index with 768 * 2 hash bits.
    test_index = faiss.IndexLSH(768, 768 * 2)
    return test_index
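

# Illustrative sketch, not called anywhere in this script: how one of the index
# builders above might be used. Assumes `embeddings` is a float32 array of shape
# (N, 768); the function name is hypothetical.
def _example_build_and_query(embeddings):
    index = getFlatIP()
    faiss.normalize_L2(embeddings)  # in-place; inner product on unit vectors == cosine similarity
    index.add(embeddings)  # add database vectors to the index
    # retrieve the 5 nearest neighbours of the first vector: scores and row indices
    scores, indices = index.search(embeddings[:1], 5)
    return scores, indices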


def getIdToEmbedding(allid, stid, utid, svalid, uvalid, all_keys, seen_test, unseen_test, seen_val, unseen_val):
    # Map each sample id to its (1, 768) embedding; ids and embeddings within a
    # split are aligned by position.
    id_to_emb_dict = dict()
    id_lists = [allid, stid, utid, svalid, uvalid]
    emb_arrays = [all_keys, seen_test, unseen_test, seen_val, unseen_val]
    for ids, embs in zip(id_lists, emb_arrays):
        for sample_id, emb in zip(ids, embs):
            id_to_emb_dict[sample_id] = np.array([emb])
    return id_to_emb_dict


@click.command()
@click.option(
    "--input",
    type=click.Path(path_type=Path),
    default="bioscan-clip-scripts/extracted_features",
    help="Path to extracted features",
)
@click.option(
    "--metadata", type=click.Path(path_type=Path), default="data/BIOSCAN_5M/BIOSCAN_5M.hdf5", help="Path to metadata"
)
@click.option(
    "--output", type=click.Path(path_type=Path), default="bioscan-clip-scripts/index", help="Path to save the index"
)
def main(input, metadata, output):
    # initialize data
    all_keys = h5py.File(input / "extracted_features_of_all_keys.hdf5", "r", libver="latest")
    all_keys_dna = all_keys["encoded_dna_feature"][:]
    all_keys_im = all_keys["encoded_image_feature"][:]
    seen_test = h5py.File(input / "extracted_features_of_seen_test.hdf5", "r", libver="latest")
    seen_test_dna = seen_test["encoded_dna_feature"][:]
    seen_test_im = seen_test["encoded_image_feature"][:]
    unseen_test = h5py.File(input / "extracted_features_of_unseen_test.hdf5", "r", libver="latest")
    unseen_test_dna = unseen_test["encoded_dna_feature"][:]
    unseen_test_im = unseen_test["encoded_image_feature"][:]
    seen_val = h5py.File(input / "extracted_features_of_seen_val.hdf5", "r", libver="latest")
    seen_val_dna = seen_val["encoded_dna_feature"][:]
    seen_val_im = seen_val["encoded_image_feature"][:]
    unseen_val = h5py.File(input / "extracted_features_of_unseen_val.hdf5", "r", libver="latest")
    unseen_val_dna = unseen_val["encoded_dna_feature"][:]
    unseen_val_im = unseen_val["encoded_image_feature"][:]

    dataset = h5py.File(metadata, "r", libver="latest")
    id_field = "sampleid"  # "processid"
    allid = [item.decode("utf-8") for item in dataset["all_keys"][id_field][:]]
    stid = [item.decode("utf-8") for item in dataset["test_seen"][id_field][:]]
    utid = [item.decode("utf-8") for item in dataset["test_unseen"][id_field][:]]
    svalid = [item.decode("utf-8") for item in dataset["val_seen"][id_field][:]]
    uvalid = [item.decode("utf-8") for item in dataset["val_unseen"][id_field][:]]
    all_keys = dataset["all_keys"]
    seen_test = dataset["test_seen"]
    unseen_test = dataset["test_unseen"]
    seen_val = dataset["val_seen"]
    unseen_val = dataset["val_unseen"]
    # d = getIdToEmbedding(allid, stid, utid, svalid, uvalid, all_keys_dna, seen_test_dna, unseen_test_dna, seen_val_dna, unseen_val_dna)
    # d = getIdToEmbedding(allid, stid, utid, svalid, uvalid, all_keys_im, seen_test_im, unseen_test_im, seen_val_im, unseen_val_im)

    # Map each sample id to its (1, 768) image embedding.
    big_id_to_image_emb_dict = dict()
    image_splits = [
        (allid, all_keys_im),
        (stid, seen_test_im),
        (utid, unseen_test_im),
        (svalid, seen_val_im),
        (uvalid, unseen_val_im),
    ]
    for ids, embeddings in image_splits:
        for sample_id, emb in zip(ids, embeddings):
            big_id_to_image_emb_dict[sample_id] = np.array([emb])

    # Map each sample id to its (1, 768) DNA embedding.
    big_id_to_dna_emb_dict = dict()
    dna_splits = [
        (allid, all_keys_dna),
        (stid, seen_test_dna),
        (utid, unseen_test_dna),
        (svalid, seen_val_dna),
        (uvalid, unseen_val_dna),
    ]
    for ids, embeddings in dna_splits:
        for sample_id, emb in zip(ids, embeddings):
            big_id_to_dna_emb_dict[sample_id] = np.array([emb])

    # Assign every sample a single running index across all splits and keep both
    # directions of the mapping (index -> id and id -> index).
    processid_to_indx = dict()
    big_indx_to_id_dict = dict()
    i = 0
    for ids in [allid, stid, utid, svalid, uvalid]:
        for sample_id in ids:
            big_indx_to_id_dict[i] = sample_id
            processid_to_indx[sample_id] = i
            i += 1

    with open(output / "big_id_to_image_emb_dict.pickle", "wb") as f:
        pickle.dump(big_id_to_image_emb_dict, f)
    with open(output / "big_id_to_dna_emb_dict.pickle", "wb") as f:
        pickle.dump(big_id_to_dna_emb_dict, f)
    with open(output / "big_indx_to_id_dict.pickle", "wb") as f:
        pickle.dump(big_indx_to_id_dict, f)
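

# Illustrative sketch, not part of the original script: how a consumer (e.g. the
# browser backend) might load the pickles written above and fetch an embedding by
# sample id. The function name and the `index_dir` argument are hypothetical.
def _example_load_lookup(index_dir: Path):
    with open(index_dir / "big_id_to_image_emb_dict.pickle", "rb") as f:
        id_to_image_emb = pickle.load(f)
    with open(index_dir / "big_indx_to_id_dict.pickle", "rb") as f:
        indx_to_id = pickle.load(f)
    # e.g. the (1, 768) image embedding of the sample stored at row 0 of an index
    return id_to_image_emb[indx_to_id[0]]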


if __name__ == "__main__":
    main()