Spaces:
Running
Running
import pickle | |
from pathlib import Path | |
import numpy as np | |
import h5py | |
import faiss | |
import click | |
def getFlatIP():
    """Create a new Faiss flat inner-product index for 768-dimensional vectors.

    Returns:
        A freshly constructed ``faiss.IndexFlatIP(768)``; nothing is added to it here.
    """
    return faiss.IndexFlatIP(768)
def getFlatL2():
    """Create a new Faiss flat L2 index for 768-dimensional vectors.

    Returns:
        A freshly constructed ``faiss.IndexFlatL2(768)``; nothing is added to it here.
    """
    return faiss.IndexFlatL2(768)
def getIVFFlat(all_keys, seen_test, unseen_test, seen_val, unseen_val):
    """Build a 768-d IVF-Flat index with 128 lists and call ``train`` on each input set.

    The coarse quantizer is a flat inner-product index. ``train`` is invoked
    once per argument, in the order the arguments are listed. No vectors are
    added to the index here.

    Args:
        all_keys, seen_test, unseen_test, seen_val, unseen_val: vector sets
            handed to ``train`` one after another.
            (presumably float32 arrays of shape (n, 768) -- confirm with caller)

    Returns:
        The ``faiss.IndexIVFFlat`` instance after the five ``train`` calls.
    """
    coarse_quantizer = faiss.IndexFlatIP(768)
    # NOTE(review): no metric is passed here although the quantizer is
    # inner-product -- confirm which metric the IVF index is meant to use.
    ivf_index = faiss.IndexIVFFlat(coarse_quantizer, 768, 128)
    # NOTE(review): it is worth confirming whether Faiss does anything on the
    # second and later train() calls; if only the first one takes effect,
    # training once on the stacked vectors may be what was intended.
    for vectors in (all_keys, seen_test, unseen_test, seen_val, unseen_val):
        ivf_index.train(vectors)
    return ivf_index
def getHNSW():
    """Create a 768-d HNSW flat index with the search/build depths used by this project.

    Settings:
        16 -- connections for each vertex.
        efSearch = 32 -- depth of search at query time.
        efConstruction = 64 -- depth of search while the graph is built.

    Returns:
        The configured ``faiss.IndexHNSWFlat``; nothing is added to it here.
    """
    hnsw_index = faiss.IndexHNSWFlat(768, 16)
    # The two depth settings are independent of each other.
    hnsw_index.hnsw.efConstruction = 64
    hnsw_index.hnsw.efSearch = 32
    return hnsw_index
def getLSH():
    """Create a Faiss LSH index for 768-dimensional vectors.

    The second constructor argument is twice the dimensionality (1536).

    Returns:
        A freshly constructed ``faiss.IndexLSH``; nothing is added to it here.
    """
    dim = 768
    return faiss.IndexLSH(dim, dim * 2)
def getIdToEmbedding(allid, stid, utid, svalid, uvalid, all_keys, seen_test, unseen_test, seen_val, unseen_val):
    """Map every ID to its embedding row across the five splits.

    Each ID sequence is paired positionally with its own embedding
    collection: ``allid`` with ``all_keys``, ``stid`` with ``seen_test``,
    ``utid`` with ``unseen_test``, ``svalid`` with ``seen_val`` and
    ``uvalid`` with ``unseen_val``. The n-th ID of a split is matched with
    the n-th row of that split's embeddings.

    Args:
        allid, stid, utid, svalid, uvalid: sequences of hashable IDs.
        all_keys, seen_test, unseen_test, seen_val, unseen_val: indexable
            collections of embedding rows, one per split.

    Returns:
        dict mapping each ID to ``np.array([row])``, i.e. the row wrapped in
        a leading axis of length 1. When an ID occurs in more than one split
        the value from the later split wins.

    Raises:
        IndexError: if a split has more IDs than embedding rows (raised by
            the embedding container on the out-of-range lookup).
    """
    splits = (
        (allid, all_keys),
        (stid, seen_test),
        (utid, unseen_test),
        (svalid, seen_val),
        (uvalid, unseen_val),
    )
    id_to_emb_dict = {}
    for ids, embeddings in splits:
        # Row numbering restarts at 0 for every split: each embedding
        # collection is indexed from its own beginning, not by a counter
        # that keeps running across splits.
        for row, sample_id in enumerate(ids):
            id_to_emb_dict[sample_id] = np.array([embeddings[row]])
    return id_to_emb_dict
def _read_split_features(path):
    """Load the DNA and image feature arrays from one HDF5 file, then close it."""
    with h5py.File(path, "r", libver="latest") as features:
        dna = features["encoded_dna_feature"][:]
        image = features["encoded_image_feature"][:]
    return dna, image


def _read_split_ids(dataset, group, id_field):
    """Decode the UTF-8 byte-string IDs stored under ``dataset[group][id_field]``."""
    return [item.decode("utf-8") for item in dataset[group][id_field][:]]


def _ids_to_rows(ids_and_embeddings):
    """Build ``{id: np.array([row])}`` from (ids, embeddings) pairs, taken in order."""
    mapping = {}
    for ids, embeddings in ids_and_embeddings:
        # The n-th ID of a split belongs to the n-th row of that split.
        for row, sample_id in enumerate(ids):
            mapping[sample_id] = np.array([embeddings[row]])
    return mapping


def main(input, metadata, output):
    """Build ID/embedding lookup tables from the feature files and pickle them.

    For each of the five splits (all_keys, seen_test, unseen_test, seen_val,
    unseen_val) the ``encoded_dna_feature`` and ``encoded_image_feature``
    arrays are read from ``input / "extracted_features_of_<split>.hdf5"``,
    and the matching IDs are read from the ``sampleid`` field of the
    metadata file. Three pickles are then written into ``output``:

    * ``big_id_to_image_emb_dict.pickle`` -- ID -> ``np.array([image_row])``
    * ``big_id_to_dna_emb_dict.pickle``   -- ID -> ``np.array([dna_row])``
    * ``big_indx_to_id_dict.pickle``      -- running position -> ID, counted
      continuously across the splits in the order listed above.

    If an ID appears in more than one split, the ID -> embedding tables keep
    the entry from the later split.

    Args:
        input: directory holding the feature files; must support the ``/``
            operator (e.g. ``pathlib.Path``).
        metadata: path of the metadata HDF5 file containing the groups
            ``all_keys``, ``test_seen``, ``test_unseen``, ``val_seen`` and
            ``val_unseen``.
        output: directory the pickles are written to; must support ``/``.
    """
    id_field = "sampleid"  # or "processid"

    # (metadata group, feature file) per split, in indexing order.
    splits = (
        ("all_keys", "extracted_features_of_all_keys.hdf5"),
        ("test_seen", "extracted_features_of_seen_test.hdf5"),
        ("test_unseen", "extracted_features_of_unseen_test.hdf5"),
        ("val_seen", "extracted_features_of_seen_val.hdf5"),
        ("val_unseen", "extracted_features_of_unseen_val.hdf5"),
    )

    # Feature arrays are copied into memory, so each file can be closed
    # as soon as it has been read.
    dna_by_split = []
    image_by_split = []
    for _, filename in splits:
        dna, image = _read_split_features(input / filename)
        dna_by_split.append(dna)
        image_by_split.append(image)

    with h5py.File(metadata, "r", libver="latest") as dataset:
        ids_by_split = [_read_split_ids(dataset, group, id_field) for group, _ in splits]

    big_id_to_image_emb_dict = _ids_to_rows(zip(ids_by_split, image_by_split))
    big_id_to_dna_emb_dict = _ids_to_rows(zip(ids_by_split, dna_by_split))

    # Position -> ID, numbered continuously over all splits.
    big_indx_to_id_dict = dict(
        enumerate(sample_id for ids in ids_by_split for sample_id in ids)
    )

    outputs = (
        ("big_id_to_image_emb_dict.pickle", big_id_to_image_emb_dict),
        ("big_id_to_dna_emb_dict.pickle", big_id_to_dna_emb_dict),
        ("big_indx_to_id_dict.pickle", big_indx_to_id_dict),
    )
    for filename, payload in outputs:
        with open(output / filename, "wb") as f:
            pickle.dump(payload, f)
if __name__ == "__main__":
    # main() requires input, metadata and output, so calling it bare raises
    # TypeError. Expose the three values as positional command-line
    # arguments and let click convert them to pathlib.Path objects, which
    # main() needs for its "/" path joins.
    cli = click.Command(
        name="main",
        callback=main,
        params=[
            click.Argument(["input"], type=click.Path(path_type=Path)),
            click.Argument(["metadata"], type=click.Path(path_type=Path)),
            click.Argument(["output"], type=click.Path(path_type=Path)),
        ],
    )
    cli()