Spaces:
Running
on
T4
Running
on
T4
import os, sys | |
import numpy as np | |
import scann | |
import argparse | |
import glob | |
from multiprocessing import cpu_count | |
from tqdm import tqdm | |
from ldm.util import parallel_data_prefetch | |
def search_bruteforce(searcher): | |
return searcher.score_brute_force().build() | |
def search_partioned_ah(searcher, dims_per_block, aiq_threshold, reorder_k, | |
partioning_trainsize, num_leaves, num_leaves_to_search): | |
return searcher.tree(num_leaves=num_leaves, | |
num_leaves_to_search=num_leaves_to_search, | |
training_sample_size=partioning_trainsize). \ | |
score_ah(dims_per_block, anisotropic_quantization_threshold=aiq_threshold).reorder(reorder_k).build() | |
def search_ah(searcher, dims_per_block, aiq_threshold, reorder_k): | |
return searcher.score_ah(dims_per_block, anisotropic_quantization_threshold=aiq_threshold).reorder( | |
reorder_k).build() | |
def load_datapool(dpath): | |
def load_single_file(saved_embeddings): | |
compressed = np.load(saved_embeddings) | |
database = {key: compressed[key] for key in compressed.files} | |
return database | |
def load_multi_files(data_archive): | |
database = {key: [] for key in data_archive[0].files} | |
for d in tqdm(data_archive, desc=f'Loading datapool from {len(data_archive)} individual files.'): | |
for key in d.files: | |
database[key].append(d[key]) | |
return database | |
print(f'Load saved patch embedding from "{dpath}"') | |
file_content = glob.glob(os.path.join(dpath, '*.npz')) | |
if len(file_content) == 1: | |
data_pool = load_single_file(file_content[0]) | |
elif len(file_content) > 1: | |
data = [np.load(f) for f in file_content] | |
prefetched_data = parallel_data_prefetch(load_multi_files, data, | |
n_proc=min(len(data), cpu_count()), target_data_type='dict') | |
data_pool = {key: np.concatenate([od[key] for od in prefetched_data], axis=1)[0] for key in prefetched_data[0].keys()} | |
else: | |
raise ValueError(f'No npz-files in specified path "{dpath}" is this directory existing?') | |
print(f'Finished loading of retrieval database of length {data_pool["embedding"].shape[0]}.') | |
return data_pool | |
def train_searcher(opt, | |
metric='dot_product', | |
partioning_trainsize=None, | |
reorder_k=None, | |
# todo tune | |
aiq_thld=0.2, | |
dims_per_block=2, | |
num_leaves=None, | |
num_leaves_to_search=None,): | |
data_pool = load_datapool(opt.database) | |
k = opt.knn | |
if not reorder_k: | |
reorder_k = 2 * k | |
# normalize | |
# embeddings = | |
searcher = scann.scann_ops_pybind.builder(data_pool['embedding'] / np.linalg.norm(data_pool['embedding'], axis=1)[:, np.newaxis], k, metric) | |
pool_size = data_pool['embedding'].shape[0] | |
print(*(['#'] * 100)) | |
print('Initializing scaNN searcher with the following values:') | |
print(f'k: {k}') | |
print(f'metric: {metric}') | |
print(f'reorder_k: {reorder_k}') | |
print(f'anisotropic_quantization_threshold: {aiq_thld}') | |
print(f'dims_per_block: {dims_per_block}') | |
print(*(['#'] * 100)) | |
print('Start training searcher....') | |
print(f'N samples in pool is {pool_size}') | |
# this reflects the recommended design choices proposed at | |
# https://github.com/google-research/google-research/blob/aca5f2e44e301af172590bb8e65711f0c9ee0cfd/scann/docs/algorithms.md | |
if pool_size < 2e4: | |
print('Using brute force search.') | |
searcher = search_bruteforce(searcher) | |
elif 2e4 <= pool_size and pool_size < 1e5: | |
print('Using asymmetric hashing search and reordering.') | |
searcher = search_ah(searcher, dims_per_block, aiq_thld, reorder_k) | |
else: | |
print('Using using partioning, asymmetric hashing search and reordering.') | |
if not partioning_trainsize: | |
partioning_trainsize = data_pool['embedding'].shape[0] // 10 | |
if not num_leaves: | |
num_leaves = int(np.sqrt(pool_size)) | |
if not num_leaves_to_search: | |
num_leaves_to_search = max(num_leaves // 20, 1) | |
print('Partitioning params:') | |
print(f'num_leaves: {num_leaves}') | |
print(f'num_leaves_to_search: {num_leaves_to_search}') | |
# self.searcher = self.search_ah(searcher, dims_per_block, aiq_thld, reorder_k) | |
searcher = search_partioned_ah(searcher, dims_per_block, aiq_thld, reorder_k, | |
partioning_trainsize, num_leaves, num_leaves_to_search) | |
print('Finish training searcher') | |
searcher_savedir = opt.target_path | |
os.makedirs(searcher_savedir, exist_ok=True) | |
searcher.serialize(searcher_savedir) | |
print(f'Saved trained searcher under "{searcher_savedir}"') | |
if __name__ == '__main__': | |
sys.path.append(os.getcwd()) | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--database', | |
'-d', | |
default='data/rdm/retrieval_databases/openimages', | |
type=str, | |
help='path to folder containing the clip feature of the database') | |
parser.add_argument('--target_path', | |
'-t', | |
default='data/rdm/searchers/openimages', | |
type=str, | |
help='path to the target folder where the searcher shall be stored.') | |
parser.add_argument('--knn', | |
'-k', | |
default=20, | |
type=int, | |
help='number of nearest neighbors, for which the searcher shall be optimized') | |
opt, _ = parser.parse_known_args() | |
train_searcher(opt,) |