Spaces:
Sleeping
Sleeping
import chromadb | |
from sentence_transformers import CrossEncoder, SentenceTransformer | |
import json | |
def chroma_client_setup():
    """Create a Chroma client and return the ``food_collection`` collection.

    The collection is configured to use cosine distance for its HNSW index
    instead of Chroma's default (L2).

    ``get_or_create_collection`` is used rather than ``create_collection``
    so that calling this function again while a collection named
    ``food_collection`` already exists returns that collection instead of
    raising.

    Returns:
        The Chroma collection named ``food_collection``.
    """
    print("Setup client")
    chroma_client = chromadb.Client()
    collection = chroma_client.get_or_create_collection(
        name="food_collection",
        metadata={"hnsw:space": "cosine"},  # l2 is the default
    )
    return collection
def load_data():
    """Load and return the parsed contents of ``test_json.json``.

    The path is relative, so the file is looked up in the current working
    directory.  It is opened as UTF-8 explicitly so that non-ASCII text is
    decoded the same way regardless of the platform's locale encoding.

    Returns:
        The deserialized JSON value.

    Raises:
        FileNotFoundError: If ``test_json.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    print("load data")
    with open("test_json.json", "r", encoding="utf-8") as f:
        return json.load(f)
# Shared embedding model; populated on first use by _get_sentence_model().
_SENTENCE_MODEL = None


def _get_sentence_model():
    """Return the shared SentenceTransformer, constructing it on first use."""
    global _SENTENCE_MODEL
    if _SENTENCE_MODEL is None:
        _SENTENCE_MODEL = SentenceTransformer(
            "mixedbread-ai/mxbai-embed-large-v1"
        )
    return _SENTENCE_MODEL


def embedding_function(items_to_embed: list[str]):
    """Embed each string and return the vectors as plain Python lists.

    The model is constructed once and reused across calls instead of being
    rebuilt on every invocation.

    Args:
        items_to_embed: Texts to encode.

    Returns:
        One ``.tolist()``-converted embedding per input item, in input
        order.  An empty input yields an empty list.
    """
    print("embedding")
    if not items_to_embed:
        # Nothing to encode; the diagnostic prints below index element
        # [0][0] and would fail on an empty result.
        return []

    embedded_items = _get_sentence_model().encode(items_to_embed)
    # Diagnostic output: count and element types before conversion.
    print(len(embedded_items))
    print(type(embedded_items[0]))
    print(type(embedded_items[0][0]))

    embedded_list = [item.tolist() for item in embedded_items]
    # Diagnostic output: count and element types after conversion.
    print(len(embedded_list))
    print(type(embedded_list[0]))
    print(type(embedded_list[0][0]))
    return embedded_list
def chroma_upserting(collection, payload: list[dict]):
    """Embed the ``'doc'`` field of each payload item and upsert it.

    Ids are derived from each item's position in ``payload`` (``id_0``,
    ``id_1``, ...).  ``collection.upsert`` is used so that calling this
    again with the same positions replaces the stored entries, matching the
    function's name.

    Args:
        collection: Chroma collection to write to.
        payload: Items to store; each must have a ``'doc'`` key holding the
            text to embed.

    Returns:
        None.  An empty payload is a no-op.

    Raises:
        KeyError: If an item has no ``'doc'`` key.
    """
    print('upserting')
    print("printing item:")
    if not payload:
        # Nothing to embed or store.
        return

    # Build the document list once and reuse it for embedding and storage.
    documents = [item['doc'] for item in payload]
    embedding = embedding_function(documents)
    print(type(embedding))
    # NOTE: per-item metadata is not stored; only documents and embeddings.
    collection.upsert(
        documents=documents,
        embeddings=embedding,
        ids=[f"id_{idx}" for idx in range(len(documents))],
    )
def search_chroma(collection, query: str, n_results: int = 5):
    """Query the collection with the embedding of ``query``.

    Args:
        collection: Chroma collection to search.
        query: Text to embed and search with.
        n_results: Number of results to request; defaults to 5, the value
            that was previously hard-coded.

    Returns:
        The result object returned by ``collection.query``.
    """
    query_embeddings = embedding_function([query])
    return collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results,
    )
# Shared reranker; populated on first use by _get_rerank_model().
_RERANK_MODEL = None


def _get_rerank_model():
    """Return the shared CrossEncoder reranker, constructing it on first use."""
    global _RERANK_MODEL
    if _RERANK_MODEL is None:
        # The xsmall variant of the mxbai reranker.
        _RERANK_MODEL = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
    return _RERANK_MODEL


def reranking_results(query: str, top_k_results: list[str]):
    """Rerank candidate documents against ``query`` with a cross-encoder.

    The model is constructed once and reused across calls instead of being
    reloaded on every invocation.

    Args:
        query: The search query.
        top_k_results: Candidate document texts to rerank.

    Returns:
        The value of ``CrossEncoder.rank(..., return_documents=True)`` for
        the candidates, or an empty list when there are no candidates.
    """
    if not top_k_results:
        # Nothing to rank; avoid loading the model at all.
        return []
    return _get_rerank_model().rank(
        query, top_k_results, return_documents=True
    )