Spaces:
Sleeping
Sleeping
import chromadb | |
from sentence_transformers import CrossEncoder, SentenceTransformer | |
import json | |
def chroma_client_setup():
    """Create a Chroma client and return the ``food_collection`` collection.

    Returns:
        The collection named ``food_collection``, requested with cosine
        distance (``hnsw:space``) instead of Chroma's default l2.
    """
    print("Setup client")
    chroma_client = chromadb.Client()
    # get_or_create_collection keeps setup idempotent: create_collection
    # raises when "food_collection" already exists, e.g. if this function
    # is called a second time against the same Chroma backend.
    collection = chroma_client.get_or_create_collection(
        name="food_collection",
        metadata={"hnsw:space": "cosine"},  # l2 is the default
    )
    return collection
def load_data():
    """Load and return the parsed contents of ``test_json.json``.

    The path is relative, so the file is resolved against the current
    working directory.

    Returns:
        The deserialized JSON value.

    Raises:
        FileNotFoundError: If ``test_json.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    print("load data")
    # JSON is UTF-8 by specification; without an explicit encoding open()
    # uses the platform locale and can mis-decode non-ASCII text.
    with open("test_json.json", "r", encoding="utf-8") as f:
        return json.load(f)
def _get_sentence_model():
    """Return the shared embedding model, loading it on first use."""
    model = getattr(_get_sentence_model, "_model", None)
    if model is None:
        model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
        # Cached on the function object so later calls skip the model load.
        _get_sentence_model._model = model
    return model


def embedding_function(items_to_embed: list[str]):
    """Embed text with the ``mixedbread-ai/mxbai-embed-large-v1`` model.

    Args:
        items_to_embed: The strings to embed.

    Returns:
        The value returned by ``SentenceTransformer.encode`` for the input
        (presumably one vector per input string -- confirm against the
        sentence-transformers documentation).
    """
    print("embedding")
    embedded_items = _get_sentence_model().encode(items_to_embed)
    print(len(embedded_items))
    # Debug output of the element types. Guarded so an empty result, or a
    # flat result whose elements are scalars, does not raise IndexError.
    if len(embedded_items) > 0:
        first = embedded_items[0]
        print(type(first))
        if getattr(first, "ndim", 1) >= 1 and len(first) > 0:
            print(type(first[0]))
    return embedded_items
def chroma_upserting(collection, payload: list[dict]):
    """Embed each payload item's ``doc`` text and upsert it into ``collection``.

    Every item is stored under the id ``id_<index>``, where ``<index>`` is
    its position in ``payload``.

    Args:
        collection: Chroma collection to write to.
        payload: Items to store. Each must have a ``"doc"`` key
            (assumed to hold the text to embed -- confirm against the
            JSON schema).

    Raises:
        KeyError: If an item has no ``"doc"`` key.
    """
    print('upserting')
    if not payload:
        # Nothing to embed or store.
        return

    documents = [item['doc'] for item in payload]
    # One batched call instead of one embedding call per item.
    vectors = embedding_function(documents)
    collection.upsert(
        ids=[f"id_{idx}" for idx in range(len(documents))],
        documents=documents,
        # Convert to plain Python floats so the embeddings are ordinary
        # lists of numbers regardless of the array type returned above.
        embeddings=[[float(value) for value in vector] for vector in vectors],
    )
def search_chroma(collection, query: str, n_results: int = 5):
    """Embed ``query`` and return its nearest matches from ``collection``.

    Args:
        collection: Chroma collection to search.
        query: Free-text search query.
        n_results: Maximum number of matches to request. Defaults to 5.

    Returns:
        The result object produced by ``collection.query``.
    """
    query_vectors = embedding_function([query])
    return collection.query(
        # Plain Python floats, so the query is an ordinary list of number
        # lists regardless of the array type the embedder returns.
        query_embeddings=[
            [float(value) for value in vector] for vector in query_vectors
        ],
        n_results=n_results,
    )
def _get_rerank_model():
    """Return the shared reranking model, loading it on first use."""
    model = getattr(_get_rerank_model, "_model", None)
    if model is None:
        # The extra-small mxbai reranker variant.
        model = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")
        # Cached on the function object so later calls skip the model load.
        _get_rerank_model._model = model
    return model


def reranking_results(query: str, top_k_results: list[str]):
    """Rerank candidate documents against ``query`` with a cross-encoder.

    Args:
        query: The search query.
        top_k_results: Candidate document texts to rerank.

    Returns:
        The value returned by ``CrossEncoder.rank`` with
        ``return_documents=True``, or an empty list when there are no
        candidates to rank.
    """
    if not top_k_results:
        # No candidates: skip the model load and the rank call.
        return []
    return _get_rerank_model().rank(query, top_k_results, return_documents=True)