timeki's picture
Add content recommandation (#17)
bcc8503 verified
import io
import os
from PIL import Image
from azure.storage.blob import ContainerClient
def get_file_from_azure_blob_storage(path):
AZURE_SAS_URL_TRD = os.environ["AZURE_SAS_URL_TRD"]
container_client = ContainerClient.from_container_url(AZURE_SAS_URL_TRD)
blob_client = container_client.get_blob_client(path)
stream = blob_client.download_blob().readall()
file_object = io.BytesIO(stream)
return file_object
def get_image_from_azure_blob_storage(path):
base_path = "climateqa/documents/"
path = os.path.join(base_path, path)
file_object = get_file_from_azure_blob_storage(path)
image = Image.open(file_object)
return image
def remove_duplicates_keep_highest_score(documents):
unique_docs = {}
for doc in documents:
doc_id = doc.metadata.get('doc_id')
if doc_id in unique_docs:
if doc.metadata['reranking_score'] > unique_docs[doc_id].metadata['reranking_score']:
unique_docs[doc_id] = doc
else:
unique_docs[doc_id] = doc
return list(unique_docs.values())