File size: 3,108 Bytes
1ebe7e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import time

from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

database_name = "airbnb_dataset"
collection_name = "listings_reviews"

def get_mongo_client(mongo_uri):
    """Create and return a MongoClient connected to the given URI."""
    # The client is the gateway to every database/collection operation
    # on the cluster; the appname tags this connection for Atlas telemetry.
    connection = MongoClient(
        mongo_uri,
        appname="devrel.deeplearningai.lesson1.python",
    )
    print("Connection to MongoDB successful")
    return connection

# Connection string for the MongoDB cluster. Read from the environment so
# credentials never live in the source; warn (without crashing) when it is
# missing so the module can still be imported and inspected.
MONGO_URI = os.environ.get("MONGO_URI")
if not MONGO_URI:
    print("MONGO_URI not set in environment variables")

def create_db():
    """Connect to MongoDB, select the working collection, and wipe it clean.

    Publishes ``db`` and ``collection`` as module-level globals because
    ``ingest_data`` and ``create_vector_search_index`` read the module-level
    ``collection``; also returns the collection for direct use.

    Returns:
        The (emptied) pymongo Collection for ``listings_reviews``.
    """
    # Module-level handles so the sibling functions can find the collection.
    global db, collection

    mongo_client = get_mongo_client(MONGO_URI)

    # Pymongo handles to the target database and collection
    db = mongo_client.get_database(database_name)
    collection = db.get_collection(collection_name)

    # Start from a clean slate: delete any existing records in the collection
    collection.delete_many({})
    return collection

def ingest_data(docs=None, target_collection=None):
    """Insert documents into the MongoDB collection.

    Args:
        docs: Iterable of documents to insert. Defaults to the module-level
            ``listings`` (presumably loaded by the surrounding lesson code —
            TODO confirm it is defined before calling with no args).
        target_collection: pymongo Collection to insert into. Defaults to the
            module-level ``collection`` created by ``create_db``.
    """
    if docs is None:
        docs = listings  # module-level dataset, expected from the caller's setup
    if target_collection is None:
        target_collection = collection  # set by create_db()

    # The ingestion process might take a few minutes
    target_collection.insert_many(docs)
    print("Data ingestion into MongoDB completed")

def create_vector_search_index(target_collection=None):
    """Create the Atlas Vector Search index over the text embeddings.

    Idempotent: if an index with the expected name already exists, nothing
    is created.

    Args:
        target_collection: pymongo Collection to index. Defaults to the
            module-level ``collection`` created by ``create_db``.
    """
    if target_collection is None:
        target_collection = collection  # set by create_db()

    # NOTE: This dataset contains text and image embeddings, but this lesson only uses the text embeddings
    # The field containing the text embeddings on each document within the listings_reviews collection
    text_embedding_field_name = "text_embeddings"
    # MongoDB Atlas Vector Search index name
    vector_search_index_name_text = "vector_index_text"

    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {  # describes how fields in the database documents are indexed and stored
                "dynamic": True,  # automatically index new fields that appear in the document
                "fields": {  # properties of the fields that will be indexed
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector
                        "similarity": "cosine",  # algorithm used to compute the similarity between vectors
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name_text,  # identifier for the vector search index
    )

    # Check if the index already exists
    index_exists = False
    for index in target_collection.list_indexes():
        print(index)  # show existing indexes while scanning
        if index["name"] == vector_search_index_name_text:
            index_exists = True
            break

    # Create the index only if it doesn't exist yet
    if not index_exists:
        try:
            result = target_collection.create_search_index(model=vector_search_index_model)
            print("Creating index...")
            # Give the vector index time to complete its initial sync before use
            time.sleep(20)
            print("Index created successfully:", result)
            print("Wait a few minutes before conducting search with index to ensure index initialization")
        except Exception as e:
            print(f"Error creating vector search index: {str(e)}")
    else:
        print(f"Index '{vector_search_index_name_text}' already exists.")