"""Helpers for populating a MongoDB collection and indexing it for vector search.

The functions share one collection handle (``collection``), resolved from
``MONGO_URI`` / ``database_name`` / ``collection_name``.
"""
import os
import time

from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

database_name = "airbnb_dataset"
collection_name = "listings_reviews"

# Connection string, read from the environment; None when the variable is unset.
MONGO_URI = os.environ.get("MONGO_URI")

if not MONGO_URI:
    print("MONGO_URI not set in environment variables")

# Handle to the working collection, shared by the functions below. It is set
# by create_db() or, on first use, by _get_collection().
collection = None


def get_mongo_client(mongo_uri):
    """Establish connection to the MongoDB.

    Args:
        mongo_uri: MongoDB connection string passed straight to ``MongoClient``.

    Returns:
        The ``MongoClient`` instance.

    NOTE: ``MongoClient`` connects lazily, so the success message printed here
    does not prove that the server is actually reachable.
    """
    # gateway to interacting with a MongoDB database cluster
    client = MongoClient(mongo_uri, appname="devrel.deeplearningai.lesson1.python")
    print("Connection to MongoDB successful")
    return client


def _get_collection():
    """Return the shared collection handle, connecting on first use."""
    global collection
    # pymongo Collection objects do not support truth-value testing, so the
    # comparison must be an explicit identity check against None.
    if collection is None:
        mongo_client = get_mongo_client(MONGO_URI)
        db = mongo_client.get_database(database_name)
        collection = db.get_collection(collection_name)
    return collection


def create_db():
    """Connect to the target collection and delete every document in it.

    The collection handle is stored in the module-level ``collection`` so that
    ``ingest_data`` and ``create_vector_search_index`` operate on the same
    collection.
    """
    global collection
    mongo_client = get_mongo_client(MONGO_URI)

    # Pymongo client of database and collection
    db = mongo_client.get_database(database_name)
    collection = db.get_collection(collection_name)

    # Delete any existing records in the collection
    collection.delete_many({})


def ingest_data(documents=None):
    """Insert documents into the shared collection.

    Args:
        documents: Documents to insert. When omitted, the module-level name
            ``listings`` is used.
            NOTE(review): ``listings`` is not defined in the visible part of
            this module - confirm where it is supposed to come from.
    """
    if documents is None:
        # Zero-argument call: fall back to the module-level data set.
        documents = listings

    # The ingestion process might take a few minutes
    _get_collection().insert_many(documents)
    print("Data ingestion into MongoDB completed")


def create_vector_search_index():
    """Create the Atlas Vector Search index on the text embeddings if missing.

    Looks for an existing search index with the target name and only creates
    a new one when none is found. Creation errors are printed, not raised.
    """
    # NOTE: This dataset contains text and image embeddings, but this lesson
    # only uses the text embeddings.

    # The field containing the text embeddings on each document within the
    # listings_reviews collection
    text_embedding_field_name = "text_embeddings"

    # MongoDB Atlas Vector Search index name
    vector_search_index_name_text = "vector_index_text"

    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {  # describes how fields in the database documents are indexed and stored
                "dynamic": True,  # automatically index new fields that appear in the document
                "fields": {  # properties of the fields that will be indexed.
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector.
                        "similarity": "cosine",  # algorithm used to compute the similarity between vectors
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name_text,  # identifier for the vector search index
    )

    target = _get_collection()

    # Check if the index already exists. Atlas Search indexes are reported by
    # list_search_indexes(); list_indexes() only returns ordinary database
    # indexes and would never contain the vector index.
    index_exists = False
    for index in target.list_search_indexes():
        print(index)
        if index['name'] == vector_search_index_name_text:
            index_exists = True
            break

    # Create the index if it doesn't exist
    if not index_exists:
        try:
            result = target.create_search_index(model=vector_search_index_model)
            print("Creating index...")
            # Sleep for 20 seconds to give the vector index time to complete
            # its initial sync before it is used.
            time.sleep(20)
            print("Index created successfully:", result)
            print("Wait a few minutes before conducting search with index to ensure index intialization")
        except Exception as e:
            print(f"Error creating vector search index: {str(e)}")
    else:
        print(f"Index '{vector_search_index_name_text}' already exists.")