# advanced-rag / mongodb.py
# Source: Hugging Face file view (author bstraehle, commit 1ebe7e2 "Create mongodb.py", 3.11 kB)
import os
import time

from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
# Name of the MongoDB database that create_db() opens.
database_name = "airbnb_dataset"
# Name of the collection, inside that database, that create_db() opens and clears.
collection_name = "listings_reviews"
def get_mongo_client(mongo_uri):
    """Create a PyMongo client for the given connection string.

    Args:
        mongo_uri: MongoDB connection string of the target cluster.

    Returns:
        The ``MongoClient`` used as the gateway to the database cluster.

    Raises:
        ValueError: If ``mongo_uri`` is empty or ``None``.
    """
    if not mongo_uri:
        # Without this guard MongoClient(None) falls back to its default host
        # (localhost) and the success message below would still be printed,
        # hiding a missing configuration value.
        raise ValueError("mongo_uri must be a non-empty MongoDB connection string")

    # gateway to interacting with a MongoDB database cluster
    client = MongoClient(mongo_uri, appname="devrel.deeplearningai.lesson1.python")
    # NOTE(review): PyMongo connects lazily, so this message confirms that the
    # client object was built, not that the server is reachable -- confirm
    # whether an explicit ping is wanted here.
    print("Connection to MongoDB successful")
    return client
# MongoDB connection string, read from the MONGO_URI environment variable.
# It is None when the variable is unset; the check below only reports that
# condition and does not stop the import.
MONGO_URI = os.environ.get("MONGO_URI")

if not MONGO_URI:
    print("MONGO_URI not set in environment variables")
def create_db():
    """Connect to MongoDB and empty the listings collection.

    Opens a client with ``get_mongo_client(MONGO_URI)``, resolves the
    collection named by ``database_name`` / ``collection_name`` and deletes
    every document currently stored in it.

    The handle is stored in the module-level name ``collection``, which
    ``ingest_data`` and ``create_vector_search_index`` read, and is also
    returned to the caller.

    Returns:
        The PyMongo collection that was just cleared.
    """
    # Publish the handle at module scope: the other functions in this file
    # look up ``collection`` as a global, so a local variable would be lost.
    global collection

    mongo_client = get_mongo_client(MONGO_URI)

    # Pymongo client of database and collection
    db = mongo_client.get_database(database_name)
    collection = db.get_collection(collection_name)

    # Delete any existing records in the collection
    collection.delete_many({})
    return collection
def ingest_data(documents=None, target_collection=None):
    """Insert the listing documents into the MongoDB collection.

    Args:
        documents: Documents to insert. When omitted, the module-level name
            ``listings`` is used, as in the zero-argument form of this
            function.
        target_collection: Collection to insert into. When omitted, the
            module-level name ``collection`` is used.

    Raises:
        NameError: If an argument is omitted and the corresponding
            module-level name has not been defined.
    """
    # Compare against None explicitly: PyMongo collections do not support
    # truth-value testing, and an explicitly passed empty list must not be
    # replaced by the module-level fallback.
    docs = listings if documents is None else documents
    coll = collection if target_collection is None else target_collection

    # The ingestion process might take a few minutes
    coll.insert_many(docs)
    print("Data ingestion into MongoDB completed")
def create_vector_search_index(target_collection=None):
    """Create the vector search index on the text embeddings if it is missing.

    Looks for a search index named ``vector_index_text`` on the collection.
    If one exists, only a message is printed. Otherwise the index is created,
    followed by a fixed 20 second pause. Errors raised by the creation call
    are printed rather than propagated.

    Args:
        target_collection: Collection to index. When omitted, the module-level
            name ``collection`` is used, as in the zero-argument form of this
            function.

    Raises:
        NameError: If ``target_collection`` is omitted and no module-level
            ``collection`` has been defined.
    """
    # Compare against None explicitly: PyMongo collections do not support
    # truth-value testing.
    coll = collection if target_collection is None else target_collection

    # NOTE: This dataset contains text and image embeddings, but this lesson only uses the text embeddings

    # The field containing the text embeddings on each document within the listings_reviews collection
    text_embedding_field_name = "text_embeddings"

    # MongoDB Atlas Vector Search index name
    vector_search_index_name_text = "vector_index_text"

    # Check if the index already exists. Search indexes are reported by
    # list_search_indexes(); list_indexes() only returns regular indexes, so
    # it can never find this one.
    for index in coll.list_search_indexes():
        print(index)
        if index["name"] == vector_search_index_name_text:
            print(f"Index '{vector_search_index_name_text}' already exists.")
            return

    # Only build the index definition once we know it has to be created.
    vector_search_index_model = SearchIndexModel(
        definition={
            "mappings": {  # describes how fields in the database documents are indexed and stored
                "dynamic": True,  # automatically index new fields that appear in the document
                "fields": {  # properties of the fields that will be indexed.
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector.
                        "similarity": "cosine",  # algorithm used to compute the similarity between vectors
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name_text,  # identifier for the vector search index
    )

    # Create the index; failures are reported, not raised (best effort).
    try:
        result = coll.create_search_index(model=vector_search_index_model)
        print("Creating index...")
        # Sleep for 20 seconds so the vector index can complete its initial
        # sync before it is used.
        time.sleep(20)
        print("Index created successfully:", result)
        print("Wait a few minutes before conducting search with index to ensure index intialization")
    except Exception as e:
        print(f"Error creating vector search index: {str(e)}")