Spaces:

bstraehle
/

advanced-rag

Running

App Files Files Community

advanced-rag / mongodb.py

bstraehle

Create mongodb.py

1ebe7e2 verified 7 months ago

raw

history blame

3.11 kB

	import os
	import time

	from pymongo.mongo_client import MongoClient
	from pymongo.operations import SearchIndexModel

	# Name of the MongoDB database that create_db() opens.
	database_name = "airbnb_dataset"
	# Name of the collection, inside that database, that create_db() opens and empties.
	collection_name = "listings_reviews"

	def get_mongo_client(mongo_uri):
	    """Build a ``MongoClient`` for ``mongo_uri`` and return it.

	    Args:
	        mongo_uri: Connection string passed straight to ``MongoClient``.

	    Returns:
	        The newly constructed ``MongoClient`` instance.

	    NOTE(review): the success message is printed right after the client
	    object is constructed; this function never sends a command to the
	    server, so the message presumably does not prove the cluster is
	    reachable -- confirm against the PyMongo documentation.
	    """
	    # The client object is the entry point for all work against the cluster.
	    mongo_client = MongoClient(
	        mongo_uri,
	        appname="devrel.deeplearningai.lesson1.python",
	    )
	    print("Connection to MongoDB successful")
	    return mongo_client

	# Connection string for the MongoDB cluster, taken from the MONGO_URI
	# environment variable; None when the variable is not set. Without this
	# assignment the name was never bound and the check below raised NameError
	# as soon as the module was imported.
	MONGO_URI = os.environ.get("MONGO_URI")

	# Warn early (without aborting the import) when no connection string is available.
	if not MONGO_URI:
	    print("MONGO_URI not set in environment variables")

	def create_db():
	    """Connect to MongoDB, bind the target collection, and empty it.

	    Opens a client with ``get_mongo_client(MONGO_URI)``, looks up the
	    database ``database_name`` and its collection ``collection_name``, and
	    deletes every document currently stored in that collection.

	    The collection handle is stored in the module-level name ``collection``
	    because ``ingest_data()`` and ``create_vector_search_index()`` take no
	    arguments and read that name, so this function has to run before them.
	    """
	    # Publish the handle at module scope: as a plain local it was discarded
	    # on return, leaving the sibling functions with an undefined name.
	    global collection

	    mongo_client = get_mongo_client(MONGO_URI)

	    # Resolve the database, then the collection inside it.
	    db = mongo_client.get_database(database_name)
	    collection = db.get_collection(collection_name)

	    # An empty filter matches every document, so this clears the collection.
	    collection.delete_many({})

	def ingest_data():
	    """Bulk-insert ``listings`` into ``collection`` and report completion.

	    NOTE(review): neither ``collection`` nor ``listings`` is a parameter or
	    a local here; both are looked up as module-level names when the function
	    is called, so they must be bound beforehand -- confirm where
	    ``listings`` is supposed to come from.
	    """
	    # Inserting the whole dataset may take a few minutes.
	    collection.insert_many(listings)
	    completion_message = "Data ingestion into MongoDB completed"
	    print(completion_message)

	def create_vector_search_index():
	    """Create the vector search index over the text embeddings if it is absent.

	    Looks for a search index named ``vector_index_text`` on ``collection``.
	    When none is found, one is created for the ``text_embeddings`` field
	    (1536 dimensions, cosine similarity) and the function then sleeps for
	    20 seconds to give the index time to start its initial sync. When the
	    index is already present, only a message is printed.

	    Any exception raised while listing or creating the index is printed
	    rather than re-raised (best effort).

	    NOTE(review): ``collection`` is read as a module-level name at call
	    time; it must be bound before this function is called.
	    """
	    # NOTE: the dataset contains text and image embeddings, but only the
	    # text embeddings are indexed here.
	    # Document field that holds the text embedding vector.
	    text_embedding_field_name = "text_embeddings"
	    # Name under which the vector search index is registered.
	    vector_search_index_name_text = "vector_index_text"

	    vector_search_index_model = SearchIndexModel(
	        definition={
	            # Describes how document fields are indexed and stored.
	            "mappings": {
	                # Automatically index new fields that appear in documents.
	                "dynamic": True,
	                # Explicit settings for the embedding field.
	                "fields": {
	                    text_embedding_field_name: {
	                        "dimensions": 1536,  # size of the vector
	                        "similarity": "cosine",  # vector similarity measure
	                        "type": "knnVector",
	                    },
	                },
	            },
	        },
	        name=vector_search_index_name_text,  # identifier of the search index
	    )

	    try:
	        # Search indexes are reported by list_search_indexes();
	        # list_indexes() only returns regular indexes, so checking it could
	        # never find this index. The lookup sits inside the try so that a
	        # failure here is reported like a creation failure instead of
	        # propagating.
	        index_exists = False
	        for index in collection.list_search_indexes():
	            print(index)
	            if index["name"] == vector_search_index_name_text:
	                index_exists = True
	                break

	        if index_exists:
	            print(f"Index '{vector_search_index_name_text}' already exists.")
	            return

	        result = collection.create_search_index(model=vector_search_index_model)
	        print("Creating index...")
	        # Pause so the new index can begin its initial sync before it is used.
	        time.sleep(20)
	        print("Index created successfully:", result)
	        print("Wait a few minutes before conducting search with index to ensure index intialization")
	    except Exception as e:
	        print(f"Error creating vector search index: {e}")