Spaces:
Running
Running
Update custom_utils.py
Browse files- custom_utils.py +20 -0
custom_utils.py
CHANGED
@@ -11,6 +11,26 @@ from pymongo.mongo_client import MongoClient
|
|
11 |
DB_NAME = "airbnb_dataset"
|
12 |
COLLECTION_NAME = "listings_reviews"
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
def process_records(data_frame):
|
15 |
records = data_frame.to_dict(orient='records')
|
16 |
# Handle potential `NaT` values
|
|
|
11 |
DB_NAME = "airbnb_dataset"
|
12 |
COLLECTION_NAME = "listings_reviews"
|
13 |
|
14 |
+
def rag_ingestion():
    """Reload the Airbnb embeddings dataset into the MongoDB collection.

    Streams the ``train`` split of ``MongoDB/airbnb_embeddings``, builds a
    pandas DataFrame from it, converts the rows with ``process_records``,
    then empties ``collection`` and inserts the converted records.

    The numbered ``print`` calls are progress markers emitted between stages.

    NOTE(review): ``collection`` is not defined inside this function and is
    not bound anywhere in the visible part of the file; presumably it is a
    module-level name -- confirm it is set before this is called.
    """
    print("111")
    dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")

    # To ingest only a sample while testing, limit the stream first,
    # e.g. ``dataset = dataset.take(100)``.
    print("222")

    # Convert the dataset to a pandas dataframe
    dataset_df = pd.DataFrame(dataset)

    listings = process_records(dataset_df)
    print("333")

    # Remove every existing document first so the collection holds only
    # the records from this load.
    collection.delete_many({})
    collection.insert_many(listings)
    print("Data ingestion into MongoDB completed")
    print("555")
|
31 |
+
|
32 |
+
# Create the vector search index manually; this feature is not available in the free tier.
|
33 |
+
|
34 |
def process_records(data_frame):
|
35 |
records = data_frame.to_dict(orient='records')
|
36 |
# Handle potential `NaT` values
|