bstraehle committed on
Commit
3271e72
·
verified ·
1 Parent(s): 838b33c

Update custom_utils.py

Browse files
Files changed (1) hide show
  1. custom_utils.py +20 -0
custom_utils.py CHANGED
@@ -11,6 +11,26 @@ from pymongo.mongo_client import MongoClient
11
  DB_NAME = "airbnb_dataset"
12
  COLLECTION_NAME = "listings_reviews"
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def process_records(data_frame):
15
  records = data_frame.to_dict(orient='records')
16
  # Handle potential `NaT` values
 
11
  DB_NAME = "airbnb_dataset"
12
  COLLECTION_NAME = "listings_reviews"
13
 
14
+ def rag_ingestion():
15
+ print("111")
16
+ dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")
17
+
18
+ #dataset = dataset.take(100)
19
+ print("222")
20
+ # Convert the dataset to a pandas dataframe
21
+ dataset_df = pd.DataFrame(dataset)
22
+ #dataset_df.head(5)
23
+ #print("Columns:", dataset_df.columns)
24
+
25
+ listings = process_records(dataset_df)
26
+ print("333")
27
+ collection.delete_many({})
28
+ collection.insert_many(listings)
29
+ print("Data ingestion into MongoDB completed")
30
+ print("555")
31
+
32
+ # Manually create vector search index, feature is not available in free tier
33
+
34
  def process_records(data_frame):
35
  records = data_frame.to_dict(orient='records')
36
  # Handle potential `NaT` values