Spaces:
Runtime error
Runtime error
import click | |
from qdrant_client import QdrantClient, models | |
from openai import OpenAI | |
from tqdm import tqdm | |
import json | |
import requests | |
import os | |
from prompts import REVIEWS_SYSTEM_PROMPT, REVIEWS_USER_PROMPT | |
# TripAdvisor API key, read from the environment once at import time (None when the variable is unset).
TRIPADVISOR_API_KEY = os.getenv('TRIPADVISOR_API_KEY')
def save_json(data, path):
    """Write ``data`` to ``path`` as JSON.

    The file is opened in text write mode, so an existing file is overwritten.

    Args:
        data: JSON-serializable object to store
        path (str): destination file path
    """
    with open(path, mode="w") as json_file:
        json.dump(data, fp=json_file)
def get_df(dataset_path, is_hf):
    """Load the hotels dataset as a pandas DataFrame.

    Args:
        dataset_path (str): CSV file path, or the identifier handed to
            ``datasets.load_dataset`` when ``is_hf`` is truthy
        is_hf (bool): whether to load through the ``datasets`` library
    Returns:
        pd.DataFrame: the 'train' split when ``is_hf`` is truthy, otherwise the parsed CSV
    """
    if not is_hf:
        # Imported lazily, inside the branch that needs it.
        import pandas as pd
        return pd.read_csv(dataset_path)

    from datasets import load_dataset
    train_split = load_dataset(dataset_path)['train']
    return train_split.to_pandas()
def _concat_reviews(df): | |
text = '' | |
for _, row in df.iterrows(): | |
text += '\n' | |
if row.review_title: | |
text += '\nTitle:\n' + row.review_title | |
if row.review_text: | |
text += '\nReview:\n' + row.review_text | |
return text | |
def create_reviews_symmary(df, model, hotels, pos_rate=4.0, neg_rate=4.0, n_reviews=6):
    """Create a summary of reviews for each hotel, based on the most positive and most negative reviews.

    For every hotel the ``n_reviews`` longest positive and the ``n_reviews``
    longest negative reviews (by review text length) are concatenated and sent
    to the chat model. When neither filter matches any row, the reviews with
    the longest titles are used instead. Hotels that yield no text at all are
    left out of the result.

    Args:
        df (pd.DataFrame): hotels dataset; it is not modified
        model (str): OpenAI model name
        hotels (list): list of hotels to create summaries for
        pos_rate (float): minimum positive rate, inclusive
        neg_rate (float): maximum negative rate, exclusive
        n_reviews (int): number of reviews to consider for each category
    Returns:
        dict: hotel name -> reviews summary
    """
    # Attach the helper length columns to a new frame so the caller's
    # DataFrame does not silently gain two extra columns.
    df = df.assign(
        review_text_len=df.review_text.str.len().fillna(value=0),
        review_title_len=df.review_title.str.len().fillna(value=0),
    )
    client = OpenAI()
    hotels_reviews_summary = {}
    for hotel in tqdm(hotels):
        hotel_reviews = df[df.hotel_name.eq(hotel)]
        positive = hotel_reviews[hotel_reviews.rate >= pos_rate].nlargest(n_reviews, 'review_text_len')
        negative = hotel_reviews[hotel_reviews.rate < neg_rate].nlargest(n_reviews, 'review_text_len')
        if positive.empty and negative.empty:
            # Neither rating filter matched: fall back to the reviews with the longest titles.
            positive = hotel_reviews.nlargest(n_reviews, 'review_title_len')
        text = _concat_reviews(positive) + _concat_reviews(negative)
        if not text:
            continue
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": REVIEWS_SYSTEM_PROMPT},
                {"role": "user", "content": REVIEWS_USER_PROMPT.format(text=text)},
            ]
        )
        hotels_reviews_summary[hotel] = response.choices[0].message.content
    return hotels_reviews_summary
def _get_loc_id(hotel):
    """Given a hotel name, receive location id.

    In order to get the hotel info, we need to get the location id first.
    The id of the first search result is used.

    Args:
        hotel (str): hotel name
    Returns:
        str: location id, or None when the response cannot be parsed or
            contains no result (diagnostics are printed in that case)
    """
    url = "https://api.content.tripadvisor.com/api/v1/location/search"
    # Passing the query through ``params`` lets requests URL-encode the values;
    # a hotel name containing '&' or '#' would otherwise corrupt the query string.
    params = {
        "key": TRIPADVISOR_API_KEY,
        "searchQuery": hotel,
        "category": "hotels",
        "language": "en",
    }
    headers = {"accept": "application/json"}
    # Without a timeout a stalled connection would block the pipeline forever.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    try:
        return response.json()['data'][0]['location_id']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not JSON; the others: unexpected or empty payload.
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        print(f'Error: {e}')
        return None
def get_hotel_info(hotel):
    """Get hotel info from TripAdvisor.

    The following information is retrieved using the TripAdvisor API:
    - rank
    - ratings distributions
    - subratings
    - amenities

    Args:
        hotel (str): hotel name
    Returns:
        dict: hotel info with keys ``rank``, ``reviews_ratings``, ``subratings``
            and ``amenities``; None when the location id cannot be resolved or
            the details request fails (diagnostics are printed)
    """
    url = "https://api.content.tripadvisor.com/api/v1/location/{loc_id}/details"
    # The query is passed via ``params`` so it is encoded by requests; this also
    # restores the ``currency`` parameter, which had been garbled into '¤cy'.
    params = {"key": TRIPADVISOR_API_KEY, "language": "en", "currency": "USD"}
    headers = {"accept": "application/json"}
    loc_id = _get_loc_id(hotel)
    if loc_id is None:
        return None
    # Without a timeout a stalled connection would block the pipeline forever.
    response = requests.get(url.format(loc_id=loc_id), params=params, headers=headers, timeout=30)
    try:
        details = response.json()
    except ValueError as e:
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        print(f'Error: {e}')
        return None
    if not response.ok or not isinstance(details, dict):
        # An error payload is valid JSON but carries none of the fields read below.
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        return None
    ranking_data = details.get('ranking_data') or {}
    raw_subratings = details.get('subratings') or {}
    # The original code indexed ``subratings`` by key; a plain list of entries is tolerated too.
    entries = raw_subratings.values() if isinstance(raw_subratings, dict) else raw_subratings
    subratings = {entry['name']: entry['value'] for entry in entries}
    return dict(
        rank=ranking_data.get('ranking_string'),
        reviews_ratings=details.get('review_rating_count'),
        subratings=subratings,
        amenities=details.get('amenities', []),
    )
def get_desc(hotel, data):
    """Create a text description of the hotel based on the retrieved data from TripAdvisor.

    The description is made of up to four sentences (rank, rating
    distribution, specific ratings, amenities). A sentence is left out when
    its data is missing or empty, and values are converted with ``str`` so
    numeric counts and ratings are accepted as well as strings.

    Args:
        hotel (str): hotel name
        data (dict): hotel name -> hotel info (or None when no info is available)
    Returns:
        str: hotel text description, '' when there is no info for ``hotel``
    """
    info = data.get(hotel)
    if not info:
        return ''

    sections = []

    rank = info.get('rank')
    if rank is not None:
        sections.append(f"Rating: {rank}.")

    reviews_ratings = info.get('reviews_ratings') or {}
    if reviews_ratings:
        distribution = ", ".join(f"{stars}: {count}" for stars, count in reviews_ratings.items())
        sections.append(f"Rating distribution {distribution}.")

    subratings = info.get('subratings') or {}
    labels = (
        ('rate_location', 'Location'),
        ('rate_sleep', 'Sleep'),
        ('rate_room', 'Room'),
        ('rate_service', 'Service'),
        ('rate_cleanliness', 'Cleanliness'),
    )
    # Joining only the ratings that are present avoids a dangling ", " when
    # the last one is missing.
    specific = ", ".join(f"{label} {subratings[key]}" for key, label in labels if key in subratings)
    if specific:
        sections.append(f"Specific ratings: {specific}.")

    amenities = info.get('amenities') or []
    if amenities:
        sections.append("Amenities available: " + ", ".join(str(item) for item in amenities) + ".")

    return " ".join(sections)
def _most_common_value(series):
    """Return the most frequent non-null value of ``series``, or None when it has none."""
    counts = series.value_counts()
    return counts.index[0] if len(counts) else None


def get_payload(hotel, df):
    """Create a metadata which will be stored in the database.

    Each field is the most frequent non-null value among the hotel's rows;
    a field is None when the hotel has no value for it.

    Args:
        hotel (str): hotel name
        df (pd.DataFrame): hotels dataset
    Returns:
        dict: metadata with keys ``hotel_name``, ``rating``, ``city``,
            ``country`` and ``price``
    """
    temp = df[df.hotel_name.eq(hotel)]
    # Only the part of the price range before the first space is kept.
    # Missing values are dropped first so the string accessor never sees NaN-only data.
    first_price = temp.price_range.dropna().astype(str).str.split(' ').str[0]
    return dict(
        hotel_name=hotel,
        rating=_most_common_value(temp.rating_value),
        city=_most_common_value(temp.locality),
        country=_most_common_value(temp.country),
        price=_most_common_value(first_price),
    )
def create_vector_db(dataset_path, is_hf, db_path, collection_name, embeddings_model, embeddings_size, reviews_model):
    """Build or extend a local Qdrant collection of hotel description embeddings.

    Hotels whose name is already stored in the collection are skipped. For the
    remaining hotels a reviews summary (OpenAI) and the TripAdvisor info are
    collected and saved to JSON files in the working directory, a text
    description is embedded, and the vectors are upserted with their metadata.

    Args:
        dataset_path (str): CSV path or dataset identifier, see ``get_df``
        is_hf (bool): whether ``dataset_path`` is loaded through ``datasets``
        db_path (str): path of the local Qdrant database
        collection_name (str): collection to create or extend
        embeddings_model (str): OpenAI embeddings model name
        embeddings_size (int): vector size used when the collection is created
        reviews_model (str): OpenAI model name used for the reviews summaries
    """
    REVIEW_SUMMARIES_PATH = 'reviews_summary.json'
    HOTELS_INFO_PATH = 'hotels_info.json'
    SCROLL_PAGE_SIZE = 1000
    EMBEDDINGS_BATCH_SIZE = 100

    df = get_df(dataset_path, is_hf)

    # Create a collection if it does not exist and filter out hotels that are already in the collection
    qdrant_client = QdrantClient(path=db_path)
    known_hotels = set()
    next_id = 0
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=embeddings_size, distance=models.Distance.COSINE),
        )
    else:
        # Page through the collection with an integer limit instead of asking
        # for a float-sized single page.
        offset = None
        while True:
            docs, offset = qdrant_client.scroll(
                collection_name=collection_name,
                limit=SCROLL_PAGE_SIZE,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )
            for doc in docs:
                known_hotels.add(doc.payload['hotel_name'])
                # Remember the highest integer id so new points do not overwrite stored ones.
                if isinstance(doc.id, int):
                    next_id = max(next_id, doc.id + 1)
            if offset is None:
                break
    hotels = [hotel for hotel in df.hotel_name.unique() if hotel not in known_hotels]
    if not hotels:
        return

    # Create reviews summary using OpenAI
    reviews_summary = create_reviews_symmary(df, reviews_model, hotels)
    save_json(reviews_summary, REVIEW_SUMMARIES_PATH)

    # Get hotel info from TripAdvisor
    hotels_info = {}
    for hotel in tqdm(hotels):
        hotels_info[hotel] = get_hotel_info(hotel)
    save_json(hotels_info, HOTELS_INFO_PATH)

    # Create descriptions and payloads for each hotel
    texts = []
    payloads = []
    for hotel in hotels:
        # get_hotel_info returns None when the TripAdvisor lookup fails.
        trip_desc_hotel = get_desc(hotel, hotels_info) if hotels_info.get(hotel) else ''
        review_hotel = reviews_summary.get(hotel)
        # Join whichever parts exist; the former chained conditional expression
        # dropped the reviews summary whenever a TripAdvisor description was present.
        text = '\n'.join(part for part in (trip_desc_hotel, review_hotel) if part)
        if not text:
            # Nothing to embed for this hotel; it stays absent from the
            # collection and is picked up again on the next run.
            print(f'Skipping {hotel!r}: no description or reviews summary available')
            continue
        payload = get_payload(hotel, df)
        payload['description'] = text
        payloads.append(payload)
        texts.append(text)
    if not texts:
        return

    # Create description embeddings and upsert them to the database
    openai_client = OpenAI()
    vectors = []
    # Embed in batches so a large dataset does not end up in one oversized request.
    for start in range(0, len(texts), EMBEDDINGS_BATCH_SIZE):
        batch = texts[start:start + EMBEDDINGS_BATCH_SIZE]
        embeddings = openai_client.embeddings.create(input=batch, model=embeddings_model)
        vectors.extend(item.embedding for item in embeddings.data)
    points = [
        models.PointStruct(
            id=next_id + idx,
            vector=vector,
            payload=payload,
        )
        for idx, (vector, payload) in enumerate(zip(vectors, payloads))
    ]
    qdrant_client.upsert(collection_name, points)
if __name__ == '__main__':
    # create_vector_db() takes seven required arguments, so a bare call raises
    # TypeError. Expose it as a click command whose options map one-to-one
    # onto its parameters (e.g. --dataset-path -> dataset_path).
    cli = click.Command(
        name='create_vector_db',
        callback=create_vector_db,
        params=[
            click.Option(['--dataset-path'], required=True,
                         help='CSV file path or Hugging Face dataset name.'),
            click.Option(['--is-hf'], is_flag=True,
                         help='Load the dataset through the Hugging Face datasets library.'),
            click.Option(['--db-path'], required=True,
                         help='Path of the local Qdrant database.'),
            click.Option(['--collection-name'], required=True,
                         help='Qdrant collection to create or extend.'),
            click.Option(['--embeddings-model'], required=True,
                         help='OpenAI embeddings model name.'),
            click.Option(['--embeddings-size'], required=True, type=int,
                         help='Size of the embedding vectors.'),
            click.Option(['--reviews-model'], required=True,
                         help='OpenAI model used to summarize reviews.'),
        ],
    )
    cli()